Repository: openai/gpt-oss
Branch: main
Commit: 599476783c6f
Files: 155
Total size: 1.3 MB

Directory structure:
gitextract_586tz2l0/

├── .github/
│   ├── CODEOWNERS
│   ├── ISSUE_TEMPLATE/
│   │   └── config.yml
│   └── workflows/
│       └── CI.yml
├── .gitignore
├── CMakeLists.txt
├── LICENSE
├── MANIFEST.in
├── README.md
├── USAGE_POLICY
├── _build/
│   └── gpt_oss_build_backend/
│       ├── __init__.py
│       └── backend.py
├── awesome-gpt-oss.md
├── compatibility-test/
│   ├── .gitignore
│   ├── README.md
│   ├── analysis.ts
│   ├── cases.jsonl
│   ├── index.ts
│   ├── package.json
│   ├── providers.ts
│   ├── runCase.ts
│   └── tools.ts
├── examples/
│   ├── agents-sdk-js/
│   │   ├── index.ts
│   │   └── package.json
│   ├── agents-sdk-python/
│   │   ├── example.py
│   │   └── pyproject.toml
│   ├── gradio/
│   │   └── gradio_chat.py
│   ├── reinforcement-fine-tuning.ipynb
│   └── streamlit/
│       └── streamlit_chat.py
├── gpt-oss-mcp-server/
│   ├── README.md
│   ├── browser_server.py
│   ├── build-system-prompt.py
│   ├── pyproject.toml
│   ├── python_server.py
│   └── reference-system-prompt.py
├── gpt_oss/
│   ├── __init__.py
│   ├── chat.py
│   ├── evals/
│   │   ├── README.md
│   │   ├── __init__.py
│   │   ├── __main__.py
│   │   ├── abcd_grader.py
│   │   ├── aime_eval.py
│   │   ├── basic_eval.py
│   │   ├── chat_completions_sampler.py
│   │   ├── gpqa_eval.py
│   │   ├── healthbench_eval.py
│   │   ├── report.py
│   │   ├── responses_sampler.py
│   │   └── types.py
│   ├── generate.py
│   ├── metal/
│   │   ├── CMakeLists.txt
│   │   ├── __init__.py
│   │   ├── benchmark/
│   │   │   ├── end-to-end-threadgroup.cc
│   │   │   ├── end-to-end.cc
│   │   │   ├── f32-bf16w-rmsnorm.cc
│   │   │   ├── f32-random.cc
│   │   │   ├── mf4-f32-convert.cc
│   │   │   └── u32-random.cc
│   │   ├── examples/
│   │   │   ├── chat.py
│   │   │   └── generate.py
│   │   ├── include/
│   │   │   ├── gpt-oss/
│   │   │   │   ├── functions.h
│   │   │   │   ├── macros.h
│   │   │   │   └── types.h
│   │   │   └── gpt-oss.h
│   │   ├── python/
│   │   │   ├── context.c
│   │   │   ├── model.c
│   │   │   ├── module.c
│   │   │   ├── module.h
│   │   │   └── tokenizer.c
│   │   ├── scripts/
│   │   │   └── create-local-model.py
│   │   ├── source/
│   │   │   ├── accumulate.metal
│   │   │   ├── context.c
│   │   │   ├── convert.metal
│   │   │   ├── embeddings.metal
│   │   │   ├── expert_routing_metadata.metal
│   │   │   ├── gather_and_accumulate.metal
│   │   │   ├── generate.c
│   │   │   ├── include/
│   │   │   │   └── internal/
│   │   │   │       ├── datatype.h
│   │   │   │       ├── datatype.hpp
│   │   │   │       ├── kernel-args.h
│   │   │   │       ├── log.h
│   │   │   │       ├── macros.h
│   │   │   │       ├── math.h
│   │   │   │       ├── metal-kernels.h
│   │   │   │       ├── metal.h
│   │   │   │       ├── metal.hpp
│   │   │   │       ├── model.h
│   │   │   │       ├── rng.h
│   │   │   │       ├── rng.hpp
│   │   │   │       ├── storage.h
│   │   │   │       └── uuid.h
│   │   │   ├── log.c
│   │   │   ├── matmul.metal
│   │   │   ├── metal-kernels.c
│   │   │   ├── metal.m
│   │   │   ├── model.c
│   │   │   ├── moematmul.metal
│   │   │   ├── random.metal
│   │   │   ├── rmsnorm.metal
│   │   │   ├── rope.metal
│   │   │   ├── sample.metal
│   │   │   ├── scatter.metal
│   │   │   ├── sdpa.metal
│   │   │   ├── tokenizer.c
│   │   │   └── topk.metal
│   │   └── test/
│   │       ├── bf16-f32-embeddings.cc
│   │       ├── embeddings-kernel-tester.hpp
│   │       ├── f32-bf16w-matmul.cc
│   │       ├── f32-bf16w-rmsnorm.cc
│   │       ├── f32-random.cc
│   │       ├── f32-rope.cc
│   │       ├── fill-random-kernel-tester.hpp
│   │       ├── matmul-kernel-tester.hpp
│   │       ├── mf4-f32-convert.cc
│   │       ├── rmsnorm-kernel-tester.hpp
│   │       ├── rope-kernel-tester.hpp
│   │       └── u32-random.cc
│   ├── responses_api/
│   │   ├── __init__.py
│   │   ├── api_server.py
│   │   ├── events.py
│   │   ├── inference/
│   │   │   ├── __init__.py
│   │   │   ├── metal.py
│   │   │   ├── ollama.py
│   │   │   ├── stub.py
│   │   │   ├── transformers.py
│   │   │   ├── triton.py
│   │   │   └── vllm.py
│   │   ├── serve.py
│   │   ├── types.py
│   │   └── utils.py
│   ├── tokenizer.py
│   ├── tools/
│   │   ├── __init__.py
│   │   ├── apply_patch.md
│   │   ├── apply_patch.py
│   │   ├── python_docker/
│   │   │   └── docker_tool.py
│   │   ├── simple_browser/
│   │   │   ├── __init__.py
│   │   │   ├── backend.py
│   │   │   ├── page_contents.py
│   │   │   └── simple_browser_tool.py
│   │   └── tool.py
│   ├── torch/
│   │   ├── __init__.py
│   │   ├── model.py
│   │   ├── utils.py
│   │   └── weights.py
│   ├── triton/
│   │   ├── __init__.py
│   │   ├── attention.py
│   │   ├── model.py
│   │   └── moe.py
│   └── vllm/
│       └── token_generator.py
├── pyproject.toml
├── tests/
│   ├── conftest.py
│   ├── gpt_oss/
│   │   └── tools/
│   │       └── simple_browser/
│   │           └── test_backend.py
│   ├── test_api_endpoints.py
│   └── test_responses_api.py
└── tests-data/
    ├── basic-event-stream.txt
    └── web-search-event-stream.txt

================================================
FILE CONTENTS
================================================

================================================
FILE: .github/CODEOWNERS
================================================
@openai/developer-experience
dkundel-openai
Maratyszcza
scott-oai
volsgd


================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false
contact_links:
  - name: 🐛 Model Issues
    url: https://huggingface.co/openai/gpt-oss-120b/discussions
    about: For general questions about the models, please use the Community feature on Hugging Face.
  - name: 💡 General Feedback
    url: https://openai.com/open-models
    about: Suggest new features on our feature request page.


================================================
FILE: .github/workflows/CI.yml
================================================
name: CI

on:
  release:
    types: [published]
  push:
    tags:
      - "v*"
  workflow_dispatch:

# Minimal repo-level permissions; job-level permissions override where needed.
permissions:
  contents: read
  id-token: write

jobs:
  publish:
    name: Build & Publish to PyPI (Trusted Publishing)
    runs-on: ubuntu-latest

    # Run in the GitHub environment named "release" so you can gate it with approvals.
    environment: release

    # Extra permissions required for pypa action to do OIDC exchange:
    permissions:
      contents: read
      id-token: write

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: "3.12"

      - name: Install build tools
        run: |
          python -m pip install --upgrade pip setuptools wheel build

      - name: Install uv (if needed)
        run: |
          python -m pip install --upgrade uv || true

      - name: Build package with uv
        run: |
          pwd
          ls -la
          uv build

      - name: Inspect dist folder
        run: |
          ls -la dist || ls -la build || echo "no dist/ or build/ — check uv output"

      - name: Publish to PyPI using Trusted Publishing
        # Note: No pypi_token / username / password provided — Trusted Publishing via OIDC is used.
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          attestations: true # optional (default for Trusted Publishing) - set to false to disable


================================================
FILE: .gitignore
================================================
build
_skbuild
tmp*
__pycache__
*.egg*
node_modules/
*.log

================================================
FILE: CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.26)
project(gpt_oss LANGUAGES C CXX)

# If not defined externally, auto-detect
if(NOT DEFINED GPTOSS_BUILD_METAL)
  if(APPLE AND CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
    message(STATUS "Apple Silicon detected → enabling GPTOSS_BUILD_METAL")
    set(GPTOSS_BUILD_METAL ON)
  else()
    message(STATUS "Non-Apple Silicon → disabling GPTOSS_BUILD_METAL")
    set(GPTOSS_BUILD_METAL OFF)
  endif()
else()
  message(STATUS "GPTOSS_BUILD_METAL manually set to: ${GPTOSS_BUILD_METAL}")
endif()

# Now declare it as a cache variable (respects user-provided value)
set(GPTOSS_BUILD_METAL "${GPTOSS_BUILD_METAL}" CACHE BOOL "Enable Metal backend")

if(GPTOSS_BUILD_METAL)
  enable_language(OBJC)
  add_subdirectory(gpt_oss/metal)
endif()


================================================
FILE: LICENSE
================================================

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: MANIFEST.in
================================================
recursive-include _build * 

================================================
FILE: README.md
================================================
<img alt="gpt-oss-120" src="./docs/gpt-oss.svg">
<p align="center">
  <a href="https://gpt-oss.com"><strong>Try gpt-oss</strong></a> ·
  <a href="https://cookbook.openai.com/topic/gpt-oss"><strong>Guides</strong></a> ·
  <a href="https://arxiv.org/abs/2508.10925"><strong>Model card</strong></a> ·
  <a href="https://openai.com/index/introducing-gpt-oss/"><strong>OpenAI blog</strong></a>
</p>
<p align="center">
  <strong>Download <a href="https://huggingface.co/openai/gpt-oss-120b">gpt-oss-120b</a> and <a href="https://huggingface.co/openai/gpt-oss-20b">gpt-oss-20b</a> on Hugging Face</strong>
</p>

<br>

Welcome to the gpt-oss series, [OpenAI's open-weight models](https://openai.com/open-models/) designed for powerful reasoning, agentic tasks, and versatile developer use cases.

We're releasing two flavors of these open models:

- `gpt-oss-120b` — for production, general purpose, high reasoning use cases that fit into a single 80GB GPU (like NVIDIA H100 or AMD MI300X) (117B parameters with 5.1B active parameters)
- `gpt-oss-20b` — for lower latency, and local or specialized use cases (21B parameters with 3.6B active parameters)

Both models were trained using our [harmony response format][harmony] and should only be used with this format; otherwise, they will not work correctly.

## Table of Contents
- [Highlights](#highlights)
- [Inference examples](#inference-examples)
- [About this repository](#about-this-repository)
- [Setup](#setup)
- [Download the model](#download-the-model)
- [Reference PyTorch implementation](#reference-pytorch-implementation)
- [Reference Triton implementation (single GPU)](#reference-triton-implementation-single-gpu)
- [Reference Metal implementation](#reference-metal-implementation)
- [Harmony format & tools](#harmony-format--tools)
- [Clients](#clients)
- [Tools](#tools)
- [Other details](#other-details)
- [Contributing](#contributing)

### Highlights

- **Permissive Apache 2.0 license:** Build freely without copyleft restrictions or patent risk—ideal for experimentation, customization, and commercial deployment.
- **Configurable reasoning effort:** Easily adjust the reasoning effort (low, medium, high) based on your specific use case and latency needs.
- **Full chain-of-thought:** Provides complete access to the model's reasoning process, facilitating easier debugging and greater trust in outputs. This information is not intended to be shown to end users.
- **Fine-tunable:** Fully customize models to your specific use case through parameter fine-tuning.
- **Agentic capabilities:** Use the models' native capabilities for function calling, [web browsing](#browser), [Python code execution](#python), and Structured Outputs.
- **MXFP4 quantization:** The models were post-trained with MXFP4 quantization of the MoE weights, making `gpt-oss-120b` run on a single 80GB GPU (like NVIDIA H100 or AMD MI300X) and the `gpt-oss-20b` model run within 16GB of memory. All evals were performed with the same MXFP4 quantization.

### Inference examples

#### Transformers

You can use `gpt-oss-120b` and `gpt-oss-20b` with the Transformers library. If you use Transformers' chat template, it will automatically apply the [harmony response format][harmony]. If you use `model.generate` directly, you need to apply the harmony format manually using the chat template or use our [`openai-harmony`][harmony] package.

```python
from transformers import pipeline
import torch

model_id = "openai/gpt-oss-120b"

pipe = pipeline(
    "text-generation",
    model=model_id,
    torch_dtype="auto",
    device_map="auto",
)

messages = [
    {"role": "user", "content": "Explain quantum mechanics clearly and concisely."},
]

outputs = pipe(
    messages,
    max_new_tokens=256,
)
print(outputs[0]["generated_text"][-1])
```

[Learn more about how to use gpt-oss with Transformers.](https://cookbook.openai.com/articles/gpt-oss/run-transformers)

#### vLLM

vLLM recommends using [`uv`](https://docs.astral.sh/uv/) for Python dependency management. You can use vLLM to spin up an OpenAI-compatible web server. The following command will automatically download the model and start the server.

```bash
uv pip install --pre vllm==0.10.1+gptoss \
    --extra-index-url https://wheels.vllm.ai/gpt-oss/ \
    --extra-index-url https://download.pytorch.org/whl/nightly/cu128 \
    --index-strategy unsafe-best-match

vllm serve openai/gpt-oss-20b
```

[Learn more about how to use gpt-oss with vLLM.](https://cookbook.openai.com/articles/gpt-oss/run-vllm)

Offline Serve Code:
- run this code after installing proper libraries as described, while additionally installing this:
- `uv pip install openai-harmony`
```python
# source .oss/bin/activate

import os
os.environ["VLLM_USE_FLASHINFER_SAMPLER"] = "0"

import json
from openai_harmony import (
    HarmonyEncodingName,
    load_harmony_encoding,
    Conversation,
    Message,
    Role,
    SystemContent,
    DeveloperContent,
)
 
from vllm import LLM, SamplingParams
import os

# --- 1) Render the prefill with Harmony ---
encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
 
convo = Conversation.from_messages(
    [
        Message.from_role_and_content(Role.SYSTEM, SystemContent.new()),
        Message.from_role_and_content(
            Role.DEVELOPER,
            DeveloperContent.new().with_instructions("Always respond in riddles"),
        ),
        Message.from_role_and_content(Role.USER, "What is the weather like in SF?"),
    ]
)
 
prefill_ids = encoding.render_conversation_for_completion(convo, Role.ASSISTANT)
 
# Harmony stop tokens (pass to sampler so they won't be included in output)
stop_token_ids = encoding.stop_tokens_for_assistant_actions()
 
# --- 2) Run vLLM with prefill ---
llm = LLM(
    model="openai/gpt-oss-20b",
    trust_remote_code=True,
    gpu_memory_utilization = 0.95,
    max_num_batched_tokens=4096,
    max_model_len=5000,
    tensor_parallel_size=1
)
 
sampling = SamplingParams(
    max_tokens=128,
    temperature=1,
    stop_token_ids=stop_token_ids,
)
 
outputs = llm.generate(
    prompt_token_ids=[prefill_ids],   # batch of size 1
    sampling_params=sampling,
)
 
# vLLM gives you both text and token IDs
gen = outputs[0].outputs[0]
text = gen.text
output_tokens = gen.token_ids  # <-- these are the completion token IDs (no prefill)
 
# --- 3) Parse the completion token IDs back into structured Harmony messages ---
entries = encoding.parse_messages_from_completion_tokens(output_tokens, Role.ASSISTANT)
 
# 'entries' is a sequence of structured conversation entries (assistant messages, tool calls, etc.).
for message in entries:
    print(f"{json.dumps(message.to_dict())}")
```

#### PyTorch / Triton / Metal

These implementations are largely reference implementations for educational purposes and are not expected to be run in production.

[Learn more below.](#reference-pytorch-implementation)

#### Ollama

If you are trying to run `gpt-oss` on consumer hardware, you can use Ollama by running the following commands after [installing Ollama](https://ollama.com/download).

```bash
# gpt-oss-20b
ollama pull gpt-oss:20b
ollama run gpt-oss:20b

# gpt-oss-120b
ollama pull gpt-oss:120b
ollama run gpt-oss:120b
```

[Learn more about how to use gpt-oss with Ollama.](https://cookbook.openai.com/articles/gpt-oss/run-locally-ollama)

#### LM Studio

If you are using [LM Studio](https://lmstudio.ai/) you can use the following commands to download.

```bash
# gpt-oss-20b
lms get openai/gpt-oss-20b
# gpt-oss-120b
lms get openai/gpt-oss-120b
```

Check out our [awesome list](./awesome-gpt-oss.md) for a broader collection of gpt-oss resources and inference partners.

## About this repository

This repository provides a collection of reference implementations:

- **Inference:**
  - [`torch`](#reference-pytorch-implementation) — a non-optimized [PyTorch](https://pytorch.org/) implementation for educational purposes only. Requires at least 4× H100 GPUs due to lack of optimization.
  - [`triton`](#reference-triton-implementation-single-gpu) — a more optimized implementation using [PyTorch](https://pytorch.org/) & [Triton](https://github.com/triton-lang/triton) incl. using CUDA graphs and basic caching
  - [`metal`](#reference-metal-implementation) — a Metal-specific implementation for running the models on Apple Silicon hardware
- **Tools:**
  - [`browser`](#browser) — a reference implementation of the browser tool the models got trained on
  - [`python`](#python) — a stateless reference implementation of the python tool the model got trained on
- **Client examples:**
  - [`chat`](#terminal-chat) — a basic terminal chat application that uses the PyTorch or Triton implementations for inference along with the python and browser tools
  - [`responses_api`](#responses-api) — an example Responses API compatible server that implements the browser tool along with other Responses-compatible functionality

## Setup

### Requirements

- Python 3.12
- On macOS: Install the Xcode CLI tools --> `xcode-select --install`
- On Linux: These reference implementations require CUDA
- On Windows: These reference implementations have not been tested on Windows. Try using solutions like Ollama if you are trying to run the model locally.

### Installation

If you want to try any of the code you can install it directly from [PyPI](https://pypi.org/project/gpt-oss/)

```shell
# if you just need the tools
pip install gpt-oss
# if you want to try the torch implementation
pip install gpt-oss[torch]
# if you want to try the triton implementation
pip install gpt-oss[triton]
```

If you want to modify the code or try the metal implementation set the project up locally:

```shell
git clone https://github.com/openai/gpt-oss.git
GPTOSS_BUILD_METAL=1 pip install -e ".[metal]"
```

## Download the model

You can download the model weights from the [Hugging Face Hub](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4) directly from Hugging Face CLI:

```shell
# gpt-oss-120b
hf download openai/gpt-oss-120b --include "original/*" --local-dir gpt-oss-120b/

# gpt-oss-20b
hf download openai/gpt-oss-20b --include "original/*" --local-dir gpt-oss-20b/
```

## Reference PyTorch implementation

We include an inefficient reference PyTorch implementation in [gpt_oss/torch/model.py](gpt_oss/torch/model.py). This code uses basic PyTorch operators to show the exact model architecture, with a small addition of supporting tensor parallelism in MoE so that the larger model can run with this code (e.g., on 4xH100 or 2xH200). In this implementation, we upcast all weights to BF16 and run the model in BF16.

To run the reference implementation, install the dependencies:

```shell
pip install -e ".[torch]"
```

And then run:

```shell
# On 4xH100:
torchrun --nproc-per-node=4 -m gpt_oss.generate gpt-oss-120b/original/
```

## Reference Triton implementation (single GPU)

We also include an optimized reference implementation that uses [an optimized triton MoE kernel](https://github.com/triton-lang/triton/tree/main/python/triton_kernels/triton_kernels) that supports MXFP4. It also has some optimization on the attention code to reduce the memory cost. To run this implementation, the nightly version of triton and torch will be installed. This version can be run on a single 80GB GPU for `gpt-oss-120b`.

To install the reference Triton implementation run

```shell
# You need to install triton from source to use the triton implementation
git clone https://github.com/triton-lang/triton
cd triton/
pip install -r python/requirements.txt
pip install -e . --verbose --no-build-isolation
pip install -e python/triton_kernels

# Install the gpt-oss triton implementation
pip install -e ".[triton]"
```

And then run:

```shell
# On 1xH100
export PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
python -m gpt_oss.generate --backend triton gpt-oss-120b/original/
```

If you encounter `torch.OutOfMemoryError`, make sure to turn on the expandable allocator to avoid crashes when loading weights from the checkpoint.

## Reference Metal implementation

Additionally we are providing a reference implementation for Metal to run on Apple Silicon. This implementation is not production-ready but is accurate to the PyTorch implementation.

The implementation will get automatically compiled when running the `.[metal]` installation on an Apple Silicon device:

```shell
GPTOSS_BUILD_METAL=1 pip install -e ".[metal]"
```

To perform inference you'll need to first convert the SafeTensor weights from Hugging Face into the right format using:

```shell
python gpt_oss/metal/scripts/create-local-model.py -s <model_dir> -d <output_file>
```

Or download the pre-converted weights:

```shell
hf download openai/gpt-oss-120b --include "metal/*" --local-dir gpt-oss-120b/metal/
hf download openai/gpt-oss-20b --include "metal/*" --local-dir gpt-oss-20b/metal/
```

To test it you can run:

```shell
python gpt_oss/metal/examples/generate.py gpt-oss-20b/metal/model.bin -p "why did the chicken cross the road?"
```

## Harmony format & tools

Along with the model, we are also releasing a new chat format library `harmony` to interact with the model. Check [this guide](https://cookbook.openai.com/articles/openai-harmony) for more info about harmony.

We also include two system tools for the model: browsing and python container. Check [gpt_oss/tools](gpt_oss/tools) for the tool implementation.

## Clients

### Terminal Chat

The terminal chat application is a basic example of how to use the harmony format together with the PyTorch, Triton, and vLLM implementations. It also exposes both the python and browser tool as optional tools that can be used.

```bash
usage: python -m gpt_oss.chat [-h] [-r REASONING_EFFORT] [-a] [-b] [--show-browser-results] [-p] [--developer-message DEVELOPER_MESSAGE] [-c CONTEXT] [--raw] [--backend {triton,torch,vllm}] FILE

Chat example

positional arguments:
  FILE                  Path to the SafeTensors checkpoint

options:
  -h, --help            show this help message and exit
  -r REASONING_EFFORT, --reasoning-effort REASONING_EFFORT
                        Reasoning effort (default: low)
  -a, --apply-patch     Make apply_patch tool available to the model (default: False)
  -b, --browser         Use browser tool (default: False)
  --show-browser-results
                        Show browser results (default: False)
  -p, --python          Use python tool (default: False)
  --developer-message DEVELOPER_MESSAGE
                        Developer message (default: )
  -c CONTEXT, --context CONTEXT
                        Max context length (default: 8192)
  --raw                 Raw mode (does not render Harmony encoding) (default: False)
  --backend {triton,torch,vllm}
                        Inference backend (default: triton)
```

> [!NOTE]
> The torch and triton implementations require original checkpoint under `gpt-oss-120b/original/` and `gpt-oss-20b/original/` respectively. While vLLM uses the Hugging Face converted checkpoint under `gpt-oss-120b/` and `gpt-oss-20b/` root directory respectively.

### Responses API

We also include an example Responses API server. This server does not implement every feature and event of the Responses API but should be compatible with most of the basic use cases and serve as inspiration for anyone building their own server. Some of our inference partners are also offering their own Responses API.

You can start this server with the following inference backends:

- `triton` — uses the triton implementation
- `metal` — uses the metal implementation on Apple Silicon only
- `ollama` — uses the Ollama /api/generate API as an inference solution
- `vllm` — uses your installed vllm version to perform inference
- `transformers` — uses your installed transformers version to perform local inference

```bash
usage: python -m gpt_oss.responses_api.serve [-h] [--checkpoint FILE] [--port PORT] [--inference-backend BACKEND]

Responses API server

options:
  -h, --help                    show this help message and exit
  --checkpoint FILE             Path to the SafeTensors checkpoint
  --port PORT                   Port to run the server on
  --inference-backend BACKEND   Inference backend to use
```

### Codex

We support [codex](https://github.com/openai/codex) as a client for gpt-oss. To run the 20b version, set this to `~/.codex/config.toml`:

```
disable_response_storage = true
show_reasoning_content = true

[model_providers.local]
name = "local"
base_url = "http://localhost:11434/v1"

[profiles.oss]
model = "gpt-oss:20b"
model_provider = "local"
```

This will work with any chat completions-API compatible server listening on port 11434, like ollama. Start the server and point codex to the oss model:

```
ollama run gpt-oss:20b
codex -p oss
```

## Tools

### Browser

> [!WARNING]
> This implementation is purely for educational purposes and should not be used in production. You should implement your own equivalent of the [`YouComBackend`](gpt_oss/tools/simple_browser/backend.py) class with your own browsing environment. Currently we have available `YouComBackend` and `ExaBackend`. 

Both gpt-oss models were trained with the capability to browse using the `browser` tool that exposes the following three methods:

- `search` to search for key phrases
- `open` to open a particular page
- `find` to look for contents on a page

#### Usage

To enable the browser tool, you'll have to place the definition into the `system` message of your harmony formatted prompt. You can either use the `with_browser_tool()` method if your tool implements the full interface or modify the definition using `with_tools()`. For example:

```python
import datetime
from gpt_oss.tools.simple_browser import SimpleBrowserTool
from gpt_oss.tools.simple_browser.backend import YouComBackend
from openai_harmony import SystemContent, Message, Conversation, Role, load_harmony_encoding, HarmonyEncodingName

encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)

# Depending on the choice of the browser backend you need corresponding env variables setup
# In case you use You.com backend requires you to have set the YDC_API_KEY environment variable,
# while for Exa you might need EXA_API_KEY environment variable set
backend = YouComBackend(
    source="web",
)
# backend = ExaBackend(
#  source="web",
# )
browser_tool = SimpleBrowserTool(backend=backend)

# create a basic system prompt
system_message_content = SystemContent.new().with_conversation_start_date(
    datetime.datetime.now().strftime("%Y-%m-%d")
)

# if you want to use the browser tool
if use_browser_tool:
    # enables the tool
    system_message_content = system_message_content.with_tools(browser_tool.tool_config)
    # alternatively you could use the following if your tool is not stateless
    system_message_content = system_message_content.with_browser_tool()

# construct the system message
system_message = Message.from_role_and_content(Role.SYSTEM, system_message_content)

# create the overall prompt
messages = [system_message, Message.from_role_and_content(Role.USER, "What's the weather in SF?")]
conversation = Conversation.from_messages(messages)

# convert to tokens
token_ids = encoding.render_conversation_for_completion(conversation, Role.ASSISTANT)

# perform inference
# ...

# parse the output
messages = encoding.parse_messages_from_completion_tokens(output_tokens, Role.ASSISTANT)
last_message = messages[-1]
if last_message.recipient.startswith("browser"):
  # perform browser call
  response_messages = await browser_tool.process(last_message)

  # extend the current messages and run inference again
  messages.extend(response_messages)
```

#### Details

To control the context window size this tool uses a scrollable window of text that the model can interact with. So it might fetch the first 50 lines of a page and then scroll to the next 20 lines after that. The model has also been trained to then use citations from this tool in its answers.

To improve performance the tool caches requests so that the model can revisit a different part of a page without having to reload the page. For that reason you should create a new browser instance for every request.

### Python

The model was trained to use a python tool to perform calculations and other actions as part of its chain-of-thought. During the training the model used a stateful tool which makes running tools between CoT loops easier. This reference implementation, however, uses a stateless mode. As a result the PythonTool defines its own tool description to override the definition in [`openai-harmony`][harmony].

> [!WARNING]
> This implementation runs in a permissive Docker container which could be problematic in cases like prompt injections. It's serving as an example and you should consider implementing your own container restrictions in production.

#### Usage

To enable the python tool, you'll have to place the definition into the `system` message of your harmony formatted prompt. You can either use the `with_python()` method if your tool implements the full interface or modify the definition using `with_tools()`. For example:

```python
import datetime
from gpt_oss.tools.python_docker.docker_tool import PythonTool
from openai_harmony import SystemContent, Message, Conversation, Role, load_harmony_encoding, HarmonyEncodingName

encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)

python_tool = PythonTool()

# create a basic system prompt
system_message_content = SystemContent.new().with_conversation_start_date(
    datetime.datetime.now().strftime("%Y-%m-%d")
)

# if you want to use the python tool
if use_python_tool:
    # enables the tool making sure that the prompt gets set with the stateless tool description
    system_message_content = system_message_content.with_tools(python_tool.tool_config)
    # alternatively you could use the following if your tool is not stateless
    system_message_content = system_message_content.with_python()

# construct the system message
system_message = Message.from_role_and_content(Role.SYSTEM, system_message_content)

# create the overall prompt
messages = [system_message, Message.from_role_and_content(Role.USER, "What's the square root of 9001?")]
conversation = Conversation.from_messages(messages)

# convert to tokens
token_ids = encoding.render_conversation_for_completion(conversation, Role.ASSISTANT)

# perform inference
# ...

# parse the output
messages = encoding.parse_messages_from_completion_tokens(output_tokens, Role.ASSISTANT)
last_message = messages[-1]
if last_message.recipient == "python":
  # perform python call
  response_messages = await python_tool.process(last_message)

  # extend the current messages and run inference again
  messages.extend(response_messages)
```

### Apply Patch

`apply_patch` can be used to create, update or delete files locally.

## Other details

### Precision format

We released the models with native quantization support. Specifically, we use [MXFP4](https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf) for the linear projection weights in the MoE layer. We store the MoE tensor in two parts:

- `tensor.blocks` stores the actual fp4 values. We pack every two values in one `uint8` value.
- `tensor.scales` stores the block scale. The block scaling is done among the last dimension for all MXFP4 tensors.

All other tensors will be in BF16. We also recommend using BF16 as the activation precision for the model.

### Recommended Sampling Parameters

We recommend sampling with `temperature=1.0` and `top_p=1.0`.

## Contributing

The reference implementations in this repository are meant as a starting point and inspiration. Outside of bug fixes we do not intend to accept new feature contributions. If you build implementations based on this code such as new tool implementations you are welcome to contribute them to the [`awesome-gpt-oss.md`](./awesome-gpt-oss.md) file.

[harmony]: https://github.com/openai/harmony

## Citation

```bibtex
@misc{openai2025gptoss120bgptoss20bmodel,
      title={gpt-oss-120b & gpt-oss-20b Model Card}, 
      author={OpenAI},
      year={2025},
      eprint={2508.10925},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2508.10925}, 
}
```


================================================
FILE: USAGE_POLICY
================================================
We aim for our tools to be used safely, responsibly, and democratically, while maximizing your control over how you use them. By using OpenAI gpt-oss-120b and gpt-oss-20b, you agree to comply with all applicable law.

================================================
FILE: _build/gpt_oss_build_backend/__init__.py
================================================
"""In-tree PEP 517 backend package for gpt-oss.""" 

================================================
FILE: _build/gpt_oss_build_backend/backend.py
================================================
"""
Build backend for gpt-oss that supports two modes:

1) Default (pure wheel for PyPI)
   - Delegates to setuptools.build_meta.
   - Produces a py3-none-any wheel so PyPI accepts it (no linux_x86_64 tag).

2) Optional Metal/C extension build (local only)
   - If the environment variable GPTOSS_BUILD_METAL is set to a truthy value
     (1/true/on/yes), delegates to scikit_build_core.build.
   - Dynamically injects build requirements (scikit-build-core, cmake, ninja,
     pybind11) only for this mode.

Why this is needed
- PyPI rejects Linux wheels tagged linux_x86_64; manylinux/musllinux is required
  for binary wheels. We ship a pure wheel by default, but still allow developers
  to build/install the native Metal backend locally when needed.

Typical usage
- Publish pure wheel: `python -m build` (do not set GPTOSS_BUILD_METAL).
- Local Metal dev: `GPTOSS_BUILD_METAL=1 pip install -e ".[metal]"`.
- CI: keep GPTOSS_BUILD_METAL unset for releases; set it in internal jobs that
  exercise the extension.

Notes
- The base package remains importable without the extension. The Metal backend
  is only used when `gpt_oss.metal` is explicitly imported.
- This file is discovered via `backend-path = ["_build"]` and
  `build-backend = "gpt_oss_build_backend.backend"` in pyproject.toml.
"""
import os
from importlib import import_module
from typing import Any, Mapping, Sequence


TRUE_VALUES = {"1", "true", "TRUE", "on", "ON", "yes", "YES"}


def _use_metal_backend() -> bool:
    return str(os.environ.get("GPTOSS_BUILD_METAL", "")).strip() in TRUE_VALUES


def _setuptools_backend():
    from setuptools import build_meta as _bm  # type: ignore

    return _bm


def _scikit_build_backend():
    return import_module("scikit_build_core.build")


def _backend():
    return _scikit_build_backend() if _use_metal_backend() else _setuptools_backend()


# Required PEP 517 hooks

def build_wheel(
    wheel_directory: str,
    config_settings: Mapping[str, Any] | None = None,
    metadata_directory: str | None = None,
) -> str:
    return _backend().build_wheel(wheel_directory, config_settings, metadata_directory)


def build_sdist(
    sdist_directory: str, config_settings: Mapping[str, Any] | None = None
) -> str:
    return _backend().build_sdist(sdist_directory, config_settings)


def prepare_metadata_for_build_wheel(
    metadata_directory: str, config_settings: Mapping[str, Any] | None = None
) -> str:
    # Fallback if backend doesn't implement it
    be = _backend()
    fn = getattr(be, "prepare_metadata_for_build_wheel", None)
    if fn is None:
        # setuptools exposes it; scikit-build-core may not. Defer to building a wheel for metadata.
        return _setuptools_backend().prepare_metadata_for_build_wheel(
            metadata_directory, config_settings
        )
    return fn(metadata_directory, config_settings)


# Optional hooks

def build_editable(
    editable_directory: str, config_settings: Mapping[str, Any] | None = None, metadata_directory: str | None = None
) -> str:
    be = _backend()
    fn = getattr(be, "build_editable", None)
    if fn is None:
        # setuptools implements build_editable; if not available, raise the standard error
        raise RuntimeError("Editable installs not supported by the selected backend")
    return fn(editable_directory, config_settings)


def get_requires_for_build_wheel(
    config_settings: Mapping[str, Any] | None = None,
) -> Sequence[str]:
    if _use_metal_backend():
        # Add dynamic build requirements only when building the Metal backend
        return [
            "scikit-build-core>=0.10",
            "pybind11>=2.12",
            "cmake>=3.26",
            "ninja",
        ]
    # setuptools usually returns []
    return list(_setuptools_backend().get_requires_for_build_wheel(config_settings))


def get_requires_for_build_sdist(
    config_settings: Mapping[str, Any] | None = None,
) -> Sequence[str]:
    # No special requirements for SDist
    be = _backend()
    fn = getattr(be, "get_requires_for_build_sdist", None)
    if fn is None:
        return []
    return list(fn(config_settings))


def get_requires_for_build_editable(
    config_settings: Mapping[str, Any] | None = None,
) -> Sequence[str]:
    if _use_metal_backend():
        return [
            "scikit-build-core>=0.10",
            "pybind11>=2.12",
            "cmake>=3.26",
            "ninja",
        ]
    be = _setuptools_backend()
    fn = getattr(be, "get_requires_for_build_editable", None)
    if fn is None:
        return []
    return list(fn(config_settings)) 

================================================
FILE: awesome-gpt-oss.md
================================================
![gpt-oss](./docs/gpt-oss.svg)

# Awesome gpt-oss

This is a list of guides and resources to help you get started with the gpt-oss models.

- [Inference](#inference)
  - [Local](#local)
  - [Server](#server)
  - [Cloud](#cloud)
- [Examples / Tutorials](#examples--tutorials)
- [Tools](#tools)
- [Training](#training)

## Inference

### Local

- Ollama
  - [How to run gpt-oss locally with Ollama](https://cookbook.openai.com/articles/gpt-oss/run-locally-ollama)
  - [Ollama & gpt-oss launch blog](https://ollama.com/blog/gpt-oss)
  - [Check out the models Ollama](https://ollama.com/library/gpt-oss)
- LM Studio
  - [LM Studio & gpt-oss launch blog](https://lmstudio.ai/blog/gpt-oss)
  - [Use gpt-oss-20b with LM Studio](https://lmstudio.ai/models/openai/gpt-oss-20b)
  - [Use gpt-oss-120b with LM Studio](https://lmstudio.ai/models/openai/gpt-oss-120b)
- Hugging Face & Transformers
  - [How to run gpt-oss with Transformers](https://cookbook.openai.com/articles/gpt-oss/run-transformers)
  - [Hugging Face & gpt-oss launch blog](https://huggingface.co/blog/welcome-openai-gpt-oss)
  - [Collection of Hugging Face examples](https://github.com/huggingface/gpt-oss-recipes)
- NVIDIA
  - [gpt-oss on RTX](https://blogs.nvidia.com/blog/rtx-ai-garage-openai-oss)
- AMD
  - [Running gpt-oss models on AMD Ryzen AI Processors and Radeon Graphics Cards](https://www.amd.com/en/blogs/2025/how-to-run-openai-gpt-oss-20b-120b-models-on-amd-ryzen-ai-radeon.html)
  - [Running gpt-oss on STX Halo and Radeon dGPUs using Lemonade](https://lemonade-server.ai/news/gpt-oss.html)
- llama.cpp
  - [Running gpt-oss with llama.cpp](https://github.com/ggml-org/llama.cpp/discussions/15396)
  - [Running gpt-oss with Unsloth GGUFs](https://docs.unsloth.ai/new/gpt-oss-how-to-run-and-fine-tune#run-gpt-oss-20b)

### Server

- vLLM
  - [How to run gpt-oss with vLLM](https://cookbook.openai.com/articles/gpt-oss/run-vllm)
  - [vLLM & gpt-oss recipies](https://docs.vllm.ai/projects/recipes/en/latest/OpenAI/GPT-OSS.html)
- NVIDIA
  - [Optimizing gpt-oss with NVIDIA TensorRT-LLM](https://cookbook.openai.com/articles/run-nvidia)
  - [Deploying gpt-oss on TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog9_Deploying_GPT_OSS_on_TRTLLM.md)
- AMD
  - [Running the Latest Open Models from OpenAI on AMD AI Hardware](https://rocm.blogs.amd.com/ecosystems-and-partners/openai-day-0/README.html)

### Cloud

- Groq
  - [Groq & gpt-oss launch blog](https://groq.com/blog/day-zero-support-for-openai-open-models)
  - [gpt-oss-120b model on the GroqCloud Playground](https://console.groq.com/playground?model=openai/gpt-oss-120b)
  - [gpt-oss-20b model on the GroqCloud Playground](https://console.groq.com/playground?model=openai/gpt-oss-20b)
  - [gpt-oss with built-in web search on GroqCloud](https://console.groq.com/docs/browser-search)
  - [gpt-oss with built-in code execution on GroqCloud](https://console.groq.com/docs/code-execution)
  - [Responses API on Groq](https://console.groq.com/docs/responses-api)
- NVIDIA
  - [NVIDIA launch blog post](https://blogs.nvidia.com/blog/openai-gpt-oss/)
  - [NVIDIA & gpt-oss developer launch blog post](https://developer.nvidia.com/blog/delivering-1-5-m-tps-inference-on-nvidia-gb200-nvl72-nvidia-accelerates-openai-gpt-oss-models-from-cloud-to-edge/)
  - Use [gpt-oss-120b](https://build.nvidia.com/openai/gpt-oss-120b) and [gpt-oss-20b](https://build.nvidia.com/openai/gpt-oss-20b) on NVIDIA's Cloud
- Cloudflare
  - [Cloudflare & gpt-oss launch blog post](https://blog.cloudflare.com/openai-gpt-oss-on-workers-ai)
  - [gpt-oss-120b on Cloudflare Workers AI](https://developers.cloudflare.com/workers-ai/models/gpt-oss-120b)
  - [gpt-oss-20b on Cloudflare Workers AI](https://developers.cloudflare.com/workers-ai/models/gpt-oss-20b)
- AMD
  - [gpt-oss-120B on AMD MI300X](https://huggingface.co/spaces/amd/gpt-oss-120b-chatbot)
- AWS
  - Deploy via Tensorfuse: [Deploy gpt-oss for both 20b and 120b models on AWS EKS](https://tensorfuse.io/docs/guides/modality/text/openai_oss)
  - [AWS launch blog post](https://aws.amazon.com/blogs/aws/openai-open-weight-models-now-available-on-aws/)
- Google Colab
  - [gpt-oss-20b inference notebook](https://colab.research.google.com/github/unslothai/notebooks/blob/main/nb/GPT_OSS_MXFP4_(20B)-Inference.ipynb)

## Examples & Tutorials

- [OpenAI harmony response format](https://cookbook.openai.com/articles/openai-harmony)

## Tools

- [Example `python` tool for gpt-oss](./gpt_oss/tools/python_docker/)
- [Example `browser` tool for gpt-oss](./gpt_oss/tools/simple_browser/)

## Training

- [Hugging Face TRL examples](https://github.com/huggingface/gpt-oss-recipes)
- [LlamaFactory examples](https://llamafactory.readthedocs.io/en/latest/advanced/best_practice/gpt-oss.html)
- [Unsloth examples](https://docs.unsloth.ai/basics/gpt-oss-how-to-run-and-fine-tune)

### Reinforcement Learning
- [Auto solving the 2048 game](https://github.com/openai/gpt-oss/blob/main/examples/reinforcement-fine-tuning.ipynb)

## Contributing

Feel free to open a PR to add your own guides and resources on how to run gpt-oss. We will try to review it and add it here.


================================================
FILE: compatibility-test/.gitignore
================================================
# Logs
logs
*.log
npm-debug.log*
yarn-debug.log*
yarn-error.log*
lerna-debug.log*

# Diagnostic reports (https://nodejs.org/api/report.html)
report.[0-9]*.[0-9]*.[0-9]*.[0-9]*.json

# Runtime data
pids
*.pid
*.seed
*.pid.lock

# Directory for instrumented libs generated by jscoverage/JSCover
lib-cov

# Coverage directory used by tools like istanbul
coverage
*.lcov

# nyc test coverage
.nyc_output

# Grunt intermediate storage (https://gruntjs.com/creating-plugins#storing-task-files)
.grunt

# Bower dependency directory (https://bower.io/)
bower_components

# node-waf configuration
.lock-wscript

# Compiled binary addons (https://nodejs.org/api/addons.html)
build/Release

# Dependency directories
node_modules/
jspm_packages/

# Snowpack dependency directory (https://snowpack.dev/)
web_modules/

# TypeScript cache
*.tsbuildinfo

# Optional npm cache directory
.npm

# Optional eslint cache
.eslintcache

# Optional stylelint cache
.stylelintcache

# Optional REPL history
.node_repl_history

# Output of 'npm pack'
*.tgz

# Yarn Integrity file
.yarn-integrity

# dotenv environment variable files
.env
.env.*
!.env.example

# parcel-bundler cache (https://parceljs.org/)
.cache
.parcel-cache

# Next.js build output
.next
out

# Nuxt.js build / generate output
.nuxt
dist

# Gatsby files
.cache/
# Comment in the public line in if your project uses Gatsby and not Next.js
# https://nextjs.org/blog/next-9-1#public-directory-support
# public

# vuepress build output
.vuepress/dist

# vuepress v2.x temp and cache directory
.temp
.cache

# Sveltekit cache directory
.svelte-kit/

# vitepress build output
**/.vitepress/dist

# vitepress cache directory
**/.vitepress/cache

# Docusaurus cache and generated files
.docusaurus

# Serverless directories
.serverless/

# FuseBox cache
.fusebox/

# DynamoDB Local files
.dynamodb/

# Firebase cache directory
.firebase/

# TernJS port file
.tern-port

# Stores VSCode versions used for testing VSCode extensions
.vscode-test

# yarn v3
.pnp.*
.yarn/*
!.yarn/patches
!.yarn/plugins
!.yarn/releases
!.yarn/sdks
!.yarn/versions

# Vite logs files
vite.config.js.timestamp-*
vite.config.ts.timestamp-*

rollout_*.jsonl
analysis_*.json

================================================
FILE: compatibility-test/README.md
================================================
# API Compatibility Test

This script uses the Agents SDK in TypeScript and the underlying OpenAI client to verify the shape of the API calls but also whether the API performs tool calling.

## What it tests

1.

## How to run

0. Run `npm install` in this directory.
1. Update `providers.ts` to create an entry for the API to test. Change `vllm` to the provider name of your choice. Use `chat` for Chat Completions tests and `responses` for Responses API tests.
2. Run an initial quick test to make sure things work. This will only run one test

```
npm start -- --provider <name> -n 1 -k 1
```

3. Run the full test (runs each test 5 times to test consistency)

```
npm start -- --provider <name> -k 5
```

## Considerations

1. The tests will fail if the API shape does not match the expected behavior
2. Events in the chat API are currently not tested
3. If the schema validation succeeds but the input is wrong the test will still pass for this test. That's because it's likely more of a prompt engineering issue or a validator issue than an API issue as it still nailed the input


================================================
FILE: compatibility-test/analysis.ts
================================================
export function analyze(caseResults: any[], tries: number) {
  // Group results by unique task: test_case + apiType
  type TaskKey = string;
  const taskKeyFor = (r: any): TaskKey =>
    `${r.test_case}::${r.result?.apiType}`;

  const successesByTask: Map<TaskKey, Map<number, boolean>> = new Map();

  // Count wrong-input tool calls (schema correct but incorrect arguments)
  let wrongInputToolCalls = 0;

  // Count invalid response shapes per API type
  const totalByApiType: Record<string, number> = {};
  const invalidByApiType: Record<string, number> = {};

  for (const r of caseResults) {
    if (!r?.result || typeof r.result.apiType !== "string") continue;

    // Parse attempt index from run_id `${i}_${k}` safely
    let attemptIndex: number | undefined;
    if (typeof r.run_id === "string") {
      const parts = r.run_id.split("_");
      const k = Number(parts[1]);
      if (Number.isFinite(k)) attemptIndex = k;
    }

    const key = taskKeyFor(r);
    if (!successesByTask.has(key)) successesByTask.set(key, new Map());
    if (attemptIndex != null) {
      successesByTask.get(key)!.set(attemptIndex, Boolean(r.success));
    }

    const d = r.result.toolCallingDetails ?? {};
    const calledToolAtLeastOnce = Boolean(d.calledToolAtLeastOnce);
    const calledToolWithRightSchema = Boolean(d.calledToolWithRightSchema);
    const calledToolWithRightArguments = Boolean(
      d.calledToolWithRightArguments
    );
    if (
      calledToolAtLeastOnce &&
      calledToolWithRightSchema &&
      !calledToolWithRightArguments
    ) {
      wrongInputToolCalls++;
    }

    // Track invalid/total per apiType for response shape
    const apiType = r.result.apiType as string;
    totalByApiType[apiType] = (totalByApiType[apiType] ?? 0) + 1;
    const isValidResponse = r.result.validResponse === true;
    if (!isValidResponse) {
      invalidByApiType[apiType] = (invalidByApiType[apiType] ?? 0) + 1;
    }
  }

  const totalTasks = successesByTask.size;

  // Compute pass@k and pass^k for k = 1..tries
  const passAtKByK: number[] = [];
  const passHatKByK: number[] = [];

  for (let k = 1; k <= tries; k++) {
    let tasksSuccessfulK = 0; // any success in first k attempts
    let tasksAllSuccessfulK = 0; // all success in first k attempts

    for (const [, attemptsMap] of successesByTask) {
      let anySuccess = false;
      let allSuccess = true;
      for (let i = 0; i < k; i++) {
        const v = attemptsMap.get(i) === true;
        anySuccess = anySuccess || v;
        if (!v) allSuccess = false;
      }
      if (anySuccess) tasksSuccessfulK++;
      if (allSuccess) tasksAllSuccessfulK++;
    }

    const passAtK = totalTasks > 0 ? tasksSuccessfulK / totalTasks : 0;
    const passHatK = totalTasks > 0 ? tasksAllSuccessfulK / totalTasks : 0;
    passAtKByK.push(passAtK);
    passHatKByK.push(passHatK);
  }

  // Convenience: final k=tries values
  const passAtK = passAtKByK[tries - 1] ?? 0;
  const passHatK = passHatKByK[tries - 1] ?? 0;

  return {
    totalTasks,
    passAtKByK,
    passHatKByK,
    passAtK,
    passHatK,
    wrongInputToolCalls,
    // New stats for invalid response shapes per API
    invalidByApiType,
    totalByApiType,
  };
}

export function printAnalysis(
  stats: ReturnType<typeof analyze>,
  caseResults: any[],
  provider: string,
  selectedLines: string[],
  tries: number,
  skipped: number,
  analysisFile: string
) {
  const formatPerK = (arr: number[]) =>
    Array.from({ length: tries }, (_, i) => {
      const v = arr[i] ?? 0;
      return `${i + 1}=${v.toFixed(3)}`;
    }).join(", ");

  console.log("Summary:");
  console.log(`  Provider: ${provider}`);
  console.log(`  Total input cases: ${selectedLines.length}`);
  console.log(`  Tries: ${tries}`);
  console.log(`  Total tasks: ${stats.totalTasks}`);
  console.log(`  Total runs: ${caseResults.length}`);
  // Conditionally print invalid response shape stats per API type
  if ((stats.totalByApiType["responses"] ?? 0) > 0) {
    const bad = stats.invalidByApiType["responses"] ?? 0;
    const tot = stats.totalByApiType["responses"] ?? 0;
    console.log(`  Invalid Responses API responses: ${bad} (out of ${tot})`);
  }
  if ((stats.totalByApiType["chat"] ?? 0) > 0) {
    const bad = stats.invalidByApiType["chat"] ?? 0;
    const tot = stats.totalByApiType["chat"] ?? 0;
    console.log(
      `  Invalid Chat Completions API responses: ${bad} (out of ${tot})`
    );
  }
  console.log(`  pass@k (k=1..${tries}): ${formatPerK(stats.passAtKByK)}`);
  console.log(`  pass^k (k=1..${tries}): ${formatPerK(stats.passHatKByK)}`);
  console.log(`  pass@k (k=${tries}): ${stats.passAtK.toFixed(3)}`);
  console.log(`  pass^k (k=${tries}): ${stats.passHatK.toFixed(3)}`);
  console.log(`  Wrong-input tool calls: ${stats.wrongInputToolCalls}`);
  console.log(`  Invalid cases.jsonl lines: ${skipped}`);
  console.log(`  Analysis written to ${analysisFile}`);
}


================================================
FILE: compatibility-test/cases.jsonl
================================================
{"tool_name":"get_system_health","input":"Hey, quick check: is everything up and running?","expected_arguments":"{}"}
{"tool_name":"get_system_health","input":"Status report please.","expected_arguments":"{}"}
{"tool_name":"get_system_health","input":"Can you confirm the LLM health before we start?","expected_arguments":"{}"}
{"tool_name":"get_system_health","input":"Need a health snapshot.","expected_arguments":"{}"}
{"tool_name":"get_system_health","input":"Hi, what's the current system health?","expected_arguments":"{}"}
{"tool_name":"markdown_to_html","input":"Convert this markdown to HTML:\n\n# Title\n\nSome *italic* text.","expected_arguments":"{\"markdown\":\"# Title\\n\\nSome *italic* text.\"}"}
{"tool_name":"markdown_to_html","input":"Hey, could you turn `## Docs` into HTML?","expected_arguments":"{\"markdown\":\"## Docs\"}"}
{"tool_name":"markdown_to_html","input":"Please render the following markdown:\n\n- item 1\n- item 2","expected_arguments":"{\"markdown\":\"- item 1\\n- item 2\"}"}
{"tool_name":"markdown_to_html","input":"I have `**bold**` markdown; give me HTML.","expected_arguments":"{\"markdown\":\"**bold**\"}"}
{"tool_name":"markdown_to_html","input":"Markdown to HTML: > quote","expected_arguments":"{\"markdown\":\"> quote\"}"}
{"tool_name":"detect_language","input":"Hey, what language is this: 'Buenos días, ¿cómo estás?'","expected_arguments":"{\"text\":\"Buenos días, ¿cómo estás?\"}"}
{"tool_name":"detect_language","input":"Identify the language: \"Guten Morgen\"","expected_arguments":"{\"text\":\"Guten Morgen\"}"}
{"tool_name":"detect_language","input":"Language detection needed: こんにちは、お元気ですか？","expected_arguments":"{\"text\":\"こんにちは、お元気ですか？\"}"}
{"tool_name":"detect_language","input":"Detect language for: 'Привет, как дела?'","expected_arguments":"{\"text\":\"Привет, как дела?\"}"}
{"tool_name":"detect_language","input":"What language is 'Bonjour tout le monde'?","expected_arguments":"{\"text\":\"Bonjour tout le monde\"}"}
{"tool_name":"generate_chart","input":"Plot a simple line chart for these points: (1,2),(2,4),(3,9).","expected_arguments":"{\"data\":[[1,2],[2,4],[3,9]],\"chart_type\":\"line\"}"}
{"tool_name":"generate_chart","input":"Hey, can I get a bar chart of my sales: 10, 20, 30 across Q1–Q3?","expected_arguments":"{\"data\":[[1,10],[2,20],[3,30]],\"chart_type\":\"bar\",\"title\":\"Quarterly Sales\"}"}
{"tool_name":"generate_chart","input":"Make a scatter chart titled 'Experiment' with x label Time and y label Value for data [ [0,1], [1,1.5], [2,2.2] ].","expected_arguments":"{\"data\":[[0,1],[1,1.5],[2,2.2]],\"chart_type\":\"scatter\",\"title\":\"Experiment\",\"x_label\":\"Time\",\"y_label\":\"Value\"}"}
{"tool_name":"generate_chart","input":"Create a line chart of temperatures 70,72,68,65 over 4 days, label x as 'Day'.","expected_arguments":"{\"data\":[[1,70],[2,72],[3,68],[4,65]],\"chart_type\":\"line\",\"x_label\":\"Day\"}"}
{"tool_name":"generate_chart","input":"Visualize visits per day with a bar chart; numbers: 100,150,120.","expected_arguments":"{\"data\":[[1,100],[2,150],[3,120]],\"chart_type\":\"bar\",\"title\":\"Daily Visits\",\"y_label\":\"Visitors\"}"}
{"tool_name":"query_database","input":"Give me the ids and emails from users table, limit 5.","expected_arguments":"{\"table\":\"users\",\"columns\":[\"id\",\"email\"],\"limit\":5}"}
{"tool_name":"query_database","input":"Hey, fetch order_id and amount from orders where status is 'shipped'.","expected_arguments":"{\"table\":\"orders\",\"columns\":[\"order_id\",\"amount\"],\"filters\":\"status = 'shipped'\"}"}
{"tool_name":"query_database","input":"Retrieve name and price from products ordered by price descending, top 10 please.","expected_arguments":"{\"table\":\"products\",\"columns\":[\"name\",\"price\"],\"limit\":10,\"order_by\":\"price DESC\"}"}
{"tool_name":"query_database","input":"I need the first 3 log entries from audit_log table.","expected_arguments":"{\"table\":\"audit_log\",\"columns\":[\"id\",\"timestamp\",\"action\"],\"limit\":3}"}
{"tool_name":"query_database","input":"Query the customers table for name, city where city = 'Berlin'.","expected_arguments":"{\"table\":\"customers\",\"columns\":[\"name\",\"city\"],\"filters\":\"city = 'Berlin'\"}"}
{"tool_name":"get_weather","input":"What's the weather in San Francisco right now?","expected_arguments":"{\"location\":\"San Francisco\"}"}
{"tool_name":"get_weather","input":"Weather for Tokyo, please.","expected_arguments":"{\"location\":\"Tokyo\"}"}
{"tool_name":"get_weather","input":"Get me the current weather for 10001.","expected_arguments":"{\"location\":\"10001\"}"}
{"tool_name":"get_weather","input":"How's the weather in Paris today?","expected_arguments":"{\"location\":\"Paris\"}"}
{"tool_name":"get_weather","input":"Check the weather for Sydney.","expected_arguments":"{\"location\":\"Sydney\"}"}


================================================
FILE: compatibility-test/index.ts
================================================
import { parseArgs } from "node:util";
import { createWriteStream } from "node:fs";
import { readFile, writeFile } from "node:fs/promises";
import path from "node:path";
import process from "node:process";
import { runCase, RunCaseSummary } from "./runCase";
import { Listr, ListrTaskWrapper } from "listr2";
import { analyze, printAnalysis } from "./analysis";

function formatTimestamp(d: Date): string {
  const pad = (n: number) => String(n).padStart(2, "0");
  const yyyy = d.getFullYear();
  const mm = pad(d.getMonth() + 1);
  const dd = pad(d.getDate());
  const hh = pad(d.getHours());
  const mi = pad(d.getMinutes());
  const ss = pad(d.getSeconds());
  return `${yyyy}${mm}${dd}_${hh}${mi}${ss}`;
}

async function main() {
  const args = parseArgs({
    options: {
      cases: { type: "string", short: "c", default: "cases.jsonl" },
      provider: { type: "string", short: "p", default: "openai" },
      streaming: { type: "boolean", short: "s", default: false },
      maxTurns: { type: "string", short: "t", default: "10" },
      n: { type: "string", short: "n" },
      strict: { type: "boolean", short: "s", default: false },
      tries: { type: "string", short: "k", default: "1" },
    },
  });
  const casesPathArg = args.values.cases;
  const provider = args.values.provider as string;
  const streaming = Boolean(args.values.streaming);
  const maxTurns = Number(args.values.maxTurns ?? 10);
  const nRaw = args.values.n as string | undefined;
  const triesRaw = args.values.tries as string | undefined;
  const tries = triesRaw != null ? Number(triesRaw) : 1;
  const limit = nRaw != null ? Number(nRaw) : undefined;
  if (limit != null && (!Number.isFinite(limit) || limit <= 0)) {
    console.error("--n must be a positive integer");
    process.exitCode = 1;
    return;
  }

  if (!casesPathArg) {
    console.error("--cases is required (path to JSONL file)");
    process.exitCode = 1;
    return;
  }

  const casesPath = path.isAbsolute(casesPathArg)
    ? casesPathArg
    : path.join(process.cwd(), casesPathArg);

  const timestamp = formatTimestamp(new Date());
  const defaultFilename = `rollout_${provider}_${timestamp}.jsonl`;
  const outputFile = path.join(process.cwd(), defaultFilename);
  const analysisFile = path.join(
    process.cwd(),
    `analysis_${provider}_${timestamp}.json`
  );

  let fileContent: string;
  try {
    fileContent = await readFile(casesPath, "utf8");
  } catch (err: any) {
    console.error(
      `Failed to read cases file at ${casesPath}: ${err?.message ?? err}`
    );
    process.exitCode = 1;
    return;
  }

  const lines = fileContent
    .split(/\r?\n/)
    .map((l) => l.trim())
    .filter((l) => l.length > 0);

  const selectedLines =
    typeof limit === "number" ? lines.slice(0, limit) : lines;

  const out = createWriteStream(outputFile, { flags: "w", encoding: "utf8" });

  const writeLine = (obj: any) =>
    new Promise<void>((resolve, reject) => {
      const str = JSON.stringify(obj) + "\n";
      out.write(str, (err) => (err ? reject(err) : resolve()));
    });

  // Accumulators for post-run analysis
  let skipped = 0; // invalid JSON lines
  const caseResults: Array<{
    run_id: string;
    success: boolean;
    provider: string;
    test_case: number;
    tool_name: string;
    input: string;
    result: RunCaseSummary;
  }> = [];

  async function processIndex(
    i: number,
    k: number,
    task: ListrTaskWrapper<any, any, any>
  ) {
    const line = selectedLines[i];
    let caseObj: any;
    try {
      caseObj = JSON.parse(line);
    } catch (err: any) {
      console.error(
        `Skipping invalid JSON on line ${i + 1}: ${err?.message ?? err}`
      );
      skipped++;
      return;
    }

    try {
      const summaries = await runCase(provider, caseObj, {
        maxTurns,
        streaming,
        strict: args.values.strict,
      });

      for (const summary of summaries) {
        const record = {
          run_id: `${i}_${k}`,
          success: summary.success,
          provider,
          test_case: i,
          tool_name: caseObj.tool_name,
          input: caseObj.input,
          result: summary,
        };
        task.output = `Case ${i} (attempt ${k + 1}): ${
          summary.success ? "Success" : "Failed"
        } ${summary.toolCallingDetails.warning || ""}`;
        caseResults.push(record);
        await writeLine(record);
      }
    } catch (err: any) {
      const record = {
        provider,
        test_case: i,
        tool_name: caseObj?.tool_name,
        input: caseObj?.input,
        expected_output: caseObj?.expected_output,
        instructions: caseObj?.instructions,
        error: String(err?.message ?? err),
      };
      await writeLine(record);
      task.output = `Case ${i} failed: ${err?.message ?? err}`;
    }
  }

  const listr = new Listr<{
    output: string;
  }>(
    selectedLines.flatMap((line, index) => {
      return Array.from({ length: tries }, (_, attempt) => ({
        title: `Processing case ${index} (attempt ${attempt + 1})`,
        task: async (_, task) => {
          await processIndex(index, attempt, task);
        },
        rendererOptions: { persistentOutput: true },
      }));
    }),
    {
      concurrent: 5,
    }
  );

  await listr.run();

  await new Promise((resolve) => out.end(resolve));
  console.log(`Results written to ${outputFile}`);
  const stats = analyze(caseResults, tries);
  await writeFile(analysisFile, JSON.stringify(stats, null, 2), "utf8");
  printAnalysis(
    stats,
    caseResults,
    provider,
    selectedLines,
    tries,
    skipped,
    analysisFile
  );
}

main().catch((err) => {
  console.error(err);
  process.exitCode = 1;
});


================================================
FILE: compatibility-test/package.json
================================================
{
  "type": "module",
  "dependencies": {
    "@openai/agents": "^0.0.15",
    "ajv": "^8.17.1",
    "listr2": "^9.0.1"
  },
  "scripts": {
    "start": "tsx index.ts"
  }
}


================================================
FILE: compatibility-test/providers.ts
================================================
export const PROVIDERS = {
  vllm: {
    apiBaseUrl: "http://localhost:8000/v1",
    apiKey: "vllm",
    apiType: ["responses", "chat"], // choose from responses, chat, or both
    modelName: "openai/gpt-oss-120b",
    providerDetails: {
      // add any provider-specific details here. These will be passed as part of every request
      // for example to fix the provider for openrouter, you can do:
      // provider: {
      //   only: ["example"],
      // },
    },
  },
};


================================================
FILE: compatibility-test/runCase.ts
================================================
import {
  Agent,
  Runner,
  OpenAIResponsesModel,
  OpenAIChatCompletionsModel,
  RunResult,
  StreamedRunResult,
  FunctionTool,
  setTracingDisabled,
} from "@openai/agents";
import { Ajv } from "ajv";
import { OpenAI } from "openai";
import { PROVIDERS } from "./providers";
import { TOOLS_MAP } from "./tools";

setTracingDisabled(true);

const ajv = new Ajv();

export type Case = {
  tool_name: string;
  input: string;
  expected_arguments: string;
  instructions?: string;
};

// Summary shape for each apiType
export type RunCaseSummary = {
  apiType: string;
  success: boolean;
  validResponse: boolean;
  validEvents?: boolean;
  details: Record<string, any>;
  history: any[];
  successToolCall: boolean;
  toolCallingDetails: Record<string, any>;
};

export async function runCase(
  provider: string,
  caseData: Case,
  {
    maxTurns,
    streaming,
    strict,
  }: { maxTurns: number; streaming: boolean; strict: boolean }
): Promise<RunCaseSummary[]> {
  const config = PROVIDERS[provider];
  if (!config) {
    throw new Error(
      `Provider ${provider} not found. Valid providers are: ${Object.keys(
        PROVIDERS
      ).join(", ")}`
    );
  }

  const agent = new Agent({
    name: caseData.tool_name,
    instructions: caseData.instructions,
    tools: [TOOLS_MAP[caseData.tool_name]],
  });

  const client = new OpenAI({
    apiKey: config.apiKey,
    baseURL: config.apiBaseUrl,
  });

  const summaries: RunCaseSummary[] = [];

  for (const apiType of config.apiType) {
    const runner = new Runner({
      model:
        apiType === "responses"
          ? new OpenAIResponsesModel(client, config.modelName)
          : new OpenAIChatCompletionsModel(client, config.modelName),
      modelSettings: {
        providerData: config.providerDetails ?? {},
      },
    });

    let result: RunResult<any, any> | StreamedRunResult<any, any>;
    let streamedEvents: any[] | undefined = undefined;
    if (streaming) {
      result = await runner.run(agent, caseData.input, {
        stream: streaming,
        maxTurns: maxTurns,
      });
      if (result instanceof StreamedRunResult) {
        // Collect streaming events if applicable
        streamedEvents = [];
        for await (const event of result) {
          if (event.type === "raw_model_stream_event") {
            if (event.data.type === "model") {
              streamedEvents.push(event.data.event);
            }
          }
        }
        await result.completed;
      }
    } else {
      result = await runner.run(agent, caseData.input, {
        maxTurns: maxTurns,
      });
    }

    const { success: successToolCall, details: toolCallingDetails } =
      testToolCall(apiType, caseData, result, strict);

    const { validResponse, details } = testOutputData(
      apiType,
      result.rawResponses,
      streaming
    );

    const { validEvents, details: eventsDetails } = streaming
      ? testEvents(apiType, streamedEvents)
      : { validEvents: true, details: {} };

    let success = successToolCall && validResponse;
    if (streaming) {
      success = success && validEvents;
    }
    const summary: RunCaseSummary = {
      apiType,
      success,
      validResponse,
      validEvents,
      details: {
        ...details,
        ...eventsDetails,
      },
      history: result?.rawResponses.map((entry) => entry.providerData) ?? [],
      successToolCall,
      toolCallingDetails,
    };

    summaries.push(summary);
  }

  return summaries;
}

function testToolCall(apiType, caseData, result, strict) {
  let details: Record<string, boolean | string> = {};
  result.newItems.forEach((item) => {
    // for this test for now we only care if the tool is called at least once
    if (details.calledToolAtLeastOnce) {
      return;
    }

    const isToolCall = item.type === "tool_call_item";
    if (isToolCall) {
      if (item.rawItem.type === "function_call") {
        if (item.rawItem.name === caseData.tool_name) {
          const validate = ajv.compile(
            (TOOLS_MAP[caseData.tool_name] as FunctionTool).parameters
          );
          const valid = validate(JSON.parse(item.rawItem.arguments));
          details.calledToolWithRightSchema = valid;
          details.calledToolAtLeastOnce = true;

          if (details.calledToolWithRightSchema) {
            const parsedArguments = JSON.parse(item.rawItem.arguments);
            const expectedArguments = JSON.parse(caseData.expected_arguments);
            details.calledToolWithRightArguments = deepEqual(
              parsedArguments,
              expectedArguments
            );
            if (!details.calledToolWithRightArguments) {
              if (details.calledToolWithRightSchema) {
                details.warning = `Tool call with wrong arguments but correct schema. Check logs for full details. Not failing this test. Parsed: ${JSON.stringify(
                  parsedArguments
                )} Expected: ${JSON.stringify(expectedArguments)}`;
              }
              details.actualArguments = parsedArguments;
              details.expectedArguments = expectedArguments;
            }
          }
        }
      }
    }
  });

  return {
    success:
      !!details.calledToolAtLeastOnce &&
      !!details.calledToolWithRightSchema &&
      (!strict || !!details.calledToolWithRightArguments),
    details,
  };
}

function testEvents(apiType, events) {
  // In an ideal world we would check all the events to follow and reconstruct the final response
  // and then compare it against the final response in the response.completed event
  // for now we just check that certain events are present

  let details: Record<string, boolean> = {};
  let validEvents: boolean = false;

  if (apiType === "chat") {
    let hasReasoningDeltas = false;
    for (const event of events) {
      hasReasoningDeltas =
        hasReasoningDeltas ||
        (typeof event.choices[0].delta.reasoning === "string" &&
          event.choices[0].delta.reasoning.length > 0);
    }
    details.hasReasoningDeltas = hasReasoningDeltas;
    validEvents = hasReasoningDeltas;
  }

  if (apiType === "responses") {
    let hasReasoningDeltaEvents = false;
    let hasReasoningDoneEvents = false;
    for (const event of events) {
      if (event.type === "raw_model_stream_event") {
        if (event.data.type === "model") {
          if (event.data.event.type === "response.reasoning_text.delta") {
            hasReasoningDeltaEvents = true;
          }
          if (event.data.event.type === "response.reasoning_text.done") {
            hasReasoningDoneEvents = true;
          }
        }
      }
    }

    details.hasReasoningDeltaEvents = hasReasoningDeltaEvents;
    details.hasReasoningDoneEvents = hasReasoningDoneEvents;
    validEvents =
      details.hasReasoningDeltaEvents && details.hasReasoningDoneEvents;
  }

  return {
    validEvents,
    details,
  };
}

function testOutputData(apiType, rawResponses, streaming) {
  let details: Record<string, boolean> = {};
  let validResponse: boolean = false;

  if (apiType === "chat") {
    for (const response of rawResponses) {
      if (streaming && !response.providerData) {
        // with Chat Completions we don't have a final response object that's native so we skip this test
        return {
          validResponse: true,
          details: {
            skippedBecauseStreaming: true,
          },
        };
      }

      // this is the actual HTTP response from the provider
      // Since it's not guaranteed that every response has a reasoning field, we check if it's present
      // at least once across all responses
      const data = response.providerData;
      const message = data.choices[0].message;
      if (message.role === "assistant" && !message.refusal) {
        details.hasReasoningField =
          details.hasReasoningField ||
          ("reasoning" in message && typeof message.reasoning === "string");
        details.hasReasoningContentField =
          details.hasReasoningContentField ||
          ("reasoning_content" in message &&
            typeof message.reasoning_content === "string");

        validResponse =
          validResponse ||
          (details.hasReasoningField && message.reasoning.length > 0);
      }
    }
  } else if (apiType === "responses") {
    // this is the actual HTTP response from the provider
    const data = rawResponses[0].providerData;
    for (const item of data.output) {
      // Since it's not guaranteed that every response has a reasoning field, we check if it's present
      // at least once across all responses

      if (item.type === "reasoning") {
        details.hasReasoningContentArray = Array.isArray(item.content);
        details.hasReasoningContentArrayLength = item.content.length > 0;
        details.hasReasoningContentArrayItemType = item.content.every(
          (item) => item.type === "reasoning_text"
        );
        details.hasReasoningContentArrayItemText = item.content.every(
          (item) => item.text.length > 0
        );

        validResponse =
          details.hasReasoningContentArray &&
          details.hasReasoningContentArrayLength &&
          details.hasReasoningContentArrayItemType &&
          details.hasReasoningContentArrayItemText;
      }
    }
  }

  return {
    validResponse,
    details,
  };
}

function deepEqual(a: any, b: any): boolean {
  if (a === b) return true;
  if (typeof a !== typeof b) return false;
  if (a && b && typeof a === "object") {
    if (Array.isArray(a) !== Array.isArray(b)) return false;
    if (Array.isArray(a)) {
      if (a.length !== b.length) return false;
      for (let i = 0; i < a.length; i++) {
        if (!deepEqual(a[i], b[i])) return false;
      }
      return true;
    } else {
      const aKeys = Object.keys(a);
      const bKeys = Object.keys(b);
      if (aKeys.length !== bKeys.length) return false;
      for (const key of aKeys) {
        if (!b.hasOwnProperty(key)) return false;
        if (!deepEqual(a[key], b[key])) return false;
      }
      return true;
    }
  }
  return false;
}


================================================
FILE: compatibility-test/tools.ts
================================================
import { Tool, tool } from "@openai/agents";

function convertToTool(toolData: any) {
  return tool({
    name: toolData.name,
    description: toolData.description,
    parameters: toolData.parameters,
    execute: async (parameters) => {
      return toolData.output;
    },
    strict: false,
  });
}

export const TOOLS = [
  {
    type: "function",
    name: "get_weather",
    description: "Get the weather for a given location",
    parameters: {
      type: "object",
      properties: {
        location: {
          type: "string",
          description: "The location to get the weather for",
        },
      },
      required: ["location"],
      additionalProperties: false,
    },
    output: '{"weather":"sunny"}',
  },
  {
    type: "function",
    name: "get_system_health",
    description:
      "Returns the current health status of the LLM runtime—use before critical operations to verify the service is live.",
    parameters: { type: "object", properties: {} },
    output: '{"status":"ok","uptime_seconds":372045}',
  },
  {
    type: "function",
    name: "markdown_to_html",
    description:
      "Converts a Markdown string to sanitized HTML—use when you need browser-renderable output.",
    parameters: {
      type: "object",
      properties: {
        markdown: { type: "string", description: "Raw Markdown content" },
      },
      required: ["markdown"],
      additionalProperties: false,
    },
    output: '{"html":"<h1>Hello World</h1><p>This is <em>great</em>.</p>"}',
  },
  {
    type: "function",
    name: "detect_language",
    description:
      "Identifies the ISO language code of the supplied text—use for routing text to language-specific models.",
    parameters: {
      type: "object",
      properties: {
        text: {
          type: "string",
          description: "Text whose language should be detected",
        },
      },
      required: ["text"],
      additionalProperties: false,
    },
    output: '{"language":"de","confidence":0.98}',
  },
  {
    type: "function",
    name: "generate_chart",
    description:
      "Creates a base64-encoded PNG chart from tabular data—use for quick visualizations inside chat.",
    parameters: {
      type: "object",
      properties: {
        data: {
          type: "array",
          items: { type: "array", items: { type: "number" } },
          description: "2-D numeric data matrix",
        },
        chart_type: {
          type: "string",
          enum: ["line", "bar", "scatter"],
          description: "Type of chart to generate",
        },
        title: {
          type: "string",
          description: "Chart title",
          default: "",
        },
        x_label: {
          type: "string",
          description: "Label for the x-axis",
          default: "",
        },
        y_label: {
          type: "string",
          description: "Label for the y-axis",
          default: "",
        },
      },
      required: ["data", "chart_type"],
      additionalProperties: false,
    },
    output: '{"image_png_base64":"iVBORw0KGgoAAAANSUhEUgAA..."}',
  },
  {
    type: "function",
    name: "query_database",
    description:
      "Runs a parameterized SQL SELECT on the internal analytics DB—use for lightweight data look-ups.",
    parameters: {
      type: "object",
      properties: {
        table: { type: "string", description: "Table name to query" },
        columns: {
          type: "array",
          items: { type: "string" },
          description: "Columns to return",
        },
        filters: {
          type: "string",
          description: "SQL WHERE clause without the word WHERE",
          default: "",
        },
        limit: {
          type: "integer",
          minimum: 1,
          maximum: 10000,
          description: "Max rows to return",
          default: 100,
        },
        order_by: {
          type: "string",
          description: "Column to order by (optional)",
          default: "",
        },
      },
      required: ["table", "columns"],
      additionalProperties: false,
    },
    output:
      '{"rows":[{"id":1,"email":"user@example.com"},{"id":2,"email":"foo@bar.com"}],"row_count":2}',
  },
];

export const TOOLS_MAP = TOOLS.reduce((acc, tool) => {
  acc[tool.name] = convertToTool(tool);
  return acc;
}, {} as Record<string, Tool>);


================================================
FILE: examples/agents-sdk-js/index.ts
================================================
import { OpenAI } from "openai";
import {
  Agent,
  run,
  setDefaultOpenAIClient,
  setOpenAIAPI,
  setTracingDisabled,
  tool,
  MCPServerStdio,
} from "@openai/agents";
import { z } from "zod";
import path from "node:path";
import process from "node:process";
import { styleText } from "node:util";
import { createInterface } from "node:readline/promises";

async function prompt(question: string) {
  const rl = createInterface({
    input: process.stdin,
    output: process.stdout,
  });
  const answer = await rl.question(question);
  rl.close();
  return answer;
}

const openai = new OpenAI({
  apiKey: "local",
  baseURL: "http://localhost:11434/v1",
});

const samplesDir = path.join(process.cwd());

const mcpServer = new MCPServerStdio({
  name: "Filesystem MCP Server, via npx",
  fullCommand: `npx -y @modelcontextprotocol/server-filesystem ${samplesDir}`,
});

await mcpServer.connect();

setTracingDisabled(true);
setDefaultOpenAIClient(openai);
setOpenAIAPI("chat_completions");

const searchTool = tool({
  name: "get_current_weather",
  description: "Get the current weather in a given location",
  parameters: z.object({
    location: z.string(),
  }),
  execute: async ({ location }) => {
    return `The weather in ${location} is sunny.`;
  },
});

const agent = new Agent({
  name: "My Agent",
  instructions: "You are a helpful assistant.",
  tools: [searchTool],
  model: "gpt-oss:20b-test",
  mcpServers: [mcpServer],
});

const input = await prompt("> ");

const result = await run(agent, input, {
  stream: true,
});

for await (const event of result) {
  if (event.type === "raw_model_stream_event" && event.data.type === "model") {
    if (event.data.event.choices[0].delta.content) {
      process.stdout.write(event.data.event.choices[0].delta.content);
    } else if (event.data.event.choices[0].delta.reasoning) {
      process.stdout.write(event.data.event.choices[0].delta.reasoning);
    }
  } else if (
    event.type === "run_item_stream_event" &&
    event.item.type === "tool_call_item" &&
    event.item.rawItem.type == "function_call"
  ) {
    console.log(
      `\nCalling ${event.item.rawItem.name} with: ${event.item.rawItem.arguments}`
    );
  }
}

console.log("\n");
await result.completed;
await mcpServer.close();


================================================
FILE: examples/agents-sdk-js/package.json
================================================
{
  "type": "module",
  "name": "agents-sdk",
  "version": "1.0.0",
  "main": "index.js",
  "scripts": {
    "start": "tsx index.ts",
    "test": "echo \"Error: no test specified\" && exit 1"
  },
  "keywords": [],
  "author": "",
  "license": "ISC",
  "description": "",
  "dependencies": {
    "@openai/agents": "^0.0.14",
    "tsx": "^4.20.3",
    "typescript": "^5.8.3",
    "zod": "^3.25.67"
  }
}


================================================
FILE: examples/agents-sdk-python/example.py
================================================
import asyncio
from pathlib import Path
import shutil

from openai import AsyncOpenAI
from agents import (
    Agent,
    ItemHelpers,
    Runner,
    set_default_openai_api,
    set_default_openai_client,
    set_tracing_disabled,
    function_tool,
)
from agents.mcp import MCPServerStdio


async def prompt_user(question: str) -> str:
    """Async input prompt function"""
    loop = asyncio.get_event_loop()
    return await loop.run_in_executor(None, input, question)


async def main():
    # Set up OpenAI client for local server (e.g., Ollama)
    openai_client = AsyncOpenAI(
        api_key="local",
        base_url="http://localhost:11434/v1",
    )

    # Get current working directory
    samples_dir = str(Path.cwd())

    # Create MCP server for filesystem operations
    mcp_server = MCPServerStdio(
        name="Filesystem MCP Server, via npx",
        params={
            "command": "npx",
            "args": [
                "-y",
                "@modelcontextprotocol/server-filesystem",
                samples_dir,
            ],
        },
    )

    # Connect to MCP server
    await mcp_server.connect()

    # Configure agents SDK
    set_tracing_disabled(True)
    set_default_openai_client(openai_client)
    set_default_openai_api("chat_completions")

    # Define weather tool
    @function_tool
    async def get_weather(location: str) -> str:
        return f"The weather in {location} is sunny."

    # Create agent
    agent = Agent(
        name="My Agent",
        instructions="You are a helpful assistant.",
        tools=[get_weather],
        model="gpt-oss:20b-test",
        mcp_servers=[mcp_server],
    )

    # Get user input
    user_input = await prompt_user("> ")

    # Run agent with streaming
    result = Runner.run_streamed(agent, user_input)

    # Process streaming results
    async for event in result.stream_events():
        if event.type == "raw_response_event":
            continue
        elif event.type == "agent_updated_stream_event":
            print(f"Agent updated: {event.new_agent.name}")
        elif event.type == "run_item_stream_event":
            if event.item.type == "tool_call_item":
                print("-- Tool was called")
            elif event.item.type == "tool_call_output_item":
                print(f"-- Tool output: {event.item.output}")
            elif event.item.type == "message_output_item":
                print(
                    f"-- Message output:\n {ItemHelpers.text_message_output(event.item)}"
                )
            else:
                pass

    print("=== Run complete ===")


if __name__ == "__main__":

    if not shutil.which("npx"):
        raise RuntimeError(
            "npx is not installed. Please install it with `npm install -g npx`."
        )
    asyncio.run(main())


================================================
FILE: examples/agents-sdk-python/pyproject.toml
================================================
[project]
name = "agents-sdk-python"
version = "0.1.0"
description = "Add your description here"
readme = "README.md"
requires-python = ">=3.12"
dependencies = [
    "openai-agents>=0.2.4",
]


================================================
FILE: examples/gradio/gradio_chat.py
================================================
import json
import requests
import gradio as gr

DEFAULT_FUNCTION_PROPERTIES = """
{
    "type": "object",
    "properties": {
        "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA"
        }
    },
    "required": ["location"]
}
""".strip()

def chat_with_model(message, history, model_choice, instructions, effort, use_functions, 
                   function_name, function_description, function_parameters,
                   use_browser_search, temperature, max_output_tokens, debug_mode):
    
    if not message.strip():
        return history, ""
    
    # Append user message and empty assistant placeholder (idiomatic Gradio pattern)
    history = history + [[message, ""]]
    
    # Build messages list from history (excluding the empty assistant placeholder)
    messages = []
    
    # Convert history to messages format (excluding the last empty assistant message)
    for user_msg, assistant_msg in history[:-1]:
        if user_msg:
            messages.append({
                "type": "message",
                "role": "user", 
                "content": [{"type": "input_text", "text": user_msg}]
            })
        if assistant_msg:
            messages.append({
                "type": "message",
                "role": "assistant",
                "content": [{"type": "output_text", "text": assistant_msg}]
            })
    
    # Add current user message
    messages.append({
        "type": "message",
        "role": "user",
        "content": [{"type": "input_text", "text": message}]
    })
    
    # Prepare tools
    tools = []
    if use_functions:
        try:
            tools.append({
                "type": "function",
                "name": function_name,
                "description": function_description,
                "parameters": json.loads(function_parameters),
            })
        except json.JSONDecodeError:
            pass
    
    if use_browser_search:
        tools.append({"type": "browser_search"})
    
    # Get URL based on model (matching streamlit logic)
    options = ["large", "small"]
    URL = ("http://localhost:8081/v1/responses" if model_choice == options[1] 
           else "http://localhost:8000/v1/responses")
    
    try:
        response = requests.post(
            URL,
            json={
                "input": messages,
                "stream": True,
                "instructions": instructions,
                "reasoning": {"effort": effort},
                "metadata": {"__debug": debug_mode},
                "tools": tools,
                "temperature": temperature,
                "max_output_tokens": max_output_tokens,
            },
            stream=True,
        )
        
        full_content = ""
        text_delta = ""
        current_output_index = 0
        in_reasoning = False
        
        for line in response.iter_lines(decode_unicode=True):
            if not line or not line.startswith("data:"):
                continue
            data_str = line[len("data:"):].strip()
            if not data_str:
                continue
            
            try:
                data = json.loads(data_str)
            except Exception:
                continue
            
            event_type = data.get("type", "")
            output_index = data.get("output_index", 0)
            
            if event_type == "response.output_item.added":
                current_output_index = output_index
                output_type = data.get("item", {}).get("type", "message")
                text_delta = ""
                
                if output_type == "reasoning":
                    if not in_reasoning:
                        full_content += "🤔 **Thinking...**\n"
                        in_reasoning = True
                elif output_type == "message":
                    if in_reasoning:
                        full_content += "\n\n"
                        in_reasoning = False
                
            elif event_type == "response.reasoning_text.delta":
                delta = data.get("delta", "")
                full_content += delta
                
                # Update last assistant message (idiomatic Gradio pattern)
                history[-1][1] = full_content
                yield history, ""
                
            elif event_type == "response.output_text.delta":
                delta = data.get("delta", "")
                full_content += delta
                
                # Update last assistant message (idiomatic Gradio pattern)  
                history[-1][1] = full_content
                yield history, ""
                
            elif event_type == "response.output_item.done":
                item = data.get("item", {})
                if item.get("type") == "function_call":
                    function_call_text = f"\n\n🔨 Called `{item.get('name')}`\n**Arguments**\n```json\n{item.get('arguments', '')}\n```"
                    full_content += function_call_text
                    
                    # Update last assistant message (idiomatic Gradio pattern)
                    history[-1][1] = full_content
                    yield history, ""
                    
                elif item.get("type") == "web_search_call":
                    web_search_text = f"\n\n🌐 **Web Search**\n```json\n{json.dumps(item.get('action', {}), indent=2)}\n```\n✅ Done"
                    full_content += web_search_text
                    
                    # Update last assistant message (idiomatic Gradio pattern)
                    history[-1][1] = full_content
                    yield history, ""
                    
            elif event_type == "response.completed":
                response_data = data.get("response", {})
                if debug_mode:
                    debug_info = response_data.get("metadata", {}).get("__debug", "")
                    if debug_info:
                        full_content += f"\n\n**Debug**\n```\n{debug_info}\n```"
                        
                        # Update last assistant message (idiomatic Gradio pattern)
                        history[-1][1] = full_content
                        yield history, ""
                break
        
        # Return final history and empty string to clear textbox
        return history, ""
        
    except Exception as e:
        error_message = f"❌ Error: {str(e)}"
        history[-1][1] = error_message
        return history, ""


# Create the Gradio interface
with gr.Blocks(title="💬 Chatbot") as demo:
    gr.Markdown("# 💬 Chatbot")
    
    with gr.Row():
        with gr.Column(scale=3):
            chatbot = gr.Chatbot(height=500)
            
            with gr.Row():
                msg = gr.Textbox(placeholder="Type a message...", scale=4, show_label=False)
                send_btn = gr.Button("Send", scale=1)
            
            clear_btn = gr.Button("Clear Chat")
        
        with gr.Column(scale=1):
            model_choice = gr.Radio(["large", "small"], value="small", label="Model")
            
            instructions = gr.Textbox(
                label="Instructions", 
                value="You are a helpful assistant that can answer questions and help with tasks.",
                lines=3
            )
            
            effort = gr.Radio(["low", "medium", "high"], value="medium", label="Reasoning effort")
            
            gr.Markdown("#### Functions")
            use_functions = gr.Checkbox(label="Use functions", value=False)
            
            with gr.Column(visible=False) as function_group:
                function_name = gr.Textbox(label="Function name", value="get_weather")
                function_description = gr.Textbox(
                    label="Function description", 
                    value="Get the weather for a given city"
                )
                function_parameters = gr.Textbox(
                    label="Function parameters", 
                    value=DEFAULT_FUNCTION_PROPERTIES,
                    lines=6
                )
            
            # Conditional browser search (matching Streamlit logic)
            # In Streamlit: if "show_browser" in st.query_params:
            # For Gradio, we'll always show it (simplified)
            gr.Markdown("#### Built-in Tools") 
            use_browser_search = gr.Checkbox(label="Use browser search", value=False)
            
            temperature = gr.Slider(0.0, 1.0, value=1.0, step=0.01, label="Temperature")
            max_output_tokens = gr.Slider(1000, 20000, value=1024, step=100, label="Max output tokens")
            
            debug_mode = gr.Checkbox(label="Debug mode", value=False)
    
    # Event handlers
    def toggle_function_group(use_funcs):
        return gr.update(visible=use_funcs)
    
    use_functions.change(toggle_function_group, use_functions, function_group)
    
    # Chat functionality
    inputs = [msg, chatbot, model_choice, instructions, effort, use_functions, 
              function_name, function_description, function_parameters,
              use_browser_search, temperature, max_output_tokens, debug_mode]
    
    msg.submit(chat_with_model, inputs, [chatbot, msg])
    send_btn.click(chat_with_model, inputs, [chatbot, msg])
    clear_btn.click(lambda: [], outputs=chatbot)


if __name__ == "__main__":
    demo.launch()

================================================
FILE: examples/reinforcement-fine-tuning.ipynb
================================================
{
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/github/openai/gpt-oss/blob/main/examples/reinforcement-fine-tuning.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Free Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "hzPgFeIkZn9q"
      },
      "source": [
        "# Make gpt-oss play games with Reinforcement Learning\n",
        "\n",
        "This notebook demonstrates how you make `gpt-oss` play the 2048 game autonomously by using reinforcement learning (RL).\n",
        "\n",
        "We will train `gpt-oss-20b` using [Unsloth](https://github.com/unslothai/unsloth) to develop a strategy for playing 2048. The strategy will run until the game ends, and the model will be rewarded or penalized based on whether it wins or loses.\n",
        "\n",
        "<img src=\"https://upload.wikimedia.org/wikipedia/commons/thumb/f/f9/2048_win.png/500px-2048_win.png\" width=300 />"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "31KIMLJLnHET"
      },
      "source": [
        "# Installation\n",
        "To run `gpt-oss-20b` RL on a free Google Colab instance, we’ll use the GRPO algorithm along with [Unsloth](https://docs.unsloth.ai/new/gpt-oss-reinforcement-learning), an open-source tool that enables less VRAM usage and faster training."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "CGoDZwcunHEU"
      },
      "outputs": [],
      "source": [
        "%%capture\n",
        "!pip install --upgrade -qqq uv\n",
        "try: import numpy; get_numpy = f\"numpy=={numpy.__version__}\"\n",
        "except: get_numpy = \"numpy\"\n",
        "!uv pip install -qqq \\\n",
        "    \"torch>=2.8.0\" \"triton>=3.4.0\" {get_numpy} torchvision bitsandbytes \"transformers==4.56.2\" \\\n",
        "    \"unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo\" \\\n",
        "    \"unsloth[base] @ git+https://github.com/unslothai/unsloth\" \\\n",
        "    git+https://github.com/triton-lang/triton.git@05b2c186c1b6c9a08375389d5efe9cb4c401c075#subdirectory=python/triton_kernels\n",
        "!uv pip install --upgrade --no-deps transformers==4.56.2 tokenizers\n",
        "!uv pip install --no-deps trl==0.22.2"
      ]
    },
    {
      "cell_type": "markdown",
      "source": [
        "We'll load gpt-oss-20b and set some parameters:\n",
        "* `max_seq_length = 768` The maximum context length of the model. Increasing it will use more memory, and 768 was the maximum we found to fit on a free 15GB Tesla T4 machine\n",
        "* `lora_rank = 4` The larger this number, the smarter the RL process, but the slower and more memory usage\n",
        "* `load_in_4bit = True` Uses quantization to reduce memory usage by 75% without reducing accuracy that much. `load_in_16bit` will be faster but will need a 80GB GPU (H100, B200)\n",
        "* `offload_embedding = True` Unsloth optimization which moves the embedding to CPU RAM, reducing VRAM by 1GB"
      ],
      "metadata": {
        "id": "CcLYwLyQLADE"
      }
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 575,
          "referenced_widgets": [
            "abe2b0a2913d4633943f44333ae799f8",
            "2c40c6b846924200b29616a590af1672",
            "749e8407a901483c8b513a2fb71596c8",
            "7baca79d720c40b5a923b9717e28c982",
            "68ea891644ca4753a8e1bf278ff47e84",
            "06ab9eaa6f0f48c4b68cff1ca4b9f2fa",
            "d98c2b1e979b4929891a8ee0c11f55df",
            "ef01b874478b4bb497d31d2f8dd6145a",
            "d50ea8cded9848ffa18be1ae6a2559df",
            "ffabf89ecd9d48a5a3fc2a1c855ce080",
            "614c5332c7d045109102a329e7f69dfd",
            "caf742160db041a1b6c2cfdf78f2dc9a",
            "34a9e38b0b454a69a067d1ddadec7626",
            "263b7dc0b3fd465fac89b9266b19d526",
            "5b7af68130f04a63ad3efa3d9f602ebe",
            "2a6aa92676c74509b58373ca604c5b3b",
            "9c4d6839934b4b13952a850d2084d498",
            "c6a1decbc0e7421db622033214913cb9",
            "147743757c804b85af2ef194f5f84e6a",
            "2820e352ab004e818949acc31eb3888d",
            "80fa3aef5e2040d9904c6b87b7214ca0",
            "0f99489932aa409b94ba34764aff19b0",
            "6ab4e5676ad84807a126fffa99f7a0d4",
            "e61ef80398444c13bf7cd20ef21a5057",
            "5ebe7b4e4ed24c53b783ee46377c682d",
            "e0fdef0087bc4a91a11932a2d933c001",
            "596c2a62a635469eb74233ce00586a6f",
            "da4324e287e64e5ba98fc110693066df",
            "8c7c6bb04a3f4a1494b34529f95a195c",
            "51aaa109480d4ae6bd419aea689d22ee",
            "acf4e50a248342f68d26daef21baa419",
            "7d3379cbd27a4218a9d84c5a12f3bb88",
            "7841bc90b6a74120ab3e603c76332a01",
            "3f9b801b52da4eb79f730d87bea5c338",
            "b66c6ded549d4db8a2e5ea8e5016615c",
            "43da5073c3ad4e98a3ade17a0bb3b93d",
            "40365e2c9fef49148e4c93592d458afc",
            "7e9d5212fc7844f286e14b70cbf0bc7a",
            "77d34c0f1de548b4872208a063bb5017",
            "bf96e8666c224c26b0a01451d08e907a",
            "4513a73fa95b41b5b6edadc9143ba9c1",
            "792d75a7d18945e7972826ac5b2ac386",
            "2a6f43b64d164636a2d9708f0190f21b",
            "65c62d2198e64ee4a9e6547c2733135a",
            "219ca32ab51e4b4385b2c1026a78503a",
            "6c2ccfe3363b40b58fc26ea164d4ead4",
            "07f0420c4dfa477caccd7ae96551c2e4",
            "1c96edb2f7c948b9968b1239982af942",
            "d93be4994f104b6e99d89a9e73cd6abd",
            "4da21f53bf7f4e2d8132eb43e6ecc739",
            "735f70fac43449e3974de1b783d56d33",
            "ad75f887a140416abfca615b2fc3c385",
            "dee02a37a6f44f168546ee0077dc20d1",
            "ee23056662ad4b719b65005d776e0e72",
            "87765ca0996b403dbe29deef48d548bf",
            "8db5e86577744ff1a39c8e198eee5dd3",
            "4b9b3fe8dc764eedb9e18f166fe2f548",
            "cca95e973bc445d3811335debf7c446e",
            "e507a46b4c754d9a8aede2aac0d203bc",
            "751a46fbb8e24efabfb381a85c90fbe8",
            "87a808c4d4f54f719adcd29de7206e1b",
            "5f0b2a0e1953406b88af2c884904e2da",
            "2fa84865e9f14c1491402ef81517b4bd",
            "245590db7d374515a428ff4abbd25588",
            "e2973e6c02834a7c9f2f6ce5755f35f0",
            "48741bbdeccb459aa4eea9c61339764b",
            "1183d3f2ad3c4fb0af1d925b5f9e3efe",
            "9cc51d8029eb4217bc37daa918649692",
            "41f13d2f023e405180689e03bc2c32a1",
            "247484c0bf5945bcb4627b48928366c8",
            "14c0f20a9ab341ee966fe77815099ff0",
            "a219f3b89a34443abe612846676f9356",
            "152d7bf2a74f400db3d3ecaa719ef8d1",
            "36676899a61f4be4b631f6271f6ecec9",
            "77ecad9f150c430fa85f5833d97c42df",
            "cef064f1c55f41bf957fc4623260fdb4",
            "37cbe8800af04a42a0355922969b6393",
            "f8dacdab001d4db0b6b3776ac7d3634a",
            "5a59fb5f7acf4213847c985e66c9ee3c",
            "ae6d42fb84fc4984af1d4430acdcd3c9",
            "02d120e49f2c4f95a6090b1d8d521767",
            "8f1e6c36b84c4115a671dcb9ade41c8b",
            "81a728910a2341a785a6f252bbb371f7",
            "69a8d50f11244ba688c183d14d2395ec",
            "350f29f737534bfba4258bc31ec274a2",
            "9beac0680e3049dfafcb6ec185fd2265",
            "dbf5ed93dac646ed979fa7a8c569dfe3",
            "4db5ee5b7b674abba75fbce264e6dfa3",
            "0c0c96eeac664f339aa4511bf47087e2",
            "18451e19df5449b1853b5e13dacd19c5",
            "d864d29d02c54ecfaedd7b866a6df8c2",
            "7875163297284832a35aca84cbb105ce",
            "d42d8228ea1247a1a81bb99b18c4640c",
            "bcda4c9a48e943a6a0ef812fcd64a6db",
            "61e491b843c347b6b2a9948de7caf01d",
            "dee07d33b8de4c3b847fcff670e68102",
            "b07acf871a0a46f1889bfb439d13752b",
            "ba94310dc12a4a258205b14901ad3f94",
            "a93210a691414502ba3c2dff03ffb4ce",
            "fd2fe9ef6da64f72ab29d481d1739f5e",
            "dbfeea8ee2374b8c8fa70431c35f281f",
            "84d27c45065e426badbfcfcdc8ff16b6",
            "fa9ea0d3234e41689c827485d0360885",
            "4cb119127b404f46a53012c62d004e28",
            "d9020a2a2c8440db81d2cfdf0289b667",
            "04d39c4dda9f4a1bb01b8d6320032372",
            "4d67b10ec7794170addb4e968e20f170",
            "55ac5c2a82ee48fe988e1e4f26c168b0",
            "9a079a30b4ae4bbc80122faf83e0ad59",
            "acda8e7582934fecbbf854e66e23f698",
            "4fbc4cfe529d471ba85f3ae8e53b28d6",
            "a0d0fedc5bec4f5b943fddf9a954fbdf",
            "cab602573c6940919f93e59fe6f4838d",
            "51b8f4ce40f94ac39cf44d98f1522ec7",
            "32d6af64f2464cfb965671f2692b4e15",
            "e1e77d98b01f4376a6c075975c27571e",
            "6a47e60b10a6481b94aee021c8dbc7ba",
            "5657a84bf4b74710b2de1a54f9236e39",
            "7bd5d1beeb0e49e293d9f6b91bb6d7fb",
            "60ceb890b5644493a8886d91b9dac461",
            "40138ff29073407abb95f793509fc320",
            "0ac4d8e674804ad6bdc5f2d62f2e0d33",
            "7bfcd9acf29646db8b6123708d1ffe27",
            "5e88d6515f16475fb72d7c153422b591",
            "5e5b77dd649547f896ab306fccc94a4e",
            "a843fa23e6c94fb486bff8764574fdc5",
            "fd0ac7ed3d3146ec85913f4e05c4a2f6",
            "77204d81ff8f4ee585361a503fa647dc",
            "923653dfe90e475a9efa44baf98ba9a0",
            "62600092f8cc43f493b86b0169f67be1",
            "59e46bbe96df4b88ad31c09096ce0e0a",
            "8f5c7b88a2cc4b5abb0814c814833349"
          ]
        },
        "id": "DkIvEkIIkEyB",
        "outputId": "2f85e1d0-8810-4b41-b683-0c33578d991c"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.\n",
            "🦥 Unsloth Zoo will now patch everything to make training faster!\n",
            "==((====))==  Unsloth 2025.10.1: Fast Gpt_Oss patching. Transformers: 4.56.2.\n",
            "   \\\\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.\n",
            "O^O/ \\_/ \\    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0\n",
            "\\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]\n",
            " \"-____-\"     Free license: http://github.com/unslothai/unsloth\n",
            "Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!\n",
            "Unsloth: Using float16 precision for gpt_oss won't work! Using float32.\n"
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "abe2b0a2913d4633943f44333ae799f8",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "model.safetensors.index.json: 0.00B [00:00, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "caf742160db041a1b6c2cfdf78f2dc9a",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "6ab4e5676ad84807a126fffa99f7a0d4",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "model-00001-of-00004.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "3f9b801b52da4eb79f730d87bea5c338",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "model-00004-of-00004.safetensors:   0%|          | 0.00/1.16G [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "219ca32ab51e4b4385b2c1026a78503a",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "model-00002-of-00004.safetensors:   0%|          | 0.00/4.00G [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "8db5e86577744ff1a39c8e198eee5dd3",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "model-00003-of-00004.safetensors:   0%|          | 0.00/3.37G [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "1183d3f2ad3c4fb0af1d925b5f9e3efe",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "f8dacdab001d4db0b6b3776ac7d3634a",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "generation_config.json:   0%|          | 0.00/165 [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Unsloth: Offloading embeddings to RAM to save 1.08 GB.\n"
          ]
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "0c0c96eeac664f339aa4511bf47087e2",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "tokenizer_config.json: 0.00B [00:00, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "fd2fe9ef6da64f72ab29d481d1739f5e",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "tokenizer.json:   0%|          | 0.00/27.9M [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "4fbc4cfe529d471ba85f3ae8e53b28d6",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "special_tokens_map.json:   0%|          | 0.00/446 [00:00<?, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "data": {
            "application/vnd.jupyter.widget-view+json": {
              "model_id": "0ac4d8e674804ad6bdc5f2d62f2e0d33",
              "version_major": 2,
              "version_minor": 0
            },
            "text/plain": [
              "chat_template.jinja: 0.00B [00:00, ?B/s]"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        }
      ],
      "source": [
        "from unsloth import FastLanguageModel\n",
        "import torch\n",
        "max_seq_length = 768 # Can increase for longer RL output\n",
        "lora_rank = 4        # Larger rank = smarter, but slower\n",
        "model, tokenizer = FastLanguageModel.from_pretrained(\n",
        "    model_name = \"unsloth/gpt-oss-20b\", # unsloth/gpt-oss-20b-BF16 for H100s\n",
        "    max_seq_length = max_seq_length,\n",
        "    load_in_4bit = True,      # False for LoRA 16bit. Choose False on H100s\n",
        "    offload_embedding = True, # Reduces VRAM by 1GB\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "TfeUs-lQJDSq"
      },
      "source": [
        "To do efficient RL, we will use LoRA, which allows us to only add 1 to 5% of extra weights to the model for fine-tuning purposes. This allows us to save memory usage by 60% while retaining most accuracy. Read Unsloth's [gpt-oss RL Guide](https://docs.unsloth.ai/new/gpt-oss-reinforcement-learning) for more details."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "8rGa-o3HJCo1",
        "outputId": "6dc27dbf-0c60-4996-8e97-932aab7c14fb"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Unsloth: Making `model.base_model.model.model` require gradients\n"
          ]
        }
      ],
      "source": [
        "model = FastLanguageModel.get_peft_model(\n",
        "    model,\n",
        "    r = lora_rank, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128\n",
        "    target_modules = [\n",
        "        \"q_proj\", \"k_proj\", \"v_proj\", \"o_proj\",\n",
        "        \"gate_proj\", \"up_proj\", \"down_proj\",\n",
        "    ],\n",
        "    lora_alpha = lora_rank*2, # *2 speeds up training\n",
        "    use_gradient_checkpointing = \"unsloth\", # Reduces memory usage\n",
        "    random_state = 3407,\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "N0QnO9_YJBOI"
      },
      "source": [
        "# 2048 game\n",
        "\n",
        "We used GPT-5 to create a variant of the 2048 game. It should output the current game board state, and allow us to advance the game board state with 1 action (up, down, left, right)."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "cellView": "form",
        "id": "D9CI4jtgL5mw"
      },
      "outputs": [],
      "source": [
        "#@title (Collapsible) 2048 Game Implementation\n",
        "from dataclasses import dataclass, field\n",
        "from typing import List, Tuple, Optional\n",
        "import random\n",
        "import copy\n",
        "\n",
        "def _compress_and_merge_row_left(row: List[int]) -> Tuple[List[int], int, bool]:\n",
        "    n = len(row)\n",
        "    tiles = [x for x in row if x != 0]\n",
        "    gained = 0\n",
        "    i = 0\n",
        "    merged = []\n",
        "    while i < len(tiles):\n",
        "        if i + 1 < len(tiles) and tiles[i] == tiles[i + 1]:\n",
        "            v = tiles[i] * 2\n",
        "            gained += v\n",
        "            merged.append(v)\n",
        "            i += 2\n",
        "        else:\n",
        "            merged.append(tiles[i])\n",
        "            i += 1\n",
        "    merged += [0] * (n - len(merged))\n",
        "    changed = merged != row\n",
        "    return merged, gained, changed\n",
        "\n",
        "def _move_left(board: List[List[int]]) -> Tuple[List[List[int]], int, bool]:\n",
        "    changed_any = False\n",
        "    total_gain = 0\n",
        "    new_board = []\n",
        "    for row in board:\n",
        "        new_row, gained, changed = _compress_and_merge_row_left(row)\n",
        "        new_board.append(new_row)\n",
        "        total_gain += gained\n",
        "        changed_any = changed_any or changed\n",
        "    return new_board, total_gain, changed_any\n",
        "\n",
        "def _move_right(board: List[List[int]]) -> Tuple[List[List[int]], int, bool]:\n",
        "    changed_any = False\n",
        "    total_gain = 0\n",
        "    new_board = []\n",
        "    for row in board:\n",
        "        rev = list(reversed(row))\n",
        "        new_rev, gained, changed = _compress_and_merge_row_left(rev)\n",
        "        new_row = list(reversed(new_rev))\n",
        "        new_board.append(new_row)\n",
        "        total_gain += gained\n",
        "        changed_any = changed_any or changed\n",
        "    return new_board, total_gain, changed_any\n",
        "\n",
        "def _transpose(board: List[List[int]]) -> List[List[int]]:\n",
        "    return [list(row) for row in zip(*board)]\n",
        "\n",
        "def _move_up(board: List[List[int]]) -> Tuple[List[List[int]], int, bool]:\n",
        "    t = _transpose(board)\n",
        "    moved, gain, changed = _move_left(t)\n",
        "    return _transpose(moved), gain, changed\n",
        "\n",
        "def _move_down(board: List[List[int]]) -> Tuple[List[List[int]], int, bool]:\n",
        "    t = _transpose(board)\n",
        "    moved, gain, changed = _move_right(t)\n",
        "    return _transpose(moved), gain, changed\n",
        "\n",
        "def _empty_cells(board: List[List[int]]) -> List[Tuple[int, int]]:\n",
        "    size = len(board)\n",
        "    return [(r, c) for r in range(size) for c in range(size) if board[r][c] == 0]\n",
        "\n",
        "def _can_move(board: List[List[int]]) -> bool:\n",
        "    if _empty_cells(board):\n",
        "        return True\n",
        "    size = len(board)\n",
        "    for r in range(size):\n",
        "        for c in range(size - 1):\n",
        "            if board[r][c] == board[r][c + 1]:\n",
        "                return True\n",
        "    for r in range(size - 1):\n",
        "        for c in range(size):\n",
        "            if board[r][c] == board[r + 1][c]:\n",
        "                return True\n",
        "    return False\n",
        "\n",
        "@dataclass\n",
        "class GameBoard:\n",
        "    size: int\n",
        "    seed: Optional[int] = None\n",
        "    target: int = 2048\n",
        "    probability_fours: float = 0.10 # originally spawns (4) 10% of the time!\n",
        "    _rng: random.Random = field(init=False, repr=False)\n",
        "    _board: List[List[int]] = field(init=False, repr=False)\n",
        "    _score: int = field(default=0, init=False, repr=False)\n",
        "    _state: str = field(default=\"ongoing\", init=False, repr=False)\n",
        "\n",
        "    def __post_init__(self):\n",
        "        if self.size < 2:\n",
        "            raise ValueError(\"Board size must be at least 2.\")\n",
        "        self._rng = random.Random(self.seed)\n",
        "        self._board = [[0 for _ in range(self.size)] for _ in range(self.size)]\n",
        "        self._add_random_tile()\n",
        "        self._add_random_tile()\n",
        "        self._update_state_after_change()\n",
        "\n",
        "    class _BoardView:\n",
        "        def __init__(self, game: \"GameBoard\"):\n",
        "            self._game = game\n",
        "        def __iter__(self):\n",
        "            return iter(self._game._board)\n",
        "        def __len__(self):\n",
        "            return len(self._game._board)\n",
        "        def __getitem__(self, idx):\n",
        "            return self._game._board[idx]\n",
        "        def __repr__(self) -> str:\n",
        "            return repr(self._game._board)\n",
        "        __str__ = __repr__\n",
        "        def do_action(self, key: str) -> None:\n",
        "            self._game.do_action(key)\n",
        "        def state(self) -> str:\n",
        "            return self._game.state()\n",
        "        def pretty(self, colors: bool = True, border: bool = True, dot_for_zero: bool = True) -> str:\n",
        "            return self._game._render_pretty(colors=colors, border=border, dot_for_zero=dot_for_zero)\n",
        "\n",
        "    def board(self) -> \"_BoardView\":\n",
        "        return GameBoard._BoardView(self)\n",
        "    def state(self) -> str:\n",
        "        return self._state\n",
        "    def score(self) -> int:\n",
        "        return self._score\n",
        "    def do_action(self, key: str) -> None:\n",
        "        if self._state != \"ongoing\":\n",
        "            return\n",
        "        if not isinstance(key, str) or len(key) == 0:\n",
        "            self._state = \"failed\"\n",
        "            return\n",
        "        k = key.strip().lower()\n",
        "        if k == \"q\":\n",
        "            self._state = \"failed\"\n",
        "            return\n",
        "        move_map = {\"a\": _move_left, \"d\": _move_right, \"w\": _move_up, \"s\": _move_down}\n",
        "        if k not in move_map:\n",
        "            self._state = \"failed\"\n",
        "            return\n",
        "        mover = move_map[k]\n",
        "        new_board, gain, changed = mover(self._board)\n",
        "        if changed:\n",
        "            self._board = new_board\n",
        "            self._score += gain\n",
        "            self._add_random_tile()\n",
        "        self._update_state_after_change()\n",
        "    def _add_random_tile(self) -> bool:\n",
        "        empties = _empty_cells(self._board)\n",
        "        if not empties:\n",
        "            return False\n",
        "        r, c = self._rng.choice(empties)\n",
        "        self._board[r][c] = 4 if self._rng.random() < self.probability_fours else 2\n",
        "        return True\n",
        "    def _update_state_after_change(self) -> None:\n",
        "        if any(self.target in row for row in self._board):\n",
        "            self._state = \"success\"\n",
        "            return\n",
        "        if not _can_move(self._board):\n",
        "            self._state = \"failed\"\n",
        "            return\n",
        "        self._state = \"ongoing\"\n",
        "    def _render_pretty(self, colors: bool = True, border: bool = True, dot_for_zero: bool = True) -> str:\n",
        "        \"\"\"\n",
        "        Pretty-print the board with colors that scale from 0 up to self.target.\n",
        "        Uses ANSI 256-color codes (works in most terminals). Set colors=False to disable.\n",
        "        \"\"\"\n",
        "        import math\n",
        "\n",
        "        b = self._board\n",
        "        mx = max((max(row) for row in b), default=0)\n",
        "        cell_w = max(3, len(str(mx)))\n",
        "\n",
        "        RESET = \"\\x1b[0m\"\n",
        "\n",
        "        # A smooth-ish gradient from cool → warm\n",
        "        # (blue/cyan/green → yellow/orange/red). Tweak or expand as you like.\n",
        "        GRAD = [33, 39, 45, 51, 50, 49, 48, 47, 46, 82, 118, 154, 190, 226, 220, 214, 208, 202, 196]\n",
        "        ZERO_FG = 239  # dim gray\n",
        "\n",
        "        def color_code(v: int) -> str:\n",
        "            if not colors:\n",
        "                return \"\"\n",
        "            if v == 0:\n",
        "                return f\"\\x1b[38;5;{ZERO_FG}m\"\n",
        "            # Normalize by exponent relative to target: r in [0,1]\n",
        "            t = max(2, self.target)  # safety; avoid log2(1)\n",
        "            # Guard: if v is not a power of two or is <1, handle gracefully\n",
        "            try:\n",
        "                r = max(0.0, min(1.0, math.log2(v) / math.log2(t)))\n",
        "            except ValueError:\n",
        "                r = 0.0\n",
        "            idx = int(round(r * (len(GRAD) - 1)))\n",
        "            return f\"\\x1b[38;5;{GRAD[idx]}m\"\n",
        "\n",
        "        def fmt(v: int) -> str:\n",
        "            s = \".\" if (v == 0 and dot_for_zero) else str(v)\n",
        "            s = s.rjust(cell_w)\n",
        "            return color_code(v) + s + (RESET if colors else \"\")\n",
        "\n",
        "        def hline(left: str, mid: str, right: str) -> str:\n",
        "            return left + mid.join(\"─\" * cell_w for _ in range(self.size)) + right\n",
        "\n",
        "        rows = []\n",
        "        if border:\n",
        "            rows.append(hline(\"┌\", \"┬\", \"┐\"))\n",
        "        for r in range(self.size):\n",
        "            content = \"│\".join(fmt(v) for v in b[r])\n",
        "            rows.append((\"│\" + content + \"│\") if border else content)\n",
        "            if border:\n",
        "                rows.append(hline(\"└\" if r == self.size - 1 else \"├\",\n",
        "                                \"┴\" if r == self.size - 1 else \"┼\",\n",
        "                                \"┘\" if r == self.size - 1 else \"┤\"))\n",
        "        return \"\\n\".join(rows)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "4BcaLniVKLpa"
      },
      "source": [
        "For example let's create a board of size 5 X 5 and set the target to 8 instead of 2048.\n",
        "\n",
        "**[NOTE]** 2048 originally spawns a (4) 10% of the time! We can disable this for harder games. See [Wikipedia page](https://en.wikipedia.org/wiki/2048_(video_game)) for more details."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "-M8kGaFRJ2ic",
        "outputId": "fad6c36b-cb16-490f-ad4f-6bf998dd24ab"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "┌───┬───┬───┬───┬───┐\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;48m  2\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;48m  2\u001b[0m│\n",
            "└───┴───┴───┴───┴───┘ ongoing\n"
          ]
        }
      ],
      "source": [
        "game = GameBoard(size = 5, seed = 42, target = 8, probability_fours = 0.10)\n",
        "print(game.board().pretty(), game.state())"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "zclUeNxosv4k",
        "outputId": "ad099448-d1f2-4471-cbc1-f463293e06ba"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "GameBoard(size=5, seed=42, target=8, probability_fours=0.1)"
            ]
          },
          "execution_count": 6,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "game"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "envzrXmjKRff"
      },
      "source": [
        "We'll use WASD for the action space:\n",
        "\n",
        "```\n",
        "   W\n",
        "A  S  D\n",
        "```\n",
        "Also `game.state()` will say `success` if we succeeded in getting the target!"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "b-gSgthFI_wq",
        "outputId": "68af4e66-80c8-4fa0-c7f3-e9ba22923494"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "┌───┬───┬───┬───┬───┐\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;48m  2\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;190m  4\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "└───┴───┴───┴───┴───┘ ongoing\n"
          ]
        }
      ],
      "source": [
        "game.do_action(\"A\")\n",
        "print(game.board().pretty(), game.state())"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "lUDdHKAxvZf8",
        "outputId": "38692fcc-bfa9-47b3-82f8-09bee2842d38"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "┌───┬───┬───┬───┬───┐\n",
            "│\u001b[38;5;190m  4\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;48m  2\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;48m  2\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "└───┴───┴───┴───┴───┘ ongoing\n"
          ]
        }
      ],
      "source": [
        "game.do_action(\"W\")\n",
        "print(game.board().pretty(), game.state())"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "wkTHxvvUvcmO",
        "outputId": "f9447b03-b0eb-443e-e139-607f231c76fe"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "┌───┬───┬───┬───┬───┐\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;190m  4\u001b[0m│\u001b[38;5;48m  2\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;48m  2\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;190m  4\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "└───┴───┴───┴───┴───┘ ongoing\n"
          ]
        }
      ],
      "source": [
        "game.do_action(\"D\")\n",
        "print(game.board().pretty(), game.state())"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "XO8vlL-4vd-K",
        "outputId": "a6f786bf-39d5-4a23-d79b-17ea9e94272c"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "┌───┬───┬───┬───┬───┐\n",
            "│\u001b[38;5;190m  4\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;190m  4\u001b[0m│\u001b[38;5;190m  4\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;190m  4\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "└───┴───┴───┴───┴───┘ ongoing\n"
          ]
        }
      ],
      "source": [
        "game.do_action(\"W\")\n",
        "print(game.board().pretty(), game.state())"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "MEa2ngmrvfNm",
        "outputId": "c27d9fca-55a0-42c4-dae5-bf8e402d7295"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "┌───┬───┬───┬───┬───┐\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;48m  2\u001b[0m│\u001b[38;5;190m  4\u001b[0m│\u001b[38;5;196m  8\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;190m  4\u001b[0m│\n",
            "├───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "└───┴───┴───┴───┴───┘ success\n"
          ]
        }
      ],
      "source": [
        "game.do_action(\"D\")\n",
        "print(game.board().pretty(), game.state())"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "gGL1X29Fy4n5"
      },
      "source": [
        "If we do some other action that's not part of the action space, we will get an error, and the game will not accept anymore actions."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "VZeIHbqoy7yn",
        "outputId": "11d15a8f-f09d-4833-8ef7-3bad0510e618"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "┌───┬───┬───┐\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;190m  4\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;48m  2\u001b[0m│\n",
            "├───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "└───┴───┴───┘ failed\n"
          ]
        }
      ],
      "source": [
        "game = GameBoard(size = 3, seed = 42, target = 8, probability_fours = 0.10)\n",
        "game.do_action(\"AA\") # Not in WASD\n",
        "game.do_action(\"W\")  # Doesn't do anything\n",
        "game.do_action(\"A\")  # Doesn't do anything\n",
        "print(game.board().pretty(), game.state())"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "VR6czU96cpxf"
      },
      "source": [
        "# RL Environment Setup\n",
        "\n",
        "We'll set up a function to accept some strategy that'll emit an action within `WASD` and check the game state.\n",
        "\n",
        "We'll also add a timer to only execute the stratgegy for 2 seconds maximum, otherwise it might never terminate!"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "tdgjnf-8z_kr"
      },
      "outputs": [],
      "source": [
        "from typing import Callable\n",
        "from unsloth import execute_with_time_limit\n",
        "\n",
        "def _execute_strategy(strategy : Callable, game : GameBoard):\n",
        "    assert callable(strategy)\n",
        "\n",
        "    steps = 0\n",
        "    while game.state() == \"ongoing\":\n",
        "        action = strategy(list(game.board()))\n",
        "        steps += 1\n",
        "        if type(action) is not str:\n",
        "            return steps, \"failed\"\n",
        "        game.do_action(action)\n",
        "    return steps, game.state()\n",
        "\n",
        "@execute_with_time_limit(2)\n",
        "def execute_strategy(strategy : Callable, game : GameBoard):\n",
        "    return _execute_strategy(strategy, game)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "ywh0HizI9ayE"
      },
      "source": [
        "Let's make a generic strategy to just hit `W`. We should expect this generic strategy to fail:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5bkhqoZc0IO8",
        "outputId": "149e18be-dae2-4382-817a-620e7b40ebde"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Timed out with error = Timed out after 2s\n"
          ]
        }
      ],
      "source": [
        "def always_move_left(board):\n",
        "    return \"W\"\n",
        "\n",
        "game = GameBoard(size = 8, seed = 42, target = 2048, probability_fours = 0.10)\n",
        "try:\n",
        "    execute_strategy(always_move_left, game)\n",
        "except TimeoutError as e:\n",
        "    print(f\"Timed out with error = {str(e)}\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "dkuHVdB09sgf"
      },
      "source": [
        "To allow longer strategies for gpt-oss-20b Reinforcement Learning, we shall allow a 5 second timer."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "SK-LfzsA9wbW"
      },
      "outputs": [],
      "source": [
        "@execute_with_time_limit(5)\n",
        "def execute_strategy(strategy : Callable, game : GameBoard):\n",
        "    return _execute_strategy(strategy, game)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tRhLV_bZMYxy"
      },
      "source": [
        "# Code Execution\n",
        "\n",
        "To execute and create a new Python function, we first have to check if the function does not call other global variables or cheat. This is called `countering reward hacking` since we don't want the function to cheat.\n",
        "\n",
        "For example the below piece of code is fine, since it only imports Python level functions. We use `check_python_modules`:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "zz80kvg6M4BG",
        "outputId": "f13fdc0d-ddb3-4c4a-cf65-805dfb31dddd"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Only Python imports? True\n",
            "{'stdlib': ['math', 'typing'], 'non_stdlib': [], 'relative_imports': 0}\n"
          ]
        }
      ],
      "source": [
        "from unsloth import check_python_modules\n",
        "\n",
        "sample = \"\"\"\n",
        "def strategy(board):\n",
        "    import math\n",
        "    from typing import Callable\n",
        "    return \"W\"\n",
        "\"\"\"\n",
        "ok, info = check_python_modules(sample)\n",
        "print(\"Only Python imports?\", ok)\n",
        "print(info)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "bZzVWgKQ-VIg"
      },
      "source": [
        "For the below piece of code, since we import `numpy`, we should not allow the execution:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Z89Jw1KB-Ux7",
        "outputId": "1a4cc701-1677-44b9-d44e-3f3f6dfed8d2"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Only Python imports? False\n",
            "{'stdlib': [], 'non_stdlib': ['numpy'], 'relative_imports': 0}\n"
          ]
        }
      ],
      "source": [
        "sample = \"\"\"\n",
        "def strategy(board):\n",
        "    from numpy import matmul\n",
        "    return \"W\"\n",
        "\"\"\"\n",
        "ok, info = check_python_modules(sample)\n",
        "print(\"Only Python imports?\", ok)\n",
        "print(info)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "SDSrjOTLVyQm"
      },
      "source": [
        "We also disallow global variable access. We'll use Unsloth's `create_locked_down_function` function\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "GcmYAmohVqw2",
        "outputId": "bbfcbbb5-8063-42fe-b349-964554317ab8"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "name 'np' is not defined\n"
          ]
        }
      ],
      "source": [
        "from unsloth import create_locked_down_function\n",
        "function = \"\"\"\n",
        "def import_numpy():\n",
        "    np.matmul\n",
        "    print(\"Success\")\n",
        "\"\"\"\n",
        "f = create_locked_down_function(function)\n",
        "try:\n",
        "    f()\n",
        "except Exception as e:\n",
        "    print(str(e))"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "5tJKwLUgZsRq",
        "outputId": "13588c11-6685-4627-b2d4-445bff9799c8"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "60\n"
          ]
        }
      ],
      "source": [
        "from unsloth import create_locked_down_function\n",
        "function = \"\"\"\n",
        "def add(a, b):\n",
        "    def adder(a):\n",
        "        return a + b\n",
        "    return adder(b) + b\n",
        "\"\"\"\n",
        "f = create_locked_down_function(function)\n",
        "try:\n",
        "    print(f(10, 20))\n",
        "except Exception as e:\n",
        "    print(str(e))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "8CzwCyXIPK04"
      },
      "source": [
        "# Data & RL task setup\n",
        "\n",
        "We now have to create a prompt to tell the model to create a strategy for the 2048 game. You can customize this to some other task for another RL task."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "B-2RRE4HMrQO",
        "outputId": "332255d7-1e6a-4cb4-9ede-c8a2f01378fe"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Create a new short 2048 strategy using only native Python code.\n",
            "You are given a list of list of numbers for the current board state.\n",
            "Output one action for \"W\", \"A\", \"S\", \"D\" on what is the optimal next step.\n",
            "Output your new short function in backticks using the format below:\n",
            "```python\n",
            "def strategy(board):\n",
            "    return \"W\" # Example\n",
            "```\n",
            "All helper functions should be inside def strategy. Only output the short function `strategy`.\n"
          ]
        }
      ],
      "source": [
        "prompt = \"\"\"\n",
        "Create a new short 2048 strategy using only native Python code.\n",
        "You are given a list of list of numbers for the current board state.\n",
        "Output one action for \"W\", \"A\", \"S\", \"D\" on what is the optimal next step.\n",
        "Output your new short function in backticks using the format below:\n",
        "```python\n",
        "def strategy(board):\n",
        "    return \"W\" # Example\n",
        "```\n",
        "All helper functions should be inside def strategy. Only output the short function `strategy`.\n",
        "\"\"\".strip()\n",
        "print(prompt)"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "MIdudFUodN4i"
      },
      "source": [
        "First, let's prompt gpt-oss without RL and see how it goes:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "9HJxrS76h3Ds",
        "outputId": "fcfe2220-fc72-4af7-f8a1-f243311b0156"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\n",
            "Knowledge cutoff: 2024-06\n",
            "Current date: 2025-10-05\n",
            "\n",
            "Reasoning: low\n",
            "\n",
            "# Valid channels: analysis, commentary, final. Channel must be included for every message.\n",
            "Calls to these tools must go to the commentary channel: 'functions'.<|end|><|start|>user<|message|>Create a new short 2048 strategy using only native Python code.\n",
            "You are given a list of list of numbers for the current board state.\n",
            "Output one action for \"W\", \"A\", \"S\", \"D\" on what is the optimal next step.\n",
            "Output your new short function in backticks using the format below:\n",
            "```python\n",
            "def strategy(board):\n",
            "    return \"W\" # Example\n",
            "```\n",
            "All helper functions should be inside def strategy. Only output the short function `strategy`.<|end|><|start|>assistant<|channel|>analysis<|message|>We need simple heuristic: return random move or based on empty spaces. Provide function.<|end|><|start|>assistant<|channel|>final<|message|>```python\n",
            "def strategy(board):\n",
            "    # Count empty cells and choose move that reduces board density\n",
            "    empty_counts = {\n",
            "        \"W\": sum(row.count(0) for row in board[1:]),\n",
            "        \"S\": sum(row.count(0) for row in board[:-1]),\n",
            "        \"A\": sum(col.count(0) for col in zip(*board)[1:]),\n",
            "        \"D\": sum(col.count(0) for col in zip(*board)[:-1]),\n",
            "    }\n",
            "    # Prefer the direction with the most empty cells\n",
            "    return max(empty_counts, key=empty_counts.get)\n",
            "```<|return|>\n"
          ]
        }
      ],
      "source": [
        "text = tokenizer.apply_chat_template(\n",
        "    [{\"role\": \"user\", \"content\": prompt}],\n",
        "    tokenize = False,\n",
        "    add_generation_prompt = True,\n",
        "    reasoning_effort = \"low\",\n",
        ")\n",
        "\n",
        "from transformers import TextStreamer\n",
        "_ = model.generate(\n",
        "    **tokenizer(text, return_tensors = \"pt\").to(\"cuda\"),\n",
        "    temperature = 1.0,\n",
        "    max_new_tokens = 512,\n",
        "    streamer = TextStreamer(tokenizer, skip_prompt = False),\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "iknaWZNudTNq"
      },
      "source": [
        "# Reward functions\n",
        "\n",
        "We now design a `extract_function` function which simply extracts the function wrapped in 3 back ticks.\n",
        "\n",
        "And 3 reward functions:\n",
        "\n",
        "1. `function_works` which rewards the model if the strategy is a valid Python function.\n",
        "2. `no_cheating` which checks if the function imported other modules, and if it did, we penalize it.\n",
        "3. `strategy_succeeds` which checks if the game strategy actually succeeds in attaining 2048 after running the auto-generated strategy."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "8JJGXKdJ-Zl_",
        "outputId": "80fd8078-1621-4c64-a906-5204b444addd"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "def strategy(board):\n",
            "    return \"W\" # Example\n"
          ]
        }
      ],
      "source": [
        "def extract_function(text):\n",
        "    if text.count(\"```\") >= 2:\n",
        "        first = text.find(\"```\") + 3\n",
        "        second = text.find(\"```\", first)\n",
        "        fx = text[first : second].strip()\n",
        "        fx = fx.removeprefix(\"python\\n\")\n",
        "        fx = fx[fx.find(\"def\"):]\n",
        "        if fx.startswith(\"def strategy(board):\"): return fx\n",
        "    return None\n",
        "print(extract_function(prompt))"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "KLXEcf_HSJlI"
      },
      "source": [
        "Below is our `function_works` reward function which uses Python's `exec` but guarded by not allowing leakage of local and global variables. We can also use `check_python_modules` first to check if there are errors before even executing the function:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "h3-B0IIsS56S",
        "outputId": "f3e174fa-2fbf-400b-ec7d-87590be3ef68"
      },
      "outputs": [
        {
          "data": {
            "text/plain": [
              "(False,\n",
              " {'error': \"SyntaxError: expected '(' (<unknown>, line 1)\",\n",
              "  'stdlib': [],\n",
              "  'non_stdlib': [],\n",
              "  'relative_imports': 0})"
            ]
          },
          "execution_count": 23,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "ok, info = check_python_modules(\"def a\")\n",
        "ok, info"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "qgFNXORy-lpO"
      },
      "outputs": [],
      "source": [
        "def function_works(completions, **kwargs):\n",
        "    scores = []\n",
        "    for completion in completions:\n",
        "        score = 0\n",
        "        response = completion[0][\"content\"]\n",
        "        function = extract_function(response)\n",
        "        if function is not None:\n",
        "            ok, info = check_python_modules(function)\n",
        "        if function is None or \"error\" in info:\n",
        "            score = -2.0\n",
        "        else:\n",
        "            try:\n",
        "                new_strategy = create_locked_down_function(function)\n",
        "                score = 1.0\n",
        "            except:\n",
        "                score = -0.5\n",
        "        scores.append(score)\n",
        "    return scores"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "Gf69i2WT-m4K"
      },
      "source": [
        "`no_cheating` checks if the function cheated since it might have imported Numpy or other functions:"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "cUfHzCVx-nGK"
      },
      "outputs": [],
      "source": [
        "def no_cheating(completions, **kwargs):\n",
        "    scores = []\n",
        "    for completion in completions:\n",
        "        score = 0\n",
        "        response = completion[0][\"content\"]\n",
        "        function = extract_function(response)\n",
        "        if function is not None:\n",
        "            ok, info = check_python_modules(function)\n",
        "            scores.append(1.0 if ok else -20.0) # Penalize heavily!\n",
        "        else:\n",
        "            scores.append(-1.0) # Failed creating function\n",
        "    return scores"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "slnqWG3FTror"
      },
      "source": [
        "Next `strategy_succeeds` checks if the strategy actually allows the game to terminate. Imagine if the strategy simply returned \"W\" which would fail after a time limit of 10 seconds.\n",
        "\n",
        "We also add a global `PRINTER` to print out the strategy and board state."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "id": "sNi129lYTpZ2"
      },
      "outputs": [],
      "source": [
        "import numpy as np\n",
        "global PRINTER\n",
        "PRINTER = 0\n",
        "def strategy_succeeds(completions, **kwargs):\n",
        "    global PRINTER\n",
        "    scores = []\n",
        "    # Generate a random game board with seed\n",
        "    seed = np.random.randint(10000)\n",
        "    for completion in completions:\n",
        "        printed = False\n",
        "        score = 0\n",
        "        response = completion[0][\"content\"]\n",
        "        function = extract_function(response)\n",
        "        if PRINTER % 5 == 0:\n",
        "            printed = True\n",
        "            print(function)\n",
        "        PRINTER += 1\n",
        "        if function is not None:\n",
        "            ok, info = check_python_modules(function)\n",
        "        if function is None or \"error\" in info:\n",
        "            scores.append(0)\n",
        "            continue\n",
        "        try:\n",
        "            new_strategy = create_locked_down_function(function)\n",
        "        except:\n",
        "            scores.append(0)\n",
        "            continue\n",
        "        try:\n",
        "            game = GameBoard(size = 6, seed = seed, target = 2048, probability_fours = 0.10)\n",
        "            steps, game_state = execute_strategy(new_strategy, game)\n",
        "            print(f\"Steps = {steps} State = {game_state}\")\n",
        "            if printed is False:\n",
        "                print(function)\n",
        "            print(game.board().pretty())\n",
        "            if game_state == \"success\":\n",
        "                scores.append(20.0) # Success - massively reward!\n",
        "            else:\n",
        "                scores.append(2.0) # Failed but function works!\n",
        "        except TimeoutError as e:\n",
        "            print(\"Timeout\")\n",
        "            scores.append(-1.0) # Failed with timeout\n",
        "        except Exception as e:\n",
        "            print(f\"Exception = {str(e)}\")\n",
        "            scores.append(-3.0) # Failed\n",
        "    return scores"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "TCpSxtvSeAG_"
      },
      "source": [
        "We'll now create the dataset which includes a replica of our prompt. Remember to add a reasoning effort of low! You can choose high reasoning mode, but this'll only work on more memory GPUs like H100s."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "Ldf6SjLHVPRv",
        "outputId": "589f7523-9835-49b5-c477-4e1d8b0744ff"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "181\n"
          ]
        },
        {
          "data": {
            "text/plain": [
              "{'prompt': [{'content': 'Create a new short 2048 strategy using only native Python code.\\nYou are given a list of list of numbers for the current board state.\\nOutput one action for \"W\", \"A\", \"S\", \"D\" on what is the optimal next step.\\nOutput your new short function in backticks using the format below:\\n```python\\ndef strategy(board):\\n    return \"W\" # Example\\n```\\nAll helper functions should be inside def strategy. Only output the short function `strategy`.',\n",
              "   'role': 'user'}],\n",
              " 'answer': 0,\n",
              " 'reasoning_effort': 'low'}"
            ]
          },
          "execution_count": 27,
          "metadata": {},
          "output_type": "execute_result"
        }
      ],
      "source": [
        "from datasets import Dataset\n",
        "dataset = Dataset.from_list([{\"prompt\" : [{\"role\": \"user\", \"content\": prompt.strip()}], \"answer\" : 0, \"reasoning_effort\": \"low\"}]*1000)\n",
        "maximum_length = len(tokenizer.apply_chat_template([{\"role\": \"user\", \"content\": prompt.strip()}], add_generation_prompt = True))\n",
        "print(maximum_length)\n",
        "dataset[0]"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "9-IOMhVg-2AM"
      },
      "source": [
        "<a name=\"Train\"></a>\n",
        "### Train the model\n",
        "\n",
        "Now set up GRPO Trainer and all configurations! We also support GSPO, GAPO, Dr GRPO and more! Go the Unsloth [Reinforcement Learning Docs](https://docs.unsloth.ai/get-started/reinforcement-learning-rl-guide) for more options."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "ptqkXK2D4d6p",
        "outputId": "2061b833-5b98-4a2b-e7f5-4bc4652d8300"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Unsloth: We now expect `per_device_train_batch_size` to be a multiple of `num_generations`.\n",
            "We will change the batch size of 1 to the `num_generations` of 2\n"
          ]
        }
      ],
      "source": [
        "max_prompt_length = maximum_length + 1 # + 1 just in case!\n",
        "max_completion_length = max_seq_length - max_prompt_length\n",
        "\n",
        "from trl import GRPOConfig, GRPOTrainer\n",
        "training_args = GRPOConfig(\n",
        "    temperature = 1.0,\n",
        "    learning_rate = 5e-5,\n",
        "    weight_decay = 0.01,\n",
        "    warmup_ratio = 0.1,\n",
        "    lr_scheduler_type = \"linear\",\n",
        "    optim = \"adamw_8bit\",\n",
        "    logging_steps = 1,\n",
        "    per_device_train_batch_size = 1,\n",
        "    gradient_accumulation_steps = 1, # Increase to 4 for smoother training\n",
        "    num_generations = 2, # Decrease if out of memory\n",
        "    max_prompt_length = max_prompt_length,\n",
        "    max_completion_length = max_completion_length,\n",
        "    # num_train_epochs = 1, # Set to 1 for a full training run\n",
        "    max_steps = 1000,\n",
        "    save_steps = 100,\n",
        "    report_to = \"none\", # Can use Weights & Biases, TrackIO\n",
        "    output_dir = \"outputs\",\n",
        "\n",
        "    # For optional training + evaluation\n",
        "    # fp16_full_eval = True,\n",
        "    # per_device_eval_batch_size = 4,\n",
        "    # eval_accumulation_steps = 1,\n",
        "    # eval_strategy = \"steps\",\n",
        "    # eval_steps = 1,\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "r9Mv8UZO5hz-"
      },
      "source": [
        "And let's run the trainer! If you scroll up, you'll see a table of rewards. The goal is to see the `reward` column increase!\n",
        "\n",
        "You might have to wait 150 to 200 steps for any action. You'll probably get 0 reward for the first 100 steps. Please be patient!\n",
        "\n",
        "| Step | Training Loss | reward    | reward_std | completion_length | kl       |\n",
        "|------|---------------|-----------|------------|-------------------|----------|\n",
        "| 1    | 0.000000      | 0.125000  | 0.000000   | 200.000000        | 0.000000 |\n",
        "| 2    | 0.000000      | 0.072375  | 0.248112   | 200.000000        | 0.000000 |\n",
        "| 3    | 0.000000      | -0.079000 | 0.163776   | 182.500000        | 0.000005 |\n"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": null,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "vzOuSVCL_GA9",
        "outputId": "349f907c-cc67-4890-e131-397694679634"
      },
      "outputs": [
        {
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Unsloth: Switching to float32 training since model cannot work with float16\n"
          ]
        }
      ],
      "source": [
        "# For optional training + evaluation\n",
        "# new_dataset = dataset.train_test_split(test_size = 0.01)\n",
        "\n",
        "trainer = GRPOTrainer(\n",
        "    model = model,\n",
        "    processing_class = tokenizer,\n",
        "    reward_funcs = [\n",
        "        function_works,\n",
        "        no_cheating,\n",
        "        strategy_succeeds,\n",
        "    ],\n",
        "    args = training_args,\n",
        "    train_dataset = dataset,\n",
        "\n",
        "    # For optional training + evaluation\n",
        "    # train_dataset = new_dataset[\"train\"],\n",
        "    # eval_dataset = new_dataset[\"test\"],\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "fQhtuwP4cf34"
      },
      "source": [
        "And let's train the model!\n",
        "\n",
        "**NOTE** A T4 free GPU might take 5 minutes for one generation sadly since it's an old GPU - A100 or H100 will be much faster!"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 30,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        },
        "id": "VGRxPdSCcfC3",
        "outputId": "f8bb720c-6d69-4f43-d9d1-a404842d2dff"
      },
      "outputs": [
        {
          "metadata": {
            "tags": null
          },
          "name": "stderr",
          "output_type": "stream",
          "text": [
            "The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'bos_token_id': 199998, 'pad_token_id': 200017}.\n",
            "==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 2\n",
            "   \\\\   /|    Num examples = 1,000 | Num Epochs = 1 | Total steps = 1,000\n",
            "O^O/ \\_/ \\    Batch size per device = 2 | Gradient accumulation steps = 1\n",
            "\\        /    Data Parallel GPUs = 1 | Total batch size (2 x 1 x 1) = 2\n",
            " \"-____-\"     Trainable parameters = 1,990,656 of 20,916,747,840 (0.01% trained)\n",
            "`generation_config` default values have been modified to match model-specific defaults: {'max_length': 131072}. If this is not desired, please set these values explicitly.\n"
          ]
        },
        {
          "metadata": {
            "tags": null
          },
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "None\n",
            "Steps = 1 State = failed\n",
            "def strategy(board):\n",
            "    # simple heuristic: prefer right or down, then left, then up\n",
            "    for move in \"R D L U\".split():\n",
            "        pass\n",
            "┌───┬───┬───┬───┬───┬───┐\n",
            "│\u001b[38;5;45m  2\u001b[0m│\u001b[38;5;45m  2\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "└───┴───┴───┴───┴───┴───┘\n"
          ]
        },
        {
          "data": {
            "text/html": [
              "\n",
              "    <div>\n",
              "      \n",
              "      <progress value='86' max='1000' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
              "      [  86/1000 8:06:01 < 88:08:29, 0.00 it/s, Epoch 0.09/1]\n",
              "    </div>\n",
              "    <table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              " <tr style=\"text-align: left;\">\n",
              "      <th>Step</th>\n",
              "      <th>Training Loss</th>\n",
              "      <th>reward</th>\n",
              "      <th>reward_std</th>\n",
              "      <th>completions / mean_length</th>\n",
              "      <th>completions / min_length</th>\n",
              "      <th>completions / max_length</th>\n",
              "      <th>completions / clipped_ratio</th>\n",
              "      <th>completions / mean_terminated_length</th>\n",
              "      <th>completions / min_terminated_length</th>\n",
              "      <th>completions / max_terminated_length</th>\n",
              "      <th>kl</th>\n",
              "      <th>rewards / function_works / mean</th>\n",
              "      <th>rewards / function_works / std</th>\n",
              "      <th>rewards / no_cheating / mean</th>\n",
              "      <th>rewards / no_cheating / std</th>\n",
              "      <th>rewards / strategy_succeeds / mean</th>\n",
              "      <th>rewards / strategy_succeeds / std</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <td>1</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>4.949748</td>\n",
              "      <td>329.000000</td>\n",
              "      <td>72.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>72.000000</td>\n",
              "      <td>72.000000</td>\n",
              "      <td>72.000000</td>\n",
              "      <td>0.002197</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>1.414214</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>2</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>4.949748</td>\n",
              "      <td>550.500000</td>\n",
              "      <td>515.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>515.000000</td>\n",
              "      <td>515.000000</td>\n",
              "      <td>515.000000</td>\n",
              "      <td>0.000298</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>1.414214</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>3</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>538.000000</td>\n",
              "      <td>490.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>490.000000</td>\n",
              "      <td>490.000000</td>\n",
              "      <td>490.000000</td>\n",
              "      <td>0.000276</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-1.500000</td>\n",
              "      <td>2.121320</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>4</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>2.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>325.000000</td>\n",
              "      <td>120.000000</td>\n",
              "      <td>530.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>325.000000</td>\n",
              "      <td>120.000000</td>\n",
              "      <td>530.000000</td>\n",
              "      <td>0.000568</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>5</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>437.000000</td>\n",
              "      <td>288.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>288.000000</td>\n",
              "      <td>288.000000</td>\n",
              "      <td>288.000000</td>\n",
              "      <td>0.001381</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-1.500000</td>\n",
              "      <td>2.121320</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>6</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>308.500000</td>\n",
              "      <td>301.000000</td>\n",
              "      <td>316.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>308.500000</td>\n",
              "      <td>301.000000</td>\n",
              "      <td>316.000000</td>\n",
              "      <td>0.000826</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-3.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>7</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>519.000000</td>\n",
              "      <td>452.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>452.000000</td>\n",
              "      <td>452.000000</td>\n",
              "      <td>452.000000</td>\n",
              "      <td>0.000223</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>8</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>333.500000</td>\n",
              "      <td>81.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>81.000000</td>\n",
              "      <td>81.000000</td>\n",
              "      <td>81.000000</td>\n",
              "      <td>0.001181</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>9</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>568.500000</td>\n",
              "      <td>551.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>551.000000</td>\n",
              "      <td>551.000000</td>\n",
              "      <td>551.000000</td>\n",
              "      <td>0.000281</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>10</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-3.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000153</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>11</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>2.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>330.000000</td>\n",
              "      <td>264.000000</td>\n",
              "      <td>396.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>330.000000</td>\n",
              "      <td>264.000000</td>\n",
              "      <td>396.000000</td>\n",
              "      <td>0.004015</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>12</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>374.500000</td>\n",
              "      <td>360.000000</td>\n",
              "      <td>389.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>374.500000</td>\n",
              "      <td>360.000000</td>\n",
              "      <td>389.000000</td>\n",
              "      <td>0.000245</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>13</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>520.500000</td>\n",
              "      <td>455.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>455.000000</td>\n",
              "      <td>455.000000</td>\n",
              "      <td>455.000000</td>\n",
              "      <td>0.000915</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>14</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>406.500000</td>\n",
              "      <td>227.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>227.000000</td>\n",
              "      <td>227.000000</td>\n",
              "      <td>227.000000</td>\n",
              "      <td>0.007664</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>15</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>348.500000</td>\n",
              "      <td>302.000000</td>\n",
              "      <td>395.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>348.500000</td>\n",
              "      <td>302.000000</td>\n",
              "      <td>395.000000</td>\n",
              "      <td>0.002411</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>16</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>408.000000</td>\n",
              "      <td>379.000000</td>\n",
              "      <td>437.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>408.000000</td>\n",
              "      <td>379.000000</td>\n",
              "      <td>437.000000</td>\n",
              "      <td>0.002496</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>17</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-12.500000</td>\n",
              "      <td>13.435029</td>\n",
              "      <td>493.000000</td>\n",
              "      <td>400.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>400.000000</td>\n",
              "      <td>400.000000</td>\n",
              "      <td>400.000000</td>\n",
              "      <td>0.009901</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-10.500000</td>\n",
              "      <td>13.435029</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>18</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>413.000000</td>\n",
              "      <td>260.000000</td>\n",
              "      <td>566.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>413.000000</td>\n",
              "      <td>260.000000</td>\n",
              "      <td>566.000000</td>\n",
              "      <td>0.021275</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>19</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>487.500000</td>\n",
              "      <td>389.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>389.000000</td>\n",
              "      <td>389.000000</td>\n",
              "      <td>389.000000</td>\n",
              "      <td>0.019204</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>20</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.001022</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-1.500000</td>\n",
              "      <td>2.121320</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>21</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>397.500000</td>\n",
              "      <td>276.000000</td>\n",
              "      <td>519.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>397.500000</td>\n",
              "      <td>276.000000</td>\n",
              "      <td>519.000000</td>\n",
              "      <td>0.027686</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>22</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>499.500000</td>\n",
              "      <td>486.000000</td>\n",
              "      <td>513.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>499.500000</td>\n",
              "      <td>486.000000</td>\n",
              "      <td>513.000000</td>\n",
              "      <td>0.007218</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>23</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.250000</td>\n",
              "      <td>2.474874</td>\n",
              "      <td>575.500000</td>\n",
              "      <td>565.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>565.000000</td>\n",
              "      <td>565.000000</td>\n",
              "      <td>565.000000</td>\n",
              "      <td>0.005928</td>\n",
              "      <td>-1.250000</td>\n",
              "      <td>1.060660</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>24</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>563.500000</td>\n",
              "      <td>541.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>541.000000</td>\n",
              "      <td>541.000000</td>\n",
              "      <td>541.000000</td>\n",
              "      <td>0.008769</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-1.500000</td>\n",
              "      <td>2.121320</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>25</td>\n",
              "      <td>0.000100</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>444.500000</td>\n",
              "      <td>303.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>303.000000</td>\n",
              "      <td>303.000000</td>\n",
              "      <td>303.000000</td>\n",
              "      <td>0.084963</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>26</td>\n",
              "      <td>0.000100</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>419.000000</td>\n",
              "      <td>252.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>252.000000</td>\n",
              "      <td>252.000000</td>\n",
              "      <td>252.000000</td>\n",
              "      <td>0.114125</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-1.500000</td>\n",
              "      <td>2.121320</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>27</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>339.500000</td>\n",
              "      <td>321.000000</td>\n",
              "      <td>358.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>339.500000</td>\n",
              "      <td>321.000000</td>\n",
              "      <td>358.000000</td>\n",
              "      <td>0.033457</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>28</td>\n",
              "      <td>0.000100</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>372.500000</td>\n",
              "      <td>311.000000</td>\n",
              "      <td>434.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>372.500000</td>\n",
              "      <td>311.000000</td>\n",
              "      <td>434.000000</td>\n",
              "      <td>0.081829</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>29</td>\n",
              "      <td>0.000100</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>387.500000</td>\n",
              "      <td>336.000000</td>\n",
              "      <td>439.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>387.500000</td>\n",
              "      <td>336.000000</td>\n",
              "      <td>439.000000</td>\n",
              "      <td>0.100017</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>30</td>\n",
              "      <td>0.000100</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>463.000000</td>\n",
              "      <td>410.000000</td>\n",
              "      <td>516.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>463.000000</td>\n",
              "      <td>410.000000</td>\n",
              "      <td>516.000000</td>\n",
              "      <td>0.095180</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>31</td>\n",
              "      <td>0.000300</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>445.500000</td>\n",
              "      <td>305.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>305.000000</td>\n",
              "      <td>305.000000</td>\n",
              "      <td>305.000000</td>\n",
              "      <td>0.321803</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>32</td>\n",
              "      <td>0.000300</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>425.000000</td>\n",
              "      <td>310.000000</td>\n",
              "      <td>540.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>425.000000</td>\n",
              "      <td>310.000000</td>\n",
              "      <td>540.000000</td>\n",
              "      <td>0.335011</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>33</td>\n",
              "      <td>0.000400</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>458.500000</td>\n",
              "      <td>331.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>331.000000</td>\n",
              "      <td>331.000000</td>\n",
              "      <td>331.000000</td>\n",
              "      <td>0.362238</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>34</td>\n",
              "      <td>0.000500</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>347.500000</td>\n",
              "      <td>207.000000</td>\n",
              "      <td>488.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>347.500000</td>\n",
              "      <td>207.000000</td>\n",
              "      <td>488.000000</td>\n",
              "      <td>0.518291</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>35</td>\n",
              "      <td>0.000400</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>471.000000</td>\n",
              "      <td>356.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>356.000000</td>\n",
              "      <td>356.000000</td>\n",
              "      <td>356.000000</td>\n",
              "      <td>0.383606</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-1.500000</td>\n",
              "      <td>2.121320</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>36</td>\n",
              "      <td>0.000700</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>393.000000</td>\n",
              "      <td>200.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>200.000000</td>\n",
              "      <td>200.000000</td>\n",
              "      <td>200.000000</td>\n",
              "      <td>0.674902</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>37</td>\n",
              "      <td>0.000700</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>344.500000</td>\n",
              "      <td>198.000000</td>\n",
              "      <td>491.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>344.500000</td>\n",
              "      <td>198.000000</td>\n",
              "      <td>491.000000</td>\n",
              "      <td>0.689294</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>38</td>\n",
              "      <td>0.000600</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>473.500000</td>\n",
              "      <td>361.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>361.000000</td>\n",
              "      <td>361.000000</td>\n",
              "      <td>361.000000</td>\n",
              "      <td>0.607979</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>39</td>\n",
              "      <td>0.000100</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>380.000000</td>\n",
              "      <td>361.000000</td>\n",
              "      <td>399.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>380.000000</td>\n",
              "      <td>361.000000</td>\n",
              "      <td>399.000000</td>\n",
              "      <td>0.142165</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>40</td>\n",
              "      <td>0.000300</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>386.500000</td>\n",
              "      <td>352.000000</td>\n",
              "      <td>421.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>386.500000</td>\n",
              "      <td>352.000000</td>\n",
              "      <td>421.000000</td>\n",
              "      <td>0.293521</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>41</td>\n",
              "      <td>0.000500</td>\n",
              "      <td>-10.500000</td>\n",
              "      <td>16.263456</td>\n",
              "      <td>107.500000</td>\n",
              "      <td>89.000000</td>\n",
              "      <td>126.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>107.500000</td>\n",
              "      <td>89.000000</td>\n",
              "      <td>126.000000</td>\n",
              "      <td>0.465591</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>-9.500000</td>\n",
              "      <td>14.849242</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>42</td>\n",
              "      <td>0.000300</td>\n",
              "      <td>-0.250000</td>\n",
              "      <td>1.060660</td>\n",
              "      <td>410.000000</td>\n",
              "      <td>373.000000</td>\n",
              "      <td>447.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>410.000000</td>\n",
              "      <td>373.000000</td>\n",
              "      <td>447.000000</td>\n",
              "      <td>0.314028</td>\n",
              "      <td>0.250000</td>\n",
              "      <td>1.060660</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.500000</td>\n",
              "      <td>2.121320</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>43</td>\n",
              "      <td>0.000800</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>473.000000</td>\n",
              "      <td>360.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>360.000000</td>\n",
              "      <td>360.000000</td>\n",
              "      <td>360.000000</td>\n",
              "      <td>0.753577</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>44</td>\n",
              "      <td>0.000400</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>528.500000</td>\n",
              "      <td>471.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>471.000000</td>\n",
              "      <td>471.000000</td>\n",
              "      <td>471.000000</td>\n",
              "      <td>0.370155</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>45</td>\n",
              "      <td>0.000600</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>360.000000</td>\n",
              "      <td>293.000000</td>\n",
              "      <td>427.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>360.000000</td>\n",
              "      <td>293.000000</td>\n",
              "      <td>427.000000</td>\n",
              "      <td>0.609444</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>46</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>581.500000</td>\n",
              "      <td>577.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>577.000000</td>\n",
              "      <td>577.000000</td>\n",
              "      <td>577.000000</td>\n",
              "      <td>0.021817</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>47</td>\n",
              "      <td>0.000900</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>466.500000</td>\n",
              "      <td>347.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>347.000000</td>\n",
              "      <td>347.000000</td>\n",
              "      <td>347.000000</td>\n",
              "      <td>0.863071</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>48</td>\n",
              "      <td>0.000700</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>495.000000</td>\n",
              "      <td>404.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>404.000000</td>\n",
              "      <td>404.000000</td>\n",
              "      <td>404.000000</td>\n",
              "      <td>0.727124</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>49</td>\n",
              "      <td>0.000200</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>558.500000</td>\n",
              "      <td>531.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>531.000000</td>\n",
              "      <td>531.000000</td>\n",
              "      <td>531.000000</td>\n",
              "      <td>0.173142</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-1.500000</td>\n",
              "      <td>2.121320</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>50</td>\n",
              "      <td>0.000100</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>477.000000</td>\n",
              "      <td>465.000000</td>\n",
              "      <td>489.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>477.000000</td>\n",
              "      <td>465.000000</td>\n",
              "      <td>489.000000</td>\n",
              "      <td>0.089374</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>51</td>\n",
              "      <td>0.001400</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>367.500000</td>\n",
              "      <td>149.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>149.000000</td>\n",
              "      <td>149.000000</td>\n",
              "      <td>149.000000</td>\n",
              "      <td>1.374907</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>52</td>\n",
              "      <td>0.000900</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>458.500000</td>\n",
              "      <td>331.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>331.000000</td>\n",
              "      <td>331.000000</td>\n",
              "      <td>331.000000</td>\n",
              "      <td>0.929248</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-1.500000</td>\n",
              "      <td>2.121320</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>53</td>\n",
              "      <td>0.000900</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>475.000000</td>\n",
              "      <td>364.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>364.000000</td>\n",
              "      <td>364.000000</td>\n",
              "      <td>364.000000</td>\n",
              "      <td>0.887930</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>54</td>\n",
              "      <td>0.000100</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>439.000000</td>\n",
              "      <td>424.000000</td>\n",
              "      <td>454.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>439.000000</td>\n",
              "      <td>424.000000</td>\n",
              "      <td>454.000000</td>\n",
              "      <td>0.126352</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>55</td>\n",
              "      <td>0.000400</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>323.500000</td>\n",
              "      <td>293.000000</td>\n",
              "      <td>354.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>323.500000</td>\n",
              "      <td>293.000000</td>\n",
              "      <td>354.000000</td>\n",
              "      <td>0.367167</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>56</td>\n",
              "      <td>0.000400</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>543.000000</td>\n",
              "      <td>500.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>543.000000</td>\n",
              "      <td>500.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.375893</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>57</td>\n",
              "      <td>0.000700</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>382.000000</td>\n",
              "      <td>317.000000</td>\n",
              "      <td>447.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>382.000000</td>\n",
              "      <td>317.000000</td>\n",
              "      <td>447.000000</td>\n",
              "      <td>0.687571</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>58</td>\n",
              "      <td>0.000600</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>506.000000</td>\n",
              "      <td>426.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>426.000000</td>\n",
              "      <td>426.000000</td>\n",
              "      <td>426.000000</td>\n",
              "      <td>0.648271</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>59</td>\n",
              "      <td>0.001100</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>260.500000</td>\n",
              "      <td>187.000000</td>\n",
              "      <td>334.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>260.500000</td>\n",
              "      <td>187.000000</td>\n",
              "      <td>334.000000</td>\n",
              "      <td>1.084255</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>60</td>\n",
              "      <td>0.000200</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>523.500000</td>\n",
              "      <td>495.000000</td>\n",
              "      <td>552.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>523.500000</td>\n",
              "      <td>495.000000</td>\n",
              "      <td>552.000000</td>\n",
              "      <td>0.198019</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>61</td>\n",
              "      <td>0.001000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>471.500000</td>\n",
              "      <td>357.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>357.000000</td>\n",
              "      <td>357.000000</td>\n",
              "      <td>357.000000</td>\n",
              "      <td>0.987108</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>62</td>\n",
              "      <td>0.000400</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>532.000000</td>\n",
              "      <td>478.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>478.000000</td>\n",
              "      <td>478.000000</td>\n",
              "      <td>478.000000</td>\n",
              "      <td>0.428900</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>63</td>\n",
              "      <td>0.000100</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>411.000000</td>\n",
              "      <td>400.000000</td>\n",
              "      <td>422.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>411.000000</td>\n",
              "      <td>400.000000</td>\n",
              "      <td>422.000000</td>\n",
              "      <td>0.107686</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-3.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>64</td>\n",
              "      <td>0.001000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>470.500000</td>\n",
              "      <td>355.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>355.000000</td>\n",
              "      <td>355.000000</td>\n",
              "      <td>355.000000</td>\n",
              "      <td>0.967091</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-1.500000</td>\n",
              "      <td>2.121320</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>65</td>\n",
              "      <td>0.000300</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>553.000000</td>\n",
              "      <td>520.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>520.000000</td>\n",
              "      <td>520.000000</td>\n",
              "      <td>520.000000</td>\n",
              "      <td>0.262037</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-1.500000</td>\n",
              "      <td>2.121320</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>66</td>\n",
              "      <td>0.000400</td>\n",
              "      <td>2.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>471.500000</td>\n",
              "      <td>423.000000</td>\n",
              "      <td>520.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>471.500000</td>\n",
              "      <td>423.000000</td>\n",
              "      <td>520.000000</td>\n",
              "      <td>0.414690</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>67</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>580.500000</td>\n",
              "      <td>575.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>575.000000</td>\n",
              "      <td>575.000000</td>\n",
              "      <td>575.000000</td>\n",
              "      <td>0.035250</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>68</td>\n",
              "      <td>0.001200</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>435.000000</td>\n",
              "      <td>284.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>284.000000</td>\n",
              "      <td>284.000000</td>\n",
              "      <td>284.000000</td>\n",
              "      <td>1.168353</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>69</td>\n",
              "      <td>0.000800</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>492.000000</td>\n",
              "      <td>398.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>398.000000</td>\n",
              "      <td>398.000000</td>\n",
              "      <td>398.000000</td>\n",
              "      <td>0.789415</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>70</td>\n",
              "      <td>0.000700</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>291.500000</td>\n",
              "      <td>240.000000</td>\n",
              "      <td>343.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>291.500000</td>\n",
              "      <td>240.000000</td>\n",
              "      <td>343.000000</td>\n",
              "      <td>0.723002</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>71</td>\n",
              "      <td>0.001000</td>\n",
              "      <td>-10.500000</td>\n",
              "      <td>16.263456</td>\n",
              "      <td>407.000000</td>\n",
              "      <td>301.000000</td>\n",
              "      <td>513.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>407.000000</td>\n",
              "      <td>301.000000</td>\n",
              "      <td>513.000000</td>\n",
              "      <td>0.958203</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>-9.500000</td>\n",
              "      <td>14.849242</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>72</td>\n",
              "      <td>0.000900</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>362.500000</td>\n",
              "      <td>279.000000</td>\n",
              "      <td>446.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>362.500000</td>\n",
              "      <td>279.000000</td>\n",
              "      <td>446.000000</td>\n",
              "      <td>0.902191</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>73</td>\n",
              "      <td>0.000100</td>\n",
              "      <td>0.750000</td>\n",
              "      <td>0.353553</td>\n",
              "      <td>479.000000</td>\n",
              "      <td>466.000000</td>\n",
              "      <td>492.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>479.000000</td>\n",
              "      <td>466.000000</td>\n",
              "      <td>492.000000</td>\n",
              "      <td>0.102604</td>\n",
              "      <td>0.250000</td>\n",
              "      <td>1.060660</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>74</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>579.000000</td>\n",
              "      <td>572.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>572.000000</td>\n",
              "      <td>572.000000</td>\n",
              "      <td>572.000000</td>\n",
              "      <td>0.049443</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-1.500000</td>\n",
              "      <td>2.121320</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>75</td>\n",
              "      <td>0.000200</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>530.500000</td>\n",
              "      <td>507.000000</td>\n",
              "      <td>554.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>530.500000</td>\n",
              "      <td>507.000000</td>\n",
              "      <td>554.000000</td>\n",
              "      <td>0.173276</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>76</td>\n",
              "      <td>0.000500</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>401.000000</td>\n",
              "      <td>353.000000</td>\n",
              "      <td>449.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>401.000000</td>\n",
              "      <td>353.000000</td>\n",
              "      <td>449.000000</td>\n",
              "      <td>0.522857</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>77</td>\n",
              "      <td>0.000300</td>\n",
              "      <td>0.750000</td>\n",
              "      <td>0.353553</td>\n",
              "      <td>512.500000</td>\n",
              "      <td>473.000000</td>\n",
              "      <td>552.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>512.500000</td>\n",
              "      <td>473.000000</td>\n",
              "      <td>552.000000</td>\n",
              "      <td>0.271977</td>\n",
              "      <td>0.250000</td>\n",
              "      <td>1.060660</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>78</td>\n",
              "      <td>0.000200</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>432.500000</td>\n",
              "      <td>411.000000</td>\n",
              "      <td>454.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>432.500000</td>\n",
              "      <td>411.000000</td>\n",
              "      <td>454.000000</td>\n",
              "      <td>0.181327</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>79</td>\n",
              "      <td>0.000200</td>\n",
              "      <td>10.500000</td>\n",
              "      <td>16.263456</td>\n",
              "      <td>475.000000</td>\n",
              "      <td>452.000000</td>\n",
              "      <td>498.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>475.000000</td>\n",
              "      <td>452.000000</td>\n",
              "      <td>498.000000</td>\n",
              "      <td>0.200004</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>8.500000</td>\n",
              "      <td>16.263456</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>80</td>\n",
              "      <td>0.000600</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>341.000000</td>\n",
              "      <td>296.000000</td>\n",
              "      <td>386.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>341.000000</td>\n",
              "      <td>296.000000</td>\n",
              "      <td>386.000000</td>\n",
              "      <td>0.606937</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-2.000000</td>\n",
              "      <td>1.414214</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>81</td>\n",
              "      <td>0.000200</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>456.500000</td>\n",
              "      <td>428.000000</td>\n",
              "      <td>485.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>456.500000</td>\n",
              "      <td>428.000000</td>\n",
              "      <td>485.000000</td>\n",
              "      <td>0.235978</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>82</td>\n",
              "      <td>0.000800</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>407.000000</td>\n",
              "      <td>326.000000</td>\n",
              "      <td>488.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>407.000000</td>\n",
              "      <td>326.000000</td>\n",
              "      <td>488.000000</td>\n",
              "      <td>0.825952</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>0.000000</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>83</td>\n",
              "      <td>0.000200</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>557.500000</td>\n",
              "      <td>529.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>529.000000</td>\n",
              "      <td>529.000000</td>\n",
              "      <td>529.000000</td>\n",
              "      <td>0.239547</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <td>84</td>\n",
              "      <td>0.001600</td>\n",
              "      <td>-1.000000</td>\n",
              "      <td>2.828427</td>\n",
              "      <td>368.500000</td>\n",
              "      <td>151.000000</td>\n",
              "      <td>586.000000</td>\n",
              "      <td>0.500000</td>\n",
              "      <td>151.000000</td>\n",
              "      <td>151.000000</td>\n",
              "      <td>151.000000</td>\n",
              "      <td>1.608883</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>2.121320</td>\n",
              "      <td>0.000000</td>\n",
              "      <td>1.414214</td>\n",
              "      <td>-0.500000</td>\n",
              "      <td>0.707107</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table><p>"
            ],
            "text/plain": [
              "<IPython.core.display.HTML object>"
            ]
          },
          "metadata": {},
          "output_type": "display_data"
        },
        {
          "metadata": {
            "tags": null
          },
          "name": "stdout",
          "output_type": "stream",
          "text": [
            "Steps = 1 State = failed\n",
            "def strategy(board):\n",
            "    # Helper: simulate a move, return new board and score\n",
            "    def simulate(board, dir):\n",
            "        n = len(board)\n",
            "        new = [[0]*n for _ in range(n)]\n",
            "        score = 0\n",
            "        for i in range(n):\n",
            "            # extract line\n",
            "            if dir == 'A':\n",
            "                line = [board[i][j] for j in range(n)]\n",
            "                rev = False\n",
            "            elif dir == 'D':\n",
            "                line = [board[i][j] for j in range(n-1, -1, -1)]\n",
            "                rev = True\n",
            "            elif dir == 'W':\n",
            "                line = [board[j][i] for j in range(n)]\n",
            "                rev = False\n",
            "            else:  # 'S'\n",
            "                line = [board[j][i] for j in range(n-1, -1, -1)]\n",
            "                rev = True\n",
            "            # compress and merge\n",
            "            new_line = [x for x in line if x != 0]\n",
            "            merged = []\n",
            "            j = 0\n",
            "            while j < len(new_line):\n",
            "                if j + 1 < len(new_line) and new_line[j] == new_line[j+1]:\n",
            "                    merged.append(new_line[j]*2)\n",
            "                    score += new_line[j]*2\n",
            "                    j += 2\n",
            "                else:\n",
            "                    merged.append(new_line[j])\n",
            "                    j += 1\n",
            "            # fill with zeros\n",
            "            merged += [0]*(n-len(merged))\n",
            "            # place back\n",
            "            if rev:\n",
            "                merged = merged[::-1]\n",
            "            if dir in ('A','D'):\n",
            "                for j in range(n):\n",
            "                    new[i][j] = merged[j]\n",
            "            else:\n",
            "                for j in range(n):\n",
            "                    new[j][i] = merged[j]\n",
            "        return new, score\n",
            "\n",
            "    best, best_dir = 0, None\n",
            "    for dir in ('W','A','S','D'):\n",
            "        _, score = simulate(board, dir)\n",
            "        if score > best:\n",
            "            best, best_dir = score, dir\n",
            "    return best_dir  # returns one of 'W','A','S','D'\n",
            "┌───┬───┬───┬───┬───┬───┐\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;51m  4\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;45m  2\u001b[0m│\n",
            "├───┼───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "├───┼───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\u001b[38;5;239m  .\u001b[0m│\n",
            "└───┴───┴───┴───┴───┴───┘\n",
            "Unsloth: Will smartly offload gradients to save VRAM!\n",
            "def strategy(board):\n",
            "    # helpers\n",
            "    def move(b, d):\n",
            "        n = len(b)\n",
            "        def compress(row):\n",
            "            new = [x for x in row if x!=0]\n",
            "            for i in range(len(new)-1):\n",
            "                if new[i]==new[i+1]:\n",
            "                    new[i]*=2; new[i+1]=0\n",
            "            return [x for x in new if x!=0]+[0]*(n-len(new))\n",
            "        res=[[0]*n for _ in range(n)]\n",
            "        if d==\"W\":\n",
            "            for j in range(n):\n",
            "                col=[b[i][j] for i in range(n)]\n",
            "                col=compress(col)\n",
            "                for i in range(n):\n",
            "                    res[i][j]=col[i]\n",
            "        elif d==\"S\":\n",
            "            for j in range(n):\n",
            "                col=[b[i][j] for i in range(n)][::-1]\n",
            "                col=compress(col)\n",
            "                col=col[::-1]\n",
            "                for i in range(n):\n",
            "                    res[i][j]=col[i]\n",
            "        elif d==\"A\":\n",
            "            for i in range(n):\n",
            "                row=compress(b[i])\n",
            "                res[i]=row\n",
            "        elif d==\"D\":\n",
            "            for i in range(n):\n",
            "                row=compress(b[i][::-1])\n",
            "                row=row[::-1]\n",
            "                res[i]=row\n",
            "        return res\n",
            "\n",
            "    def score(b):\n",
            "        return sum(sum(row) for row in b)\n",
            "\n",
            "    moves=\"WASD\"\n",
            "    best=None; best_val=-1\n",
            "    for m in moves:\n",
            "        nb=move(board, m)\n",
            "        val=score(nb)\n",
            "        if val>best_val and any(nb[i][j]!=board[i][j] for i in range(len(nb)) for j in range(len(nb[0]))):\n",
            "            best_val=val; best=m\n",
            "    return best if best else \"W\"\n",
            "Exception = list index out of range\n",
            "Timeout\n",
            "Steps = 475 State = failed\n",
            "def strategy(board):\n",
            "    def move_possible(board, direction):\n",
            "        rows, cols = len(board), len(board[0])\n",
            "        if direction == 'W':\n",
            "            for j in range(cols):\n",
            "                for i in range(1, rows):\n",
            "                    if board[i][j] != 0:\n",
            "                        for k in range(i-1, -1, -1):\n",
            "                            if board[k][j] == 0 or board[k][j] == board[i][j]:\n",
            "                                return True\n",
            "                            if board[k][j] != 0:\n",
            "                                break\n",
            "        elif direction == 'S':\n",
            "            for j in range(cols):\n",
            "                for i in range(rows-2, -1, -1):\n",
            "                    if board[i][j] != 0:\n",
            "                        for k in range(i+1, rows):\n",
            "                            if board[k][j] == 0 or board[k][j] == board[i][j]:\n",
            "                                return True\n",
            "                            if board[k][j] != 0:\n",
            "                                break\n",
            "        elif direction == 'A':\n",
            "            for i in range(rows):\n",
            "                for j in range(1, cols):\n",
            "                    if board[i][j] != 0:\n",
            "                        for k in range(j-1, -1, -1):\n",
            "                            if board[i][k] == 0 or board[i][k] == board[i][j]:\n",
            "                                return True\n",
            "                            if board[i][k] != 0:\n",
            "                                break\n",
            "        elif direction == 'D':\n",
            "            for i in range(rows):\n",
            "                for j in range(cols-2, -1, -1):\n",
            "                    if board[i][j] != 0:\n",
            "                        for k in range(j+1, cols):\n",
            "                            if board[i][k] == 0 or board[i][k] == board[i][j]:\n",
            "                                return True\n",
            "                            if board[i][k] != 0:\n",
            "                                break\n",
            "        return False\n",
            "\n",
            "    # Prefer moves that allow a merge as they increase score\n",
            "    for d in ('W', 'S', 'A', 'D'):\n",
            "        if move_possible(board, d):\n",
            "            return d\n",
            "    # If no merges are possible, pick any direction that moves tiles\n",
            "    for d in ('W', 'S', 'A', 'D'):\n",
            "        if any(board[i][j] != 0 for i in range(len(board)) for j in range(len(board[0]))):\n",
            "            return d\n",
            "    return 'W'\n",
            "┌───┬───┬───┬───┬───┬───┐\n",
            "│\u001b[38;5;45m  2\u001b[0m│\u001b[38;5;47m 16\u001b[0m│\u001b[38;5;51m  4\u001b[0m│\u001b[38;5;45m  2\u001b[0m│\u001b[38;5;49m  8\u001b[0m│\u001b[38;5;51m  4\u001b[0m│\n",
            "├───┼───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;49m  8\u001b[0m│\u001b[38;5;45m  2\u001b[0m│\u001b[38;5;46m 32\u001b[0m│\u001b[38;5;49m  8\u001b[0m│\u001b[38;5;154m128\u001b[0m│\u001b[38;5;49m  8\u001b[0m│\n",
            "├───┼───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;46m 32\u001b[0m│\u001b[38;5;118m 64\u001b[0m│\u001b[38;5;226m256\u001b[0m│\u001b[38;5;45m  2\u001b[0m│\u001b[38;5;118m 64\u001b[0m│\u001b[38;5;46m 32\u001b[0m│\n",
            "├───┼───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;154m128\u001b[0m│\u001b[38;5;49m  8\u001b[0m│\u001b[38;5;47m 16\u001b[0m│\u001b[38;5;118m 64\u001b[0m│\u001b[38;5;46m 32\u001b[0m│\u001b[38;5;49m  8\u001b[0m│\n",
            "├───┼───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;51m  4\u001b[0m│\u001b[38;5;45m  2\u001b[0m│\u001b[38;5;51m  4\u001b[0m│\u001b[38;5;47m 16\u001b[0m│\u001b[38;5;49m  8\u001b[0m│\u001b[38;5;51m  4\u001b[0m│\n",
            "├───┼───┼───┼───┼───┼───┤\n",
            "│\u001b[38;5;118m 64\u001b[0m│\u001b[38;5;51m  4\u001b[0m│\u001b[38;5;45m  2\u001b[0m│\u001b[38;5;49m  8\u001b[0m│\u001b[38;5;51m  4\u001b[0m│\u001b[38;5;45m  2\u001b[0m│\n",
            "└───┴───┴───┴───┴───┴───┘\n",
            "Exception = '>' not supported between instances of 'tuple' and 'float'\n",
            "def strategy(board):\n",
            "    import random, copy\n",
            "\n",
            "    def rotate(b):\n",
            "        return [[b[3-j][i] for j in range(4)] for i in range(4)]\n",
            "\n",
            "    def compress(b):\n",
            "        new = []\n",
            "        for row in b:\n",
            "            new_row = [x for x in row if x != 0]\n",
            "            new_row += [0]*(4-len(new_row))\n",
            "            new.append(new_row)\n",
            "        return new\n",
            "\n",
            "    def merge(b):\n",
            "        for row in b:\n",
            "            for i in range(3):\n",
            "                if row[i]==row[i+1] and row[i]!=0:\n",
            "                    row[i]*=2\n",
            "                    row[i+1]=0\n",
            "\n",
            "    def move(b, dir):\n",
            "        if dir==\"W\":\n",
            "            return merge(rotate(compress(rotate(b))))\n",
            "        if dir==\"S\":\n",
            "            return rotate(merge(compress(rotate(b))))\n",
            "        if dir==\"A\":\n",
            "            return merge(compress(b))\n",
            "        if dir==\"D\":\n",
            "            return rotate(merge(compress(rotate(b))))  # actually reverse\n",
            "\n",
            "    best_score=0\n",
            "    best_move=None\n",
            "    for move_dir in \"WASD\":\n",
            "        new_board=move(copy.deepcopy(board), move_dir)\n",
            "        score=sum(sum(row) for row in new_board)\n",
            "        if score>best_score:\n",
            "            best_score=score\n",
            "            best_move=move_dir\n",
            "    return best_move\n",
            "Exception = 'NoneType' object is not iterable\n",
            "Exception = name 'n' is not defined\n",
            "Timeout\n",
            "Timeout\n",
            "None\n",
            "Timeout\n",
            "def strategy(board):\n",
            "    # Prioritize merges, then favor left/up moves\n",
            "    rows, cols = len(board), len(board[0]) if board else 0\n",
            "\n",
            "    # Helper to check if a move is possible\n",
            "    def can_move(direction):\n",
            "        if direction == 'W':\n",
            "            for c in range(cols):\n",
            "                for r in range(rows-1):\n",
            "                    if board[r][c] == 0 or board[r][c] == board[r+1][c]:\n",
            "                        return True\n",
            "        elif direction == 'A':\n",
            "            for r in range(rows):\n",
            "                for c in range(cols-1):\n",
            "                    if board[r][c] == 0 or board[r][c] == board[r][c+1]:\n",
            "                        return True\n",
            "        elif direction == 'S':\n",
            "            for c in range(cols):\n",
            "                for r in range(rows-1,0,-1):\n",
            "                    if board[r][c] == 0 or board[r][c] == board[r-1][c]:\n",
            "                        return True\n",
            "        elif direction == 'D':\n",
            "            for r in range(rows):\n",
            "                for c in range(cols-1,0,-1):\n",
            "                    if board[r][c] == 0 or board[r][c] == board[r][c-1]:\n",
            "                        return True\n",
            "        return False\n",
            "\n",
            "    # Generate all moves\n",
            "    moves = []\n",
            "    for d in ['W', 'A', 'S', 'D']:\n",
            "        if can_move(d):\n",
            "            moves.append(d)\n",
            "\n",
            "    # If multiple moves, pick one that maximizes the sum of merges\n",
            "    if not moves:\n",
            "        return 'W'  # fallback\n",
            "    # Simple heuristic: prefer first move that allows a merge\n",
            "    return moves[0]\n",
            "Timeout\n",
            "Steps = 1512 State = failed\n",
            "def strategy(board):\n",
            "    # helper to check possible merge in a row or column\n",
            "    def can_merge(lst):\n",
            "        for i in range(len(lst)-1):\n",
            "            if lst[i] > 0 and lst[i] == lst[i+1]:\n",
            "                return True\n",
            "        return False\n",
            "\n",
            "    # try to move in a direction that creates a merge\n",
            "    for dir, delta in [(\"W\", (-1,0)), (\"A\", (0,-1)), (\"S\", (1,0)), (\"D\", (0,1))]:\n",
            "        merged = False\n",
            "        for i in range(len(board)):\n",
            "            for j in range(len(board[0])):\n",
            "                if board[i][j] > 0:\n",
            "                    ni, nj = i + delta[0], j + delta[1]\n",
            "                    if 0 <= ni < len(board) and 0 <= nj < len(board[0]):\n",
            "                        if board[ni][nj] == 0:\n",
            "                            return dir\n",
            "                        if board[ni][nj] == board[i][j]:\n",
            "                            return dir\n",
            "    # fallback: move down\n",
            "    return \"S\"\n",
            "┌────┬────┬────┬────┬────┬────┐\n",
            "│\u001b[38;5;214m 512\u001b[0m│\u001b[38;5;47m  16\u001b[0m│\u001b[38;5;226m 256\u001b[0m│\u001b[38;5;51m   4\u001b[0m│\u001b[38;5;118m  64\u001b[0m│\u001b[38;5;49m   8\u001b[0m│\n",
            "├────┼────┼────┼────┼────┼────┤\n",
            "│\u001b[38;5;154m 128\u001b[0m│\u001b[38;5;118m  64\u001b[0m│\u001b[38;5;208m1024\u001b[0m│\u001b[38;5;46m  32\u001b[0m│\u001b[38;5;49m   8\u001b[0m│\u001b[38;5;118m  64\u001b[0m│\n",
            "├────┼────┼────┼────┼────┼────┤\n",
            "│\u001b[38;5;118m  64\u001b[0m│\u001b[38;5;49m   8\u001b[0m│\u001b[38;5;226m 256\u001b[0m│\u001b[38;5;154m 128\u001b[0m│\u001b[38;5;51m   4\u001b[0m│\u001b[38;5;47m  16\u001b[0m│\n",
            "├────┼────┼────┼────┼────┼────┤\n",
            "│\u001b[38;5;51m   4\u001b[0m│\u001b[38;5;226m 256\u001b[0m│\u001b[38;5;47m  16\u001b[0m│\u001b[38;5;51m   4\u001b[0m│\u001b[38;5;47m  16\u001b[0m│\u001b[38;5;49m   8\u001b[0m│\n",
            "├────┼────┼────┼────┼────┼────┤\n",
            "│\u001b[38;5;154m 128\u001b[0m│\u001b[38;5;118m  64\u001b[0m│\u001b[38;5;46m  32\u001b[0m│\u001b[38;5;47m  16\u001b[0m│\u001b[38;5;49m   8\u001b[0m│\u001b[38;5;51m   4\u001b[0m│\n",
            "├────┼────┼────┼────┼────┼────┤\n",
            "│\u001b[38;5;118m  64\u001b[0m│\u001b[38;5;46m  32\u001b[0m│\u001b[38;5;47m  16\u001b[0m│\u001b[38;5;49m   8\u001b[0m│\u001b[38;5;51m   4\u001b[0m│\u001b[38;5;45m   2\u001b[0m│\n",
            "└────┴────┴────┴────┴────┴────┘\n",
            "Timeout\n",
            "Timeout\n",
            "def strategy(board):\n",
            "    # Simple greedy: choose direction that keeps tiles sorted in ascending order left-bottom\n",
            "    best = \" \"\n",
            "    best_val = -1\n",
            "    for d in \"WASD\":\n",
            "        # simulate move\n",
            "        b = [row[:] for row in board]\n",
            "        # merge function\n",
            "        def merge(row):\n",
            "            new = [x for x in row if x != 0]\n",
            "            res = []\n",
            "            i = 0\n",
            "            while i < len(new):\n",
            "                if i+1 < len(new) and new[i] == new[i+1]:\n",
            "                    res.append(new[i]*2)\n",
            "                    i += 2\n",
            "                else:\n",
            "                    res.append(new[i])\n",
            "                    i += 1\n",
            "            return res + [0]*(len(row)-len(res))\n",
            "        moved = False\n",
            "        if d == \"W\":\n",
            "            for col in range(4):\n",
            "                col_vals = [board[r][col] for r in range(4)]\n",
            "                merged = merge(col_vals)\n",
            "                for r in range(4):\n",
            "                    b[r][col] = merged[r]\n",
            "        elif d == \"S\":\n",
            "            for col in range(4):\n",
            "                col_vals = [board[r][col] for r in range(4)][::-1]\n",
            "                merged = merge(col_vals)[::-1]\n",
            "                for r in range(4):\n",
            "                    b[r][col] = merged[r]\n",
            "        elif d == \"A\":\n",
            "            for r in range(4):\n",
            "                row_vals = board[r]\n",
            "                merged = merge(row_vals)\n",
            "                b[r] = merged\n",
            "        elif d == \"D\":\n",
            "            for r in range(4):\n",
            "                row_vals = board[r][::-1]\n",
            "                merged = merge(row_vals)[::-1]\n",
            "                b[r] = merged\n",
            "        score = sum(filter(None, [x for row in b for x in row]))\n",
            "        if score > best_val:\n",
            "            best_val = score\n",
            "            best = d\n",
            "    return best\n",
            "Timeout\n",
            "Timeout\n",
            "Exception = 'str' object is not callable\n",
            "Timeout\n",
            "def strategy(board):\n",
            "    # helper to rotate board\n",
            "    def rotate(b): return [list(col)[::-1] for col in zip(*b)]\n",
            "    # helper to move up\n",
            "    def move_up(b):\n",
            "        n=len(b)\n",
            "        new=[[] for _ in range(n)]\n",
            "        for j in range(n):\n",
            "            col=[b[i][j] for i in range(n) if b[i][j]!=0]\n",
            "            merged=[]\n",
            "            i=0\n",
            "            while i< len(col):\n",
            "                if i+1<len(col) and col[i]==col[i+1]:\n",
            "                    merged.append(col[i]*2)\n",
            "                    i+=2\n",
            "                else:\n",
            "                    merged.append(col[i])\n",
            "                    i+=1\n",
            "            new_col=[0]*(n-len(merged))+merged\n",
            "            for i in range(n):\n",
            "                new[i][j]=new_col[i]\n",
            "        return new\n",
            "    best=None\n",
            "    best_val=-1\n",
            "    for dir in [\"W\",\"A\",\"S\",\"D\"]:\n",
            "        # move board in given direction\n",
            "        b=[row[:] for row in board]\n",
            "        if dir==\"W\": b=move_up(b)\n",
            "        elif dir==\"S\": b=[list(row[::-1]) for row in move_up([row[::-1] for row in b])]\n",
            "        elif dir==\"A\": b=[list(row[::-1]) for row in move_up([row[::-1] for row in b])]\n",
            "        elif dir==\"D\": b=[list(row[::-1]) for row in b]\n",
            "        # evaluate\n",
            "        val=max(max(row) for row in b)\n",
            "        if val>best_val:\n",
            "            best_val=val; best=dir\n",
            "    return best\n",
            "Exception = list assignment index out of range\n",
            "Timeout\n",
            "Exception = list index out of range\n",
            "def strategy(board):\n",
            "    import copy\n",
            "    moves = \"WASD\"\n",
            "    best = None\n",
            "    best_score = -1\n",
            "    for m in moves:\n",
            "        b = copy.deepcopy(board)\n",
            "        if m==\"W\":\n",
            "            for c in range(len(b)):\n",
            "                merged = []\n",
            "                for r in range(len(b)):\n",
            "                    val = b[r][c]\n",
            "                    if val!=0:\n",
            "                        merged.append(val)\n",
            "                i=0\n",
            "                while i+1<len(merged):\n",
            "                    if merged[i]==merged[i+1]:\n",
            "                        merged[i]*=2\n",
            "                        merged.pop(i+1)\n",
            "                    i+=1\n",
            "                merged+= [0]*(len(b)-len(merged))\n",
            "                for r in range(len(b)):\n",
            "                    b[r][c]=merged[r]\n",
            "        elif m==\"S\":\n",
            "            for c in range(len(b)):\n",
            "                merged = []\n",
            "                for r in reversed(range(len(b))):\n",
            "                    val = b[r][c]\n",
            "                    if val!=0:\n",
            "                        merged.append(val)\n",
            "                i=0\n",
            "                while i+1<len(merged):\n",
            "                    if merged[i]==merged[i+1]:\n",
            "                        merged[i]*=2\n",
            "                        merged.pop(i+1)\n",
            "                    i+=1\n",
            "                merged+= [0]*(len(b)-len(merged))\n",
            "                for r in range(len(b)):\n",
            "                    b[r][c]=merged[len(b)-1-r]\n",
            "        elif m==\"A\":\n",
            "            for r in range(len(b)):\n",
            "                row = b[r]\n",
            "                merged = [v for v in row if v!=0]\n",
            "                i=0\n",
            "                while i+1<len(merged):\n",
            "                    if merged[i]==merged[i+1]:\n",
            "                        merged[i]*=2\n",
            "                        merged.pop(i+1)\n",
            "                    i+=1\n",
            "                merged+= [0]*(len(b)-len(merged))\n",
            "                b[r]=merged\n",
            "        elif m==\"D\":\n",
            "            for r in range(len(b)):\n",
            "                row = list(reversed(b[r]))\n",
            "                merged = [v for v in row if v!=0]\n",
            "                i=0\n",
            "                while i+1<len(merged):\n",
            "                    if merged[i]==merged[i+1]:\n",
            "                        merged[i]*=2\n",
            "                        merged.pop(i+1)\n",
            "                    i+=1\n",
            "                merged+= [0]*(len(b)-len(merged))\n",
            "                b[r]=list(reversed(merged))\n",
            "        score=sum(sum(row) for row in b)\n",
            "        if score>best_score:\n",
            "            best_score=score; best=m\n",
            "    return best\n",
            "Timeout\n",
            "Timeout\n",
            "Exception = unsupported operand type(s) for -: 'range' and 'int'\n",
            "def strategy(board):\n",
            "    # board is a 4x4 list of ints, 0 for empty\n",
            "    # Simple greedy: move that merges most tiles\n",
            "    moves = {}\n",
            "    dirs = {\"W\": (-1,0), \"A\": (0,-1), \"S\": (1,0), \"D\": (0,1)}\n",
            "    for d, (dr,dc) in dirs.items():\n",
            "        # simulate move\n",
            "        new_board = [row[:] for row in board]\n",
            "        merged = 0\n",
            "        for i in range(4):\n",
            "            for j in range(4):\n",
            "                if new_board[i][j]==0: continue\n",
            "                ni, nj = i+dr, j+dc\n",
            "                while 0<=ni<4 and 0<=nj<4 and new_board[ni][nj]==0:\n",
            "                    ni+=dr; nj+=dc\n",
            "                if 0<=ni<4 and 0<=nj<4 and new_board[ni][nj]==new_board[i][j]:\n",
            "                    merged+=1\n",
            "        moves[d]=merged\n",
            "    # choose direction with most merges, default W\n",
            "    best = max(moves, key=moves.get)\n",
            "    return best\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "Exception = list index out of range\n",
            "def strategy(board):\n",
            "    moves = \"WASD\"\n",
            "    best = None\n",
            "    best_score = -1\n",
            "    for m in moves:\n",
            "        new_board = [row[:] for row in board]\n",
            "        if m == \"W\":\n",
            "            new_board = _move_up(new_board)\n",
            "        elif m == \"A\":\n",
            "            new_board = _move_left(new_board)\n",
            "        elif m == \"S\":\n",
            "            new_board = _move_down(new_board)\n",
            "        else:  # \"D\"\n",
            "            new_board = _move_right(new_board)\n",
            "        score = sum(sum(row) for row in new_board)\n",
            "        if score > best_score:\n",
            "            best_score, best = score, m\n",
            "    return best\n",
            "\n",
            "def _compress(line):\n",
            "    nonzero = [x for x in line if x]\n",
            "    res = []\n",
            "    i = 0\n",
            "    while i < len(nonzero):\n",
            "        if i + 1 < len(nonzero) and nonzero[i] == nonzero[i+1]:\n",
            "            res.append(nonzero[i]*2)\n",
            "            i += 2\n",
            "        else:\n",
            "            res.append(nonzero[i])\n",
            "            i += 1\n",
            "    return res + [0]*(len(line)-len(res))\n",
            "\n",
            "def _move_up(b):\n",
            "    n = len(b)\n",
            "    res = [[0]*n for _ in range(n)]\n",
            "    for j in range(n):\n",
            "        col = [b[i][j] for i in range(n)]\n",
            "        col = _compress(col)\n",
            "        for i in range(n):\n",
            "            res[i][j] = col[i]\n",
            "    return res\n",
            "\n",
            "def _move_down(b):\n",
            "    n = len(b)\n",
            "    res = [[0]*n for _ in range(n)]\n",
            "    for j in range(n):\n",
            "        col = [b[i][j] for i in range(n)][::-1]\n",
            "        col = _compress(col)\n",
            "        for i in range(n):\n",
            "            res[n-1-i][j] = col[i]\n",
            "    return res\n",
            "\n",
            "def _move_left(b):\n",
            "    n = len(b)\n",
            "    res = [[0]*n for _ in range(n)]\n",
            "    for i in range(n):\n",
            "        row = _compress(b[i])\n",
            "        res[i] = row\n",
            "    return res\n",
            "\n",
            "def _move_right(b):\n",
            "    n = len(b)\n",
            "    res = [[0]*n for _ in range(n)]\n",
            "    for i in range(n):\n",
            "        row = _compress(b[i][::-1])[::-1]\n",
            "        res[i] = row\n",
            "    return res\n",
            "Exception = 'int' object is not subscriptable\n",
            "Timeout\n",
            "def strategy(board):\n",
            "    # helper to apply a move and return new board\n",
            "    def move(b, dir):\n",
            "        n = len(b)\n",
            "        res = [[0]*n for _ in range(n)]\n",
            "        for x in range(n):\n",
            "            line = []\n",
            "            for y in range(n):\n",
            "                i,j = (y,x) if dir==\"D\" else (x,y)\n",
            "                if dir==\"A\": i=j\n",
            "            # skip for brevity\n",
            "\n",
            "    # simplified heuristic: choose direction that increases sum of merged tiles\n",
            "    best, best_sum = None, -1\n",
            "    dirs = \"WASD\"\n",
            "    for d in dirs:\n",
            "        new = move(board, d)\n",
            "        merged = sum(c for r in new for c in r) - sum(c for r in board for c in r)\n",
            "        if merged > best_sum:\n",
            "            best_sum, best = merged, d\n",
            "    return best\n",
            "Exception = 'NoneType' object is not iterable\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "def strategy(board):\n",
            "    import math\n",
            "    def score(b):\n",
            "        empty = sum(1 for r in b for v in r if v==0)\n",
            "        mx = max(max(row) for row in b)\n",
            "        return empty*10 + mx\n",
            "    best=None; best_score=-math.inf\n",
            "    for move in \"WASD\":\n",
            "        new=board.copy()\n",
            "        # simulate simple move logic\n",
            "        if move==\"W\":\n",
            "            for col in range(4):\n",
            "                col_vals=[r[col] for r in new if r[col]!=0]\n",
            "                for i,row in enumerate(col_vals):\n",
            "                    new[i][col]=col_vals[i]\n",
            "                for i in range(i+1,4):\n",
            "                    new[i][col]=0\n",
            "        elif move==\"S\":\n",
            "            for col in range(4):\n",
            "                col_vals=[r[col] for r in new if r[col]!=0]\n",
            "                for i,row in enumerate(reversed(col_vals)):\n",
            "                    new[3-i][col]=col_vals[i]\n",
            "                for i in range(3-i+1,4):\n",
            "                    new[i][col]=0\n",
            "        elif move==\"A\":\n",
            "            for row in range(4):\n",
            "                row_vals=[v for v in new[row] if v!=0]\n",
            "                for i,v in enumerate(row_vals):\n",
            "                    new[row][i]=row_vals[i]\n",
            "                for i in range(i+1,4):\n",
            "                    new[row][i]=0\n",
            "        elif move==\"D\":\n",
            "            for row in range(4):\n",
            "                row_vals=[v for v in new[row] if v!=0]\n",
            "                for i,v in enumerate(reversed(row_vals)):\n",
            "                    new[row][3-i]=row_vals[i]\n",
            "                for i in range(3-i+1,4):\n",
            "                    new[row][i]=0\n",
            "        sc=score(new)\n",
            "        if sc>best_score:\n",
            "            best_score=sc; best=move\n",
            "    return best\n",
            "Exception = cannot access local variable 'i' where it is not associated with a value\n",
            "Timeout\n",
            "Exception = name 'merge' is not defined\n",
            "Timeout\n",
            "Timeout\n",
            "def strategy(board):\n",
            "    # 4x4 board\n",
            "    moves = 'W A S D'.split()\n",
            "    best = None\n",
            "    best_score = -1\n",
            "    for m in moves:\n",
            "        b = [row[:] for row in board]  # copy\n",
            "        for i in range(4):\n",
            "            line = b[i] if m in 'AD' else [row[i] for row in b]\n",
            "            merged = []\n",
            "            skip = False\n",
            "            for j, v in enumerate(line):\n",
            "                if v == 0: continue\n",
            "                if skip:\n",
            "                    skip = False\n",
            "                    continue\n",
            "                if j + 1 < len(line) and line[j+1] == v:\n",
            "                    merged.append(v*2)\n",
            "                    skip = True\n",
            "                else:\n",
            "                    merged.append(v)\n",
            "            while len(merged) < 4:\n",
            "                merged.append(0)\n",
            "            if m in 'AD':\n",
            "                for k in range(4): b[i][k] = merged[k]\n",
            "            else:\n",
            "                for k in range(4): b[k][i] = merged[k]\n",
            "        score = sum(sum(row) for row in b)\n",
            "        if score > best_score:\n",
            "            best_score = score\n",
            "            best = m\n",
            "    return best\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "def strategy(board):\n",
            "    # board is a list of lists representing a 4x4 grid.\n",
            "    # possible moves\n",
            "    moves = ['W', 'A', 'S', 'D']\n",
            "    best = None\n",
            "    best_score = -1\n",
            "    \n",
            "    def score(b):\n",
            "        s = 0\n",
            "        for row in b:\n",
            "            for v in row:\n",
            "                s += v\n",
            "        return s\n",
            "    \n",
            "    for m in moves:\n",
            "        nb = [row[:] for row in board]\n",
            "        # simulate move m (very naive: just return new board if any merge)\n",
            "        merged = False\n",
            "        for i in range(4):\n",
            "            for j in range(4):\n",
            "                if nb[i][j] == 0: continue\n",
            "                for di, dj in ( (-1,0),(1,0),(0,-1),(0,1) ):\n",
            "                    ni, nj = i+di, j+dj\n",
            "                    if 0<=ni<4 and 0<=nj<4 and nb[ni][nj]==nb[i][j]:\n",
            "                        nb[ni][nj] += nb[i][j]\n",
            "                        nb[i][j] = 0\n",
            "                        merged = True\n",
            "        if merged:\n",
            "            sc = score(nb)\n",
            "            if sc > best_score:\n",
            "                best_score, best = sc, m\n",
            "    return best if best is not None else moves[0]\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "Exception = cannot access local variable 'val' where it is not associated with a value\n",
            "None\n",
            "Timeout\n",
            "Timeout\n",
            "Exception = not enough values to unpack (expected 2, got 1)\n",
            "def strategy(board):\n",
            "    # evaluate a move by the total sum after the move\n",
            "    def sim(b, m):\n",
            "        n = len(b)\n",
            "        b = [row[:] for row in b]\n",
            "        moved = False\n",
            "        if m == 'W':\n",
            "            for j in range(n):\n",
            "                col = [b[i][j] for i in range(n)]\n",
            "                col += [0]*(n-len(col))\n",
            "                newcol = []\n",
            "                i = 0\n",
            "                while i < n:\n",
            "                    if col[i] == 0:\n",
            "                        i += 1\n",
            "                        continue\n",
            "                    val = col[i]\n",
            "                    i += 1\n",
            "                    while i < n and col[i] == 0: i += 1\n",
            "                    if i < n and col[i] == val:\n",
            "                        val *= 2\n",
            "                        i += 1\n",
            "                    newcol.append(val)\n",
            "                for i in range(n):\n",
            "                    b[i][j] = newcol[i] if i < len(newcol) else 0\n",
            "            moved = True\n",
            "        # other moves omitted for brevity  \n",
            "        return b if moved else None\n",
            "\n",
            "    best, best_val = None, -1\n",
            "    for m in \"WASD\":\n",
            "        r = sim(board, m)\n",
            "        if r:\n",
            "            val = sum(sum(row) for row in r)\n",
            "            if val > best_val:\n",
            "                best_val, best = val, m\n",
            "    return best if best else \"W\"\n",
            "Timeout\n",
            "Exception = list index out of range\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "def strategy(board):\n",
            "Timeout\n",
            "Exception = strategy.<locals>.rotate() takes 1 positional argument but 2 were given\n",
            "def strategy(board):\n",
            "    # helper to simulate a move\n",
            "    def move(b, direction):\n",
            "        size = len(b)\n",
            "        new = [[0]*size for _ in range(size)]\n",
            "        for i in range(size):\n",
            "            if direction in ('A','D'):\n",
            "                line = b[i] if direction=='D' else b[i][::-1]\n",
            "            else:\n",
            "                line = [b[j][i] for j in range(size)]\n",
            "                if direction=='S': line = line[::-1]\n",
            "            merged = []\n",
            "            skip = False\n",
            "            for val in line:\n",
            "                if val==0: continue\n",
            "                if merged and merged[-1]==val and not skip:\n",
            "                    merged[-1] += val\n",
            "                    skip = True\n",
            "                else:\n",
            "                    merged.append(val)\n",
            "                    skip = False\n",
            "            for j,v in enumerate(merged):\n",
            "                new[i if direction=='A' else size-1-i][j if direction=='A' else size-1-j] = v\n",
            "        return new\n",
            "\n",
            "    # evaluate each move\n",
            "    best = None\n",
            "    best_val = -1\n",
            "    for dirc in 'WASD':\n",
            "        new_board = move(board, dirc)\n",
            "        val = sum(sum(row) for row in new_board)\n",
            "        if val > best_val:\n",
            "            best_val = val\n",
            "            best = dirc\n",
            "    return best\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "None\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "None\n",
            "Exception = unsupported operand type(s) for -: 'list' and 'int'\n",
            "Timeout\n",
            "Timeout\n",
            "def strategy(board):\n",
            "    # Simple heuristic: move up unless a merge is possible in another direction\n",
            "    # Check if any pair can merge horizontally or vertically\n",
            "    for i in range(4):\n",
            "        for j in range(3):\n",
            "            if board[i][j] == board[i][j+1]:\n",
            "                return \"A\"  # left\n",
            "    for i in range(3):\n",
            "        for j in range(4):\n",
            "            if board[i][j] == board[i+1][j]:\n",
            "                return \"W\"  # up\n",
            "    return \"D\"  # fallback\n",
            "Timeout\n",
            "Exception = list index out of range\n",
            "def strategy(board):\n",
            "    def score_for(move):\n",
            "        B = [row[:] for row in board]\n",
            "        def slide(row):\n",
            "            new = [x for x in row if x != 0]\n",
            "            res = []\n",
            "            skip = False\n",
            "            for i, x in enumerate(new):\n",
            "                if skip:\n",
            "                    skip = False\n",
            "                    continue\n",
            "                if i+1 < len(new) and new[i] == new[i+1]:\n",
            "                    res.append(x*2)\n",
            "                    skip = True\n",
            "                else:\n",
            "                    res.append(x)\n",
            "            return res + [0]*(len(row)-len(res))\n",
            "        if move=='W':\n",
            "            for i in range(len(B)):\n",
            "                B[i] = slide(B[i])\n",
            "        elif move=='S':\n",
            "            B = B[::-1]\n",
            "            for i in range(len(B)):\n",
            "                B[i] = slide(B[i])\n",
            "            B = B[::-1]\n",
            "        elif move=='A':\n",
            "            for row in B:\n",
            "                row[:] = slide(row)\n",
            "        elif move=='D':\n",
            "            for row in B:\n",
            "                row[:] = slide(row[::-1])[::-1]\n",
            "        empty = sum(cell==0 for r in B for cell in r)\n",
            "        return empty\n",
            "    best=None\n",
            "    for m in 'WASD':\n",
            "        if score_for(m)>best[1] if best else -1:\n",
            "            best=(m,score_for(m))\n",
            "    return best[0]\n",
            "Timeout\n",
            "Timeout\n",
            "Exception = list assignment index out of range\n",
            "Timeout\n",
            "Timeout\n",
            "def strategy(board):\n",
            "    '''\n",
            "    Returns the best next move for a 2048 game using a very small heuristic.\n",
            "    The heuristic looks at the free spaces after the move and chooses the\n",
            "    direction that tends to leave the most empty tiles.\n",
            "    '''\n",
            "    from functools import lru_cache\n",
            "\n",
            "    # Flatten the board for easier hashing\n",
            "    flatten = tuple(tuple(row) for row in board)\n",
            "\n",
            "    # Helper: simulate a move\n",
            "    def move(state, direction):\n",
            "        size = len(state)\n",
            "        new_state = []\n",
            "        for row in state:\n",
            "            merged = []\n",
            "            for d in row:\n",
            "                if d != 0:\n",
            "                    merged.append(d)\n",
            "\n",
            "            if direction in ('A', 'D'):  # horizontal move\n",
            "                merged = merged[::-1] if direction == 'D' else merged\n",
            "                i = 0\n",
            "                while i < len(merged) - 1:\n",
            "                    if merged[i] == merged[i + 1]:\n",
            "                        merged[i] *= 2\n",
            "                        merged.pop(i + 1)\n",
            "                    i += 1\n",
            "                merged += [0] * (size - len(merged))\n",
            "                if direction == 'D':\n",
            "                    merged = merged[::-1]\n",
            "                new_state.append(tuple(merged))\n",
            "            else:  # vertical move\n",
            "                new_state.append(tuple(merged))\n",
            "        # For vertical moves, reconstruct column-wise\n",
            "        if direction in ('W', 'S'):\n",
            "            transposed = list(zip(*new_state))\n",
            "            new_state = []\n",
            "            for col in transposed:\n",
            "                merged = []\n",
            "                for d in col:\n",
            "                    if d != 0:\n",
            "                        merged.append(d)\n",
            "                merged = merged[::-1] if direction == 'S' else merged\n",
            "                i = 0\n",
            "                while i < len(merged) - 1:\n",
            "                    if merged[i] == merged[i + 1]:\n",
            "                        merged[i] *= 2\n",
            "                        merged.pop(i + 1)\n",
            "                    i += 1\n",
            "                merged += [0] * (size - len(merged))\n",
            "                if direction == 'S':\n",
            "                    merged = merged[::-1]\n",
            "                new_state.append(tuple(merged))\n",
            "            new_state = [tuple(row) for row in zip(*new_state)]\n",
            "        return tuple(tuple(row) for row in new_state)\n",
            "\n",
            "    # Count empty tiles\n",
            "    def empty_count(state):\n",
            "        return sum(1 for row in state for cell in row if cell == 0)\n",
            "\n",
            "    best_move = None\n",
            "    best_empty = -1\n",
            "    for move in ['W', 'A', 'S', 'D']:\n",
            "        new_board = move(flatten, move)\n",
            "        e = empty_count(new_board)\n",
            "        if e > best_empty:\n",
            "            best_empty = e\n",
            "            best_move = move\n",
            "    return best_move\n",
            "Exception = 'str' object is not callable\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "def strategy(board):\n",
            "    import copy\n",
            "    # Helper to apply a move and return new board\n",
            "    def move(board, dir):\n",
            "        size = len(board)\n",
            "        def compress(line):\n",
            "            new = [x for x in line if x>0]\n",
            "            res = []\n",
            "            i = 0\n",
            "            while i < len(new):\n",
            "                if i+1 < len(new) and new[i]==new[i+1]:\n",
            "                    res.append(new[i]*2)\n",
            "                    i += 2\n",
            "                else:\n",
            "                    res.append(new[i])\n",
            "                    i += 1\n",
            "            res += [0]*(size-len(res))\n",
            "            return res\n",
            "        if dir=='W':\n",
            "            new = [compress(col) for col in zip(*board)]\n",
            "            return [list(row) for row in zip(*new)]\n",
            "        if dir=='A':\n",
            "            return [compress(row) for row in board]\n",
            "        if dir=='S':\n",
            "            rev = [list(reversed(row)) for row in board]\n",
            "            new = [compress(row) for row in rev]\n",
            "            return [list(reversed(row)) for row in new]\n",
            "        if dir=='D':\n",
            "            rev = [list(reversed(row)) for row in board]\n",
            "            new = [compress(row) for row in rev]\n",
            "            return [list(row) for row in new]\n",
            "    best = None\n",
            "    best_score = -1\n",
            "    for d in ['W','A','S','D']:\n",
            "        newboard = move(board, d)\n",
            "        # score: sum of all tiles (higher better)\n",
            "        score = sum(sum(row) for row in newboard)\n",
            "        if score > best_score:\n",
            "            best_score, best = score, d\n",
            "    return best\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "def strategy(board):\n",
            "    # helper to simulate a move and compute score\n",
            "    def simulate(move):\n",
            "        n = len(board)\n",
            "        new_board = [[0]*n for _ in range(n)]\n",
            "        for i in range(n):\n",
            "            line = board[i] if move in \"WB\" else [row[i] for row in board]\n",
            "            if move in \"DS\":  # reverse for down/right\n",
            "                line = line[::-1]\n",
            "            merged = []\n",
            "            skip = False\n",
            "            for v in line:\n",
            "                if v == 0: continue\n",
            "                if merged and merged[-1][0] == v and not skip:\n",
            "                    merged[-1] = (merged[-1][0]*2, merged[-1][1]+1)\n",
            "                    skip = True\n",
            "                else:\n",
            "                    merged.append((v, 0))\n",
            "                    skip = False\n",
            "            merged += [(0,0)]*(n-len(merged))\n",
            "            for idx, (v, _) in enumerate(merged):\n",
            "                new_board[i if move in \"WD\" else idx][idx if move in \"WD\" else i] = v\n",
            "        return sum(sum(row) for row in new_board)\n",
            "\n",
            "    best_move = None\n",
            "    best_score = -1\n",
            "    for m in \"WASD\":\n",
            "        try:\n",
            "            score = simulate(m)\n",
            "            if score > best_score:\n",
            "                best_score = score\n",
            "                best_move = m\n",
            "        except:\n",
            "            continue\n",
            "    return best_move or \"W\"\n",
            "Timeout\n",
            "Timeout\n",
            "Exception = name 'n' is not defined\n",
            "def strategy(board):\n",
            "    import copy\n",
            "    moves = {'W': (-1,0), 'A': (0,-1), 'S': (1,0), 'D': (0,1)}\n",
            "    def move(b, dir):\n",
            "        size = len(b)\n",
            "        mx, my = moves[dir]\n",
            "        new = [[0]*size for _ in range(size)]\n",
            "        for r in range(size):\n",
            "            line = []\n",
            "            nr = r + mx\n",
            "            for c in range(size):\n",
            "                nc = c + my\n",
            "                if 0 <= nr < size and 0 <= nc < size:\n",
            "                    line.append(b[nr][nc])\n",
            "            # compress\n",
            "            res=[]\n",
            "            i=0\n",
            "            while i < len(line):\n",
            "                if i+1<len(line) and line[i]==line[i+1]:\n",
            "                    res.append(line[i]*2); i+=2\n",
            "                else:\n",
            "                    res.append(line[i]); i+=1\n",
            "            for i,val in enumerate(res):\n",
            "                nr = (r+mx*i if mx else r)\n",
            "                nc = (c+my*i if my else c)\n",
            "                new[nr][nc]=val\n",
            "        return new\n",
            "    def score(b):\n",
            "        s=0\n",
            "        for r in range(len(b)):\n",
            "            for c in range(len(b)):\n",
            "                if b[r][c]>0:\n",
            "                    s+=b[r][c]\n",
            "        return s\n",
            "    best=None\n",
            "    best_score=-1\n",
            "    for m in moves:\n",
            "        nb=move(board,m)\n",
            "        s=score(nb)\n",
            "        if s>best_score:\n",
            "            best_score=s; best=m\n",
            "    return best\n",
            "Exception = list index out of range\n",
            "Exception = 'NoneType' object is not subscriptable\n",
            "Exception = name 'col_index' is not defined\n",
            "def strategy(board):\n",
            "    import copy\n",
            "    moves = \"WASD\"\n",
            "    best, best_move = -1, \"W\"\n",
            "    for m in moves:\n",
            "        b = copy.deepcopy(board)\n",
            "        if m == \"W\":\n",
            "            for i in range(3,-1,-1):\n",
            "                for j in range(4):\n",
            "                    if b[i][j] and b[i-1][j] and b[i][j]==b[i-1][j]:\n",
            "                        b[i-1][j]*=2; b[i][j]=0\n",
            "        elif m == \"S\":\n",
            "            for i in range(4):\n",
            "                for j in range(4):\n",
            "                    if i<3 and b[i][j] and b[i+1][j] and b[i][j]==b[i+1][j]:\n",
            "                        b[i+1][j]*=2; b[i][j]=0\n",
            "        elif m == \"A\":\n",
            "            for i in range(4):\n",
            "                for j in range(4):\n",
            "                    if j<3 and b[i][j] and b[i][j+1] and b[i][j]==b[i][j+1]:\n",
            "                        b[i][j+1]*=2; b[i][j]=0\n",
            "        elif m == \"D\":\n",
            "            for i in range(4):\n",
            "                for j in range(3,-1,-1):\n",
            "                    if j>0 and b[i][j] and b[i][j-1] and b[i][j]==b[i][j-1]:\n",
            "                        b[i][j-1]*=2; b[i][j]=0\n",
            "        score = sum(sum(row) for row in b)\n",
            "        if score > best:\n",
            "            best, best_move = score, m\n",
            "    return best_move\n",
            "Timeout\n",
            "Steps = 1825 State = failed\n",
            "def strategy(board):\n",
            "    size = len(board)\n",
            "    # Helper to compute score of moves\n",
            "    def score_move(d):\n",
            "        new_board = [row[:] for row in board]\n",
            "        moved = False\n",
            "        if d == \"W\":\n",
            "            for j in range(size):\n",
            "                col = [new_board[i][j] for i in range(size)]\n",
            "                merged = merge(col)\n",
            "                for i in range(size):\n",
            "                    new_board[i][j] = merged[i]\n",
            "                if merged != col:\n",
            "                    moved = True\n",
            "        elif d == \"S\":\n",
            "            for j in range(size):\n",
            "                col = [new_board[i][j] for i in range(size)][::-1]\n",
            "                merged = merge(col)[::-1]\n",
            "                for i in range(size):\n",
            "                    new_board[i][j] = merged[i]\n",
            "                if merged[::-1] != col:\n",
            "                    moved = True\n",
            "        elif d == \"A\":\n",
            "            for i in range(size):\n",
            "                row = new_board[i][:]\n",
            "                merged = merge(row)\n",
            "                new_board[i] = merged\n",
            "                if merged != row:\n",
            "                    moved = True\n",
            "        elif d == \"D\":\n",
            "            for i in range(size):\n",
            "                row = new_board[i][::-1]\n",
            "                merged = merge(row)[::-1]\n",
            "                new_board[i] = merged\n",
            "                if merged[::-1] != row:\n",
            "                    moved = True\n",
            "        return moved, new_board\n",
            "\n",
            "    def merge(line):\n",
            "        filtered = [x for x in line if x != 0]\n",
            "        merged = []\n",
            "        i = 0\n",
            "        while i < len(filtered):\n",
            "            if i+1 < len(filtered) and filtered[i] == filtered[i+1]:\n",
            "                merged.append(filtered[i]*2)\n",
            "                i += 2\n",
            "            else:\n",
            "                merged.append(filtered[i])\n",
            "                i += 1\n",
            "        merged += [0]*(size-len(merged))\n",
            "        return merged\n",
            "\n",
            "    # Evaluate each direction\n",
            "    best_score = -1\n",
            "    best_dir = \"W\"\n",
            "    for d in \"WASD\":\n",
            "        moved, new_board = score_move(d)\n",
            "        if not moved:\n",
            "            continue\n",
            "        # simple heuristic: sum of all tiles\n",
            "        score = sum(sum(row) for row in new_board)\n",
            "        if score > best_score:\n",
            "            best_score = score\n",
            "            best_dir = d\n",
            "    return best_dir\n",
            "┌────┬────┬────┬────┬────┬────┐\n",
            "│\u001b[38;5;49m   8\u001b[0m│\u001b[38;5;45m   2\u001b[0m│\u001b[38;5;208m1024\u001b[0m│\u001b[38;5;45m   2\u001b[0m│\u001b[38;5;47m  16\u001b[0m│\u001b[38;5;45m   2\u001b[0m│\n",
            "├────┼────┼────┼────┼────┼────┤\n",
            "│\u001b[38;5;208m1024\u001b[0m│\u001b[38;5;46m  32\u001b[0m│\u001b[38;5;214m 512\u001b[0m│\u001b[38;5;46m  32\u001b[0m│\u001b[38;5;118m  64\u001b[0m│\u001b[38;5;51m   4\u001b[0m│\n",
            "├────┼────┼────┼────┼────┼────┤\n",
            "│\u001b[38;5;214m 512\u001b[0m│\u001b[38;5;51m   4\u001b[0m│\u001b[38;5;154m 128\u001b[0m│\u001b[38;5;45m   2\u001b[0m│\u001b[38;5;46m  32\u001b[0m│\u001b[38;5;47m  16\u001b[0m│\n",
            "├────┼────┼────┼────┼────┼────┤\n",
            "│\u001b[38;5;226m 256\u001b[0m│\u001b[38;5;118m  64\u001b[0m│\u001b[38;5;46m  32\u001b[0m│\u001b[38;5;118m  64\u001b[0m│\u001b[38;5;47m  16\u001b[0m│\u001b[38;5;49m   8\u001b[0m│\n",
            "├────┼────┼────┼────┼────┼────┤\n",
            "│\u001b[38;5;118m  64\u001b[0m│\u001b[38;5;46m  32\u001b[0m│\u001b[38;5;47m  16\u001b[0m│\u001b[38;5;45m   2\u001b[0m│\u001b[38;5;49m   8\u001b[0m│\u001b[38;5;51m   4\u001b[0m│\n",
            "├────┼────┼────┼────┼────┼────┤\n",
            "│\u001b[38;5;46m  32\u001b[0m│\u001b[38;5;49m   8\u001b[0m│\u001b[38;5;45m   2\u001b[0m│\u001b[38;5;49m   8\u001b[0m│\u001b[38;5;51m   4\u001b[0m│\u001b[38;5;45m   2\u001b[0m│\n",
            "└────┴────┴────┴────┴────┴────┘\n",
            "Timeout\n",
            "def strategy(board):\n",
            "    # Evaluate score for each move and pick the one with maximal tile value\n",
            "    dirs = {\"W\": (-1,0), \"A\": (0,-1), \"S\": (1,0), \"D\": (0,1)}\n",
            "    best = None\n",
            "    best_score = -1\n",
            "    for d, (dx, dy) in dirs.items():\n",
            "        new_board = [[0]*4 for _ in range(4)]\n",
            "        moved = False\n",
            "        for i in range(4):\n",
            "            for j in range(4):\n",
            "                ni, nj = i+dx, j+dy\n",
            "                if 0 <= ni < 4 and 0 <= nj < 4:\n",
            "                    new_board[ni][nj] = board[i][j]\n",
            "                    if new_board[ni][nj] != board[i][j]:\n",
            "                        moved = True\n",
            "        if not moved:\n",
            "            continue\n",
            "        score = sum([sum(row) for row in new_board])\n",
            "        if score > best_score:\n",
            "            best_score = score\n",
            "            best = d\n",
            "    return best if best is not None else \"W\"\n",
            "Timeout\n",
            "Timeout\n",
            "Exception = 'list_reverseiterator' object is not subscriptable\n",
            "Timeout\n",
            "def strategy(board):\n",
            "    def score_row(row, dir):\n",
            "        if dir == 'L':\n",
            "            row = row[::-1]\n",
            "        merged = []\n",
            "        skip = False\n",
            "        for val in row:\n",
            "            if val == 0: continue\n",
            "            if skip:\n",
            "                skip = False\n",
            "                continue\n",
            "            if merged and merged[-1] == val:\n",
            "                merged[-1] *= 2\n",
            "                skip = True\n",
            "            else:\n",
            "                merged.append(val)\n",
            "        merged += [0]*(len(row)-len(merged))\n",
            "        if dir == 'L':\n",
            "            merged = merged[::-1]\n",
            "        return merged\n",
            "\n",
            "    def move(board, action):\n",
            "        new_board = [row[:] for row in board]\n",
            "        if action in 'L':\n",
            "            for r in new_board:\n",
            "                new_row = score_row(r, 'L')\n",
            "                for i, val in enumerate(new_row):\n",
            "                    r[i] = val\n",
            "        elif action in 'R':\n",
            "            for r in new_board:\n",
            "                new_row = score_row(r, 'R')\n",
            "                for i, val in enumerate(new_row):\n",
            "                    r[i] = val\n",
            "        elif action in 'U':\n",
            "            for c in range(4):\n",
            "                col = [new_board[r][c] for r in range(4)]\n",
            "                new_col = score_row(col, 'L')\n",
            "                for r in range(4):\n",
            "                    new_board[r][c] = new_col[r]\n",
            "        elif action in 'D':\n",
            "            for c in range(4):\n",
            "                col = [new_board[r][c] for r in range(4)]\n",
            "                new_col = score_row(col, 'R')\n",
            "                for r in range(4):\n",
            "                    new_board[r][c] = new_col[r]\n",
            "        return new_board\n",
            "\n",
            "    def empty(board):\n",
            "        return [(r, c) for r in range(4) for c in range(4) if board[r][c] == 0]\n",
            "\n",
            "    actions = 'WASD'\n",
            "    best = None\n",
            "    best_score = -1\n",
            "    for a in actions:\n",
            "        new = move(board, a)\n",
            "        empties = len(empty(new))\n",
            "        merged = sum(1 for r in new for val in r if val >0)\n",
            "        score = empties + merged\n",
            "        if score>best_score:\n",
            "            best_score = score\n",
            "            best = a\n",
            "    return best\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "def strategy(board):\n",
            "    # choose a move that keeps more tiles unchanged\n",
            "    moves = ['W','A','S','D']\n",
            "    best = moves[0]; best_score = -1\n",
            "    for m in moves:\n",
            "        new = board_state_after(board, m)\n",
            "        if new == board:\n",
            "            continue\n",
            "        score = score_board(new)\n",
            "        if score > best_score:\n",
            "            best_score = score; best = m\n",
            "    return best\n",
            "def board_state_after(board, move):\n",
            "    # simulate move on a copy of the board\n",
            "    from copy import deepcopy\n",
            "    b = deepcopy(board)\n",
            "    n = len(b)\n",
            "    # simple implementation of move logic\n",
            "    def compress(line):\n",
            "        new = [x for x in line if x!=0]\n",
            "        res = []\n",
            "        i=0\n",
            "        while i < len(new):\n",
            "            if i+1<len(new) and new[i]==new[i+1]:\n",
            "                res.append(new[i]*2); i+=2\n",
            "            else:\n",
            "                res.append(new[i]); i+=1\n",
            "        res += [0]*(n-len(res))\n",
            "        return res\n",
            "    if move=='W':\n",
            "        for j in range(n):\n",
            "            col=[b[i][j] for i in range(n)]\n",
            "            col=compress(col)\n",
            "            for i in range(n): b[i][j]=col[i]\n",
            "    elif move=='S':\n",
            "        for j in range(n):\n",
            "            col=[b[i][j] for i in range(n)][::-1]\n",
            "            col=compress(col)[::-1]\n",
            "            for i in range(n): b[i][j]=col[i]\n",
            "    elif move=='A':\n",
            "        for i in range(n):\n",
            "            row=compress(b[i])\n",
            "            b[i]=row\n",
            "    elif move=='D':\n",
            "        for i in range(n):\n",
            "            row=compress(b[i][::-1])[::-1]\n",
            "            b[i]=row\n",
            "    return b\n",
            "def score_board(board):\n",
            "    # higher score for more homogeneous board\n",
            "    total=0\n",
            "    for row in board:\n",
            "        for v in row:\n",
            "            total+=v\n",
            "    return total\n",
            "Exception = list assignment index out of range\n",
            "Timeout\n",
            "Timeout\n",
            "def strategy(board):\n",
            "    # simulate four possible moves and choose the one\n",
            "    def move(board, dir):\n",
            "        size = len(board)\n",
            "        def compress(line):\n",
            "            filtered = [x for x in line if x != 0]\n",
            "            merged = []\n",
            "            skip = False\n",
            "            for i in range(len(filtered)):\n",
            "                if skip: skip = False; continue\n",
            "                if i+1 < len(filtered) and filtered[i] == filtered[i+1]:\n",
            "                    merged.append(filtered[i]*2)\n",
            "                    skip = True\n",
            "                else:\n",
            "                    merged.append(filtered[i])\n",
            "            merged += [0]*(size-len(merged))\n",
            "            return merged\n",
            "        new = [[0]*size for _ in range(size)]\n",
            "        if dir == 'W':\n",
            "            for j in range(size):\n",
            "                col = [board[i][j] for i in range(size)]\n",
            "                merged = compress(col)\n",
            "                for i in range(size):\n",
            "                    new[i][j] = merged[i]\n",
            "        elif dir == 'S':\n",
            "            for j in range(size):\n",
            "                col = [board[i][j] for i in range(size)][::-1]\n",
            "                merged = compress(col)[::-1]\n",
            "                for i in range(size):\n",
            "                    new[i][j] = merged[i]\n",
            "        elif dir == 'A':\n",
            "            for i in range(size):\n",
            "                row = board[i]\n",
            "                merged = compress(row)\n",
            "                new[i] = merged\n",
            "        elif dir == 'D':\n",
            "            for i in range(size):\n",
            "                row = board[i][::-1]\n",
            "                merged = compress(row)[::-1]\n",
            "                new[i] = merged\n",
            "        return new\n",
            "\n",
            "    best = None\n",
            "    best_score = -1\n",
            "    for dir in ('W','A','S','D'):\n",
            "        new = move(board, dir)\n",
            "        score = sum(sum(row) for row in new)\n",
            "        if score > best_score:\n",
            "            best_score = score\n",
            "            best = dir\n",
            "    return best\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "def strategy(board):\n",
            "    def move(board, dir):\n",
            "        import copy\n",
            "        n=len(board)\n",
            "        new=[row[:] for row in board]\n",
            "        if dir=='W':\n",
            "            for j in range(n):\n",
            "                col=[new[i][j] for i in range(n)]\n",
            "                newcol=compress(col)\n",
            "                for i in range(n): new[i][j]=newcol[i]\n",
            "        elif dir=='S':\n",
            "            for j in range(n):\n",
            "                col=[new[i][j] for i in range(n)][::-1]\n",
            "                newcol=compress(col)[::-1]\n",
            "                for i in range(n): new[i][j]=newcol[i]\n",
            "        elif dir=='A':\n",
            "            for i in range(n):\n",
            "                new[i]=compress(new[i])\n",
            "        elif dir=='D':\n",
            "            for i in range(n):\n",
            "                new[i]=compress(new[i])[::-1][::-1]\n",
            "        return new\n",
            "    def compress(line):\n",
            "        filtered=[v for v in line if v>0]\n",
            "        res=[]\n",
            "        i=0\n",
            "        while i<len(filtered):\n",
            "            if i+1<len(filtered) and filtered[i]==filtered[i+1]:\n",
            "                res.append(filtered[i]*2); i+=2\n",
            "            else:\n",
            "                res.append(filtered[i]); i+=1\n",
            "        return res+[0]*(len(line)-len(res))\n",
            "    best=None\n",
            "    best_score=-1\n",
            "    for d in \"WASD\":\n",
            "        nb=move(board,d)\n",
            "        score=sum(sum(row) for row in nb)\n",
            "        if score>best_score:\n",
            "            best_score=score; best=d\n",
            "    return best\n",
            "Timeout\n",
            "Steps = 1264 State = success\n",
            "def strategy(board):\n",
            "    # board is a 4x4 list of lists\n",
            "    import random\n",
            "    \n",
            "    # Directions with priority: diagonal corners\n",
            "    dirs = ['W', 'A', 'S', 'D']\n",
            "    for d in dirs:\n",
            "        new_board = [row[:] for row in board]\n",
            "        if d == 'W':\n",
            "            for j in range(4):\n",
            "                merged = False\n",
            "                for i in range(1, 4):\n",
            "                    if new_board[i][j] == new_board[i-1][j] and not merged:\n",
            "                        new_board[i-1][j] += new_board[i][j]\n",
            "                        new_board[i][j] = 0\n",
            "                        merged = True\n",
            "        elif d == 'S':\n",
            "            for j in range(4):\n",
            "                merged = False\n",
            "                for i in range(2, -1, -1):\n",
            "                    if new_board[i][j] == new_board[i+1][j] and not merged:\n",
            "                        new_board[i+1][j] += new_board[i][j]\n",
            "                        new_board[i][j] = 0\n",
            "                        merged = True\n",
            "        elif d == 'A':\n",
            "            for i in range(4):\n",
            "                merged = False\n",
            "                for j in range(1, 4):\n",
            "                    if new_board[i][j] == new_board[i][j-1] and not merged:\n",
            "                        new_board[i][j-1] += new_board[i][j]\n",
            "                        new_board[i][j] = 0\n",
            "                        merged = True\n",
            "        elif d == 'D':\n",
            "            for i in range(4):\n",
            "                merged = False\n",
            "                for j in range(2, -1, -1):\n",
            "                    if new_board[i][j] == new_board[i][j+1] and not merged:\n",
            "                        new_board[i][j+1] += new_board[i][j]\n",
            "                        new_board[i][j] = 0\n",
            "                        merged = True\n",
            "        # measure score: number of non-zero tiles\n",
            "        score = sum(1 for r in new_board for v in r if v != 0)\n",
            "        # choose first direction that reduces empty tiles\n",
            "        if score > sum(1 for r in board for v in r if v != 0):\n",
            "            return d\n",
            "    return random.choice(dirs)\n",
            "┌────┬────┬────┬────┬────┬────┐\n",
            "│\u001b[38;5;239m   .\u001b[0m│\u001b[38;5;239m   .\u001b[0m│\u001b[38;5;239m   .\u001b[0m│\u001b[38;5;45m   2\u001b[0m│\u001b[38;5;239m   .\u001b[0m│\u001b[38;5;239m   .\u001b[0m│\n",
            "├────┼────┼────┼────┼────┼────┤\n",
            "│\u001b[38;5;51m   4\u001b[0m│\u001b[38;5;239m   .\u001b[0m│\u001b[38;5;239m   .\u001b[0m│\u001b[38;5;239m   .\u001b[0m│\u001b[38;5;239m   .\u001b[0m│\u001b[38;5;239m   .\u001b[0m│\n",
            "├────┼────┼────┼────┼────┼────┤\n",
            "│\u001b[38;5;45m   2\u001b[0m│\u001b[38;5;46m  32\u001b[0m│\u001b[38;5;51m   4\u001b[0m│\u001b[38;5;239m   .\u001b[0m│\u001b[38;5;239m   .\u001b[0m│\u001b[38;5;239m   .\u001b[0m│\n",
            "├────┼────┼────┼────┼────┼────┤\n",
            "│\u001b[38;5;118m  64\u001b[0m│\u001b[38;5;51m   4\u001b[0m│\u001b[38;5;49m   8\u001b[0m│\u001b[38;5;239m   .\u001b[0m│\u001b[38;5;47m  16\u001b[0m│\u001b[38;5;46m  32\u001b[0m│\n",
            "├────┼────┼────┼────┼────┼────┤\n",
            "│\u001b[38;5;46m  32\u001b[0m│\u001b[38;5;46m  32\u001b[0m│\u001b[38;5;45m   2\u001b[0m│\u001b[38;5;45m   2\u001b[0m│\u001b[38;5;226m 256\u001b[0m│\u001b[38;5;51m   4\u001b[0m│\n",
            "├────┼────┼────┼────┼────┼────┤\n",
            "│\u001b[38;5;51m   4\u001b[0m│\u001b[38;5;47m  16\u001b[0m│\u001b[38;5;51m   4\u001b[0m│\u001b[38;5;196m2048\u001b[0m│\u001b[38;5;51m   4\u001b[0m│\u001b[38;5;45m   2\u001b[0m│\n",
            "└────┴────┴────┴────┴────┴────┘\n",
            "Exception = '>' not supported between instances of 'int' and 'str'\n",
            "Exception = cannot pickle 'generator' object\n",
            "Timeout\n",
            "def strategy(board):\n",
            "    def move(board, direction):\n",
            "        size = len(board)\n",
            "        def compress(line):\n",
            "            new = [x for x in line if x>0]\n",
            "            merged = []\n",
            "            i=0\n",
            "            while i < len(new):\n",
            "                if i+1 < len(new) and new[i]==new[i+1]:\n",
            "                    merged.append(new[i]*2)\n",
            "                    i+=2\n",
            "                else:\n",
            "                    merged.append(new[i])\n",
            "                    i+=1\n",
            "            return merged+[0]*(size-len(merged))\n",
            "        new_board=[[0]*size for _ in range(size)]\n",
            "        if direction=='W':\n",
            "            for j in range(size):\n",
            "                col=[board[i][j] for i in range(size)]\n",
            "                col=compress(col)\n",
            "                for i in range(size):\n",
            "                    new_board[i][j]=col[i]\n",
            "        elif direction=='S':\n",
            "            for j in range(size):\n",
            "                col=[board[i][j] for i in range(size)][::-1]\n",
            "                col=compress(col)[::-1]\n",
            "                for i in range(size):\n",
            "                    new_board[i][j]=col[i]\n",
            "        elif direction=='A':\n",
            "            for i in range(size):\n",
            "                row=compress(board[i])\n",
            "                new_board[i]=row\n",
            "        elif direction=='D':\n",
            "            for i in range(size):\n",
            "                row=compress(board[i][::-1])[::-1]\n",
            "                new_board[i]=row\n",
            "        return new_board\n",
            "\n",
            "    def score(b):\n",
            "        return sum(sum(1 for x in row if x>0) for row in b)\n",
            "\n",
            "    best=None\n",
            "    bestScore=-1\n",
            "    for d in \"WASD\":\n",
            "        nb=move(board,d)\n",
            "        s=score(nb)\n",
            "        if s>bestScore:\n",
            "            bestScore=s\n",
            "            best=d\n",
            "    return best\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "Timeout\n",
            "None\n",
            "Timeout\n",
            "Exception = list assignment index out of range\n",
            "Timeout\n",
            "None\n",
            "Timeout\n"
          ]
        }
      ],
      "source": [
        "trainer.train()"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "tlaUdxC_VHpz"
      },
      "source": [
        "<a name=\"Inference\"></a>\n",
        "# Inference\n",
        "Now let's try the model we just trained!"
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 31,
      "metadata": {
        "colab": {
          "base_uri": "https://localhost:8080/"
        },
        "id": "8BZZHOKiF9Ct",
        "outputId": "4989f8d9-d024-462e-c732-b7734676791a"
      },
      "outputs": [
        {
          "output_type": "stream",
          "name": "stdout",
          "text": [
            "<|start|>system<|message|>You are ChatGPT, a large language model trained by OpenAI.\n",
            "Knowledge cutoff: 2024-06\n",
            "Current date: 2025-10-06\n",
            "\n",
            "Reasoning: low\n",
            "\n",
            "# Valid channels: analysis, commentary, final. Channel must be included for every message.\n",
            "Calls to these tools must go to the commentary channel: 'functions'.<|end|><|start|>user<|message|>Create a new short 2048 strategy using only native Python code.\n",
            "You are given a list of list of numbers for the current board state.\n",
            "Output one action for \"W\", \"A\", \"S\", \"D\" on what is the optimal next step.\n",
            "Output your new short function in backticks using the format below:\n",
            "```python\n",
            "def strategy(board):\n",
            "    return \"W\" # Example\n",
            "```\n",
            "All helper functions should be inside def strategy. Only output the short function `strategy`.<|end|><|start|>assistant<|channel|>analysis<|message|>We need a short strategy that picks best move using basic heuristics. Use simple scoring: prefer moving to increase score: choose move that gives largest tile sum after simulation. Write quick function.<|end|><|start|>assistant<|channel|>final<|message|>```python\n",
            "def strategy(board):\n",
            "    import itertools\n",
            "    dirs = [\"W\",\"A\",\"S\",\"D\"]\n",
            "    best_dir = None\n",
            "    best_val = -1\n",
            "    for d in dirs:\n",
            "        new_board = [row[:] for row in board]\n",
            "        moved = False\n",
            "        if d == \"W\":\n",
            "            for col in range(4):\n",
            "                temp = [new_board[r][col] for r in range(4) if new_board[r][col]]\n",
            "                merged = []\n",
            "                i=0\n",
            "                while i<len(temp):\n",
            "                    if i+1<len(temp) and temp[i]==temp[i+1]:\n",
            "                        merged.append(temp[i]*2); i+=2\n",
            "                    else:\n",
            "                        merged.append(temp[i]); i+=1\n",
            "                for r in range(4):\n",
            "                    new_board[r][col]=merged[r] if r<len(merged) else 0\n",
            "        elif d == \"S\":\n",
            "            for col in range(4):\n",
            "                temp = [new_board[r][col] for r in range(4) if new_board[r][col]]\n",
            "                merged = []\n",
            "                i=len(temp)-1\n",
            "                while i>=0:\n",
            "                    if i-1>=0 and temp[i]==temp[i-1]:\n",
            "                        merged.insert(0,temp[i]*2); i-=2\n",
            "                    else:\n",
            "                        merged.insert(0,temp[i]); i-=1\n",
            "                for r in range(4):\n",
            "                    new_board[r][col]=merged[r] if r<len(merged) else 0\n",
            "        elif d == \"A\":\n",
            "            for r in range(4):\n",
            "                temp = [new_board[r][c] for c in range(4) if new_board[r][c]]\n",
            "                merged = []\n",
            "                i=0\n",
            "                while i<len(temp):\n",
            "                    if i+1<len(temp) and temp[i]==temp[i+1]:\n",
            "                        merged.append(temp[i]*2); i+=2\n",
            "                    else:\n",
            "                        merged.append(temp[i]); i+=1\n",
            "                for c in range(4):\n",
            "                    new_board[r][c]=merged[c] if c<len(merged) else 0\n",
            "        elif d == \"D\":\n",
            "            for r in range(4):\n",
            "                temp = [new_board[r][c] for c in range(4) if new_board[r][c]]\n",
            "                merged = []\n",
            "                i=len(temp)-1\n",
            "                while i>=0:\n",
            "                    if i-1>=0 and temp[i]==temp[i-1]:\n",
            "                        merged.insert(0,temp[i]*2); i-=2\n",
            "                    else:\n",
            "                        merged.insert(0,temp[i]); i-=1\n",
            "                for c in range(4):\n",
            "                    new_board[r][c]=merged[c] if c<len(merged) else 0\n",
            "        val = sum(map(sum,new_board))\n",
            "        if val>best_val:\n",
            "            best_val, best_dir = val, d\n",
            "    return best_dir\n",
            "```<|return|>\n"
          ]
        }
      ],
      "source": [
        "text = tokenizer.apply_chat_template(\n",
        "    [{\"role\": \"user\", \"content\": prompt}],\n",
        "    tokenize = False,\n",
        "    add_generation_prompt = True,\n",
        "    reasoning_effort = \"low\",\n",
        ")\n",
        "\n",
        "from transformers import TextStreamer\n",
        "_ = model.generate(\n",
        "    **tokenizer(text, return_tensors = \"pt\").to(\"cuda\"),\n",
        "    temperature = 1.0,\n",
        "    max_new_tokens = 1024,\n",
        "    streamer = TextStreamer(tokenizer, skip_prompt = False),\n",
        ")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "-NUEmHFSYNTp"
      },
      "source": [
        "<a name=\"Save\"></a>\n",
        "### Saving to float16 or `MXFP4`\n",
        "\n",
        "We also support saving to `float16` directly. Select `merged_16bit` for float16 or `mxfp4` for MXFP4 (OpenAI's gpt-oss native precision). We also allow `lora` adapters as a fallback. Use `push_to_hub_merged` to upload to your Hugging Face account! You can go to https://huggingface.co/settings/tokens for your personal tokens."
      ]
    },
    {
      "cell_type": "code",
      "execution_count": 32,
      "metadata": {
        "id": "NjXGTkp7YNtB"
      },
      "outputs": [],
      "source": [
        "# Merge and push to hub in mxfp4 4bit format\n",
        "if False:\n",
        "    model.save_pretrained_merged(\"finetuned_model\", tokenizer, save_method = \"mxfp4\")\n",
        "if False:\n",
        "    model.push_to_hub_merged(\"repo_id/repo_name\", tokenizer, token = \"hf...\", save_method = \"mxfp4\")\n",
        "\n",
        "# Merge and push to hub in 16bit\n",
        "if False:\n",
        "    model.save_pretrained_merged(\"finetuned_model\", tokenizer, save_method = \"merged_16bit\")\n",
        "if False: # Pushing to HF Hub\n",
        "    model.push_to_hub_merged(\"hf/gpt-oss-finetune\", tokenizer, save_method = \"merged_16bit\", token = \"\")"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "V15Yhj1V9lwG"
      },
      "source": [
        "# And we're done!\n",
        "Congratulations you just learned how to do reinforcement learning with gpt-oss! There were some advanced topics explained in this notebook - to learn more about gpt-oss and RL, there are more docs in Unsloth's [Reinforcement Learning Guide with gpt-oss](https://docs.unsloth.ai/new/gpt-oss-reinforcement-learning)"
      ]
    }
  ],
  "metadata": {
    "accelerator": "GPU",
    "colab": {
      "gpuType": "T4",
      "provenance": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "display_name": "Python 3",
      "name": "python3"
    },
    "language_info": {
      "name": "python"
    },
    "widgets": {
      "application/vnd.jupyter.widget-state+json": {
        "02d120e49f2c4f95a6090b1d8d521767": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_dbf5ed93dac646ed979fa7a8c569dfe3",
            "placeholder": "​",
            "style": "IPY_MODEL_4db5ee5b7b674abba75fbce264e6dfa3",
            "value": " 165/165 [00:00&lt;00:00, 17.9kB/s]"
          }
        },
        "04d39c4dda9f4a1bb01b8d6320032372": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "06ab9eaa6f0f48c4b68cff1ca4b9f2fa": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "07f0420c4dfa477caccd7ae96551c2e4": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "FloatProgressModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_ad75f887a140416abfca615b2fc3c385",
            "max": 3996690997,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_dee02a37a6f44f168546ee0077dc20d1",
            "value": 3996690997
          }
        },
        "0ac4d8e674804ad6bdc5f2d62f2e0d33": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HBoxModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_7bfcd9acf29646db8b6123708d1ffe27",
              "IPY_MODEL_5e88d6515f16475fb72d7c153422b591",
              "IPY_MODEL_5e5b77dd649547f896ab306fccc94a4e"
            ],
            "layout": "IPY_MODEL_a843fa23e6c94fb486bff8764574fdc5"
          }
        },
        "0c0c96eeac664f339aa4511bf47087e2": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HBoxModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_18451e19df5449b1853b5e13dacd19c5",
              "IPY_MODEL_d864d29d02c54ecfaedd7b866a6df8c2",
              "IPY_MODEL_7875163297284832a35aca84cbb105ce"
            ],
            "layout": "IPY_MODEL_d42d8228ea1247a1a81bb99b18c4640c"
          }
        },
        "0f99489932aa409b94ba34764aff19b0": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "1183d3f2ad3c4fb0af1d925b5f9e3efe": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HBoxModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_9cc51d8029eb4217bc37daa918649692",
              "IPY_MODEL_41f13d2f023e405180689e03bc2c32a1",
              "IPY_MODEL_247484c0bf5945bcb4627b48928366c8"
            ],
            "layout": "IPY_MODEL_14c0f20a9ab341ee966fe77815099ff0"
          }
        },
        "147743757c804b85af2ef194f5f84e6a": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "14c0f20a9ab341ee966fe77815099ff0": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "152d7bf2a74f400db3d3ecaa719ef8d1": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "18451e19df5449b1853b5e13dacd19c5": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_bcda4c9a48e943a6a0ef812fcd64a6db",
            "placeholder": "​",
            "style": "IPY_MODEL_61e491b843c347b6b2a9948de7caf01d",
            "value": "tokenizer_config.json: "
          }
        },
        "1c96edb2f7c948b9968b1239982af942": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_ee23056662ad4b719b65005d776e0e72",
            "placeholder": "​",
            "style": "IPY_MODEL_87765ca0996b403dbe29deef48d548bf",
            "value": " 4.00G/4.00G [01:42&lt;00:00, 117MB/s]"
          }
        },
        "219ca32ab51e4b4385b2c1026a78503a": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HBoxModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_6c2ccfe3363b40b58fc26ea164d4ead4",
              "IPY_MODEL_07f0420c4dfa477caccd7ae96551c2e4",
              "IPY_MODEL_1c96edb2f7c948b9968b1239982af942"
            ],
            "layout": "IPY_MODEL_d93be4994f104b6e99d89a9e73cd6abd"
          }
        },
        "245590db7d374515a428ff4abbd25588": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "ProgressStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "247484c0bf5945bcb4627b48928366c8": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_cef064f1c55f41bf957fc4623260fdb4",
            "placeholder": "​",
            "style": "IPY_MODEL_37cbe8800af04a42a0355922969b6393",
            "value": " 4/4 [01:00&lt;00:00, 13.06s/it]"
          }
        },
        "263b7dc0b3fd465fac89b9266b19d526": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "FloatProgressModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_147743757c804b85af2ef194f5f84e6a",
            "max": 4,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_2820e352ab004e818949acc31eb3888d",
            "value": 4
          }
        },
        "2820e352ab004e818949acc31eb3888d": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "ProgressStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "2a6aa92676c74509b58373ca604c5b3b": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "2a6f43b64d164636a2d9708f0190f21b": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "2c40c6b846924200b29616a590af1672": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_06ab9eaa6f0f48c4b68cff1ca4b9f2fa",
            "placeholder": "​",
            "style": "IPY_MODEL_d98c2b1e979b4929891a8ee0c11f55df",
            "value": "model.safetensors.index.json: "
          }
        },
        "2fa84865e9f14c1491402ef81517b4bd": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "32d6af64f2464cfb965671f2692b4e15": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "34a9e38b0b454a69a067d1ddadec7626": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_9c4d6839934b4b13952a850d2084d498",
            "placeholder": "​",
            "style": "IPY_MODEL_c6a1decbc0e7421db622033214913cb9",
            "value": "Fetching 4 files: 100%"
          }
        },
        "350f29f737534bfba4258bc31ec274a2": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "36676899a61f4be4b631f6271f6ecec9": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "37cbe8800af04a42a0355922969b6393": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "3f9b801b52da4eb79f730d87bea5c338": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HBoxModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_b66c6ded549d4db8a2e5ea8e5016615c",
              "IPY_MODEL_43da5073c3ad4e98a3ade17a0bb3b93d",
              "IPY_MODEL_40365e2c9fef49148e4c93592d458afc"
            ],
            "layout": "IPY_MODEL_7e9d5212fc7844f286e14b70cbf0bc7a"
          }
        },
        "40138ff29073407abb95f793509fc320": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "40365e2c9fef49148e4c93592d458afc": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_2a6f43b64d164636a2d9708f0190f21b",
            "placeholder": "​",
            "style": "IPY_MODEL_65c62d2198e64ee4a9e6547c2733135a",
            "value": " 1.16G/1.16G [00:25&lt;00:00, 39.8MB/s]"
          }
        },
        "41f13d2f023e405180689e03bc2c32a1": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "FloatProgressModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_36676899a61f4be4b631f6271f6ecec9",
            "max": 4,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_77ecad9f150c430fa85f5833d97c42df",
            "value": 4
          }
        },
        "43da5073c3ad4e98a3ade17a0bb3b93d": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "FloatProgressModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_4513a73fa95b41b5b6edadc9143ba9c1",
            "max": 1158267008,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_792d75a7d18945e7972826ac5b2ac386",
            "value": 1158267008
          }
        },
        "4513a73fa95b41b5b6edadc9143ba9c1": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "48741bbdeccb459aa4eea9c61339764b": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "4b9b3fe8dc764eedb9e18f166fe2f548": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_87a808c4d4f54f719adcd29de7206e1b",
            "placeholder": "​",
            "style": "IPY_MODEL_5f0b2a0e1953406b88af2c884904e2da",
            "value": "model-00003-of-00004.safetensors: 100%"
          }
        },
        "4cb119127b404f46a53012c62d004e28": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "4d67b10ec7794170addb4e968e20f170": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "4da21f53bf7f4e2d8132eb43e6ecc739": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "4db5ee5b7b674abba75fbce264e6dfa3": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "4fbc4cfe529d471ba85f3ae8e53b28d6": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HBoxModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_a0d0fedc5bec4f5b943fddf9a954fbdf",
              "IPY_MODEL_cab602573c6940919f93e59fe6f4838d",
              "IPY_MODEL_51b8f4ce40f94ac39cf44d98f1522ec7"
            ],
            "layout": "IPY_MODEL_32d6af64f2464cfb965671f2692b4e15"
          }
        },
        "51aaa109480d4ae6bd419aea689d22ee": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "51b8f4ce40f94ac39cf44d98f1522ec7": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_60ceb890b5644493a8886d91b9dac461",
            "placeholder": "​",
            "style": "IPY_MODEL_40138ff29073407abb95f793509fc320",
            "value": " 446/446 [00:00&lt;00:00, 50.5kB/s]"
          }
        },
        "55ac5c2a82ee48fe988e1e4f26c168b0": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "ProgressStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "5657a84bf4b74710b2de1a54f9236e39": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "596c2a62a635469eb74233ce00586a6f": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "59e46bbe96df4b88ad31c09096ce0e0a": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "5a59fb5f7acf4213847c985e66c9ee3c": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_81a728910a2341a785a6f252bbb371f7",
            "placeholder": "​",
            "style": "IPY_MODEL_69a8d50f11244ba688c183d14d2395ec",
            "value": "generation_config.json: 100%"
          }
        },
        "5b7af68130f04a63ad3efa3d9f602ebe": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_80fa3aef5e2040d9904c6b87b7214ca0",
            "placeholder": "​",
            "style": "IPY_MODEL_0f99489932aa409b94ba34764aff19b0",
            "value": " 4/4 [01:42&lt;00:00, 42.23s/it]"
          }
        },
        "5e5b77dd649547f896ab306fccc94a4e": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_59e46bbe96df4b88ad31c09096ce0e0a",
            "placeholder": "​",
            "style": "IPY_MODEL_8f5c7b88a2cc4b5abb0814c814833349",
            "value": " 15.1k/? [00:00&lt;00:00, 1.37MB/s]"
          }
        },
        "5e88d6515f16475fb72d7c153422b591": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "FloatProgressModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_923653dfe90e475a9efa44baf98ba9a0",
            "max": 1,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_62600092f8cc43f493b86b0169f67be1",
            "value": 1
          }
        },
        "5ebe7b4e4ed24c53b783ee46377c682d": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "FloatProgressModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_51aaa109480d4ae6bd419aea689d22ee",
            "max": 3998751275,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_acf4e50a248342f68d26daef21baa419",
            "value": 3998751275
          }
        },
        "5f0b2a0e1953406b88af2c884904e2da": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "60ceb890b5644493a8886d91b9dac461": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "614c5332c7d045109102a329e7f69dfd": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "61e491b843c347b6b2a9948de7caf01d": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "62600092f8cc43f493b86b0169f67be1": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "ProgressStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "65c62d2198e64ee4a9e6547c2733135a": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "68ea891644ca4753a8e1bf278ff47e84": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "69a8d50f11244ba688c183d14d2395ec": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "6a47e60b10a6481b94aee021c8dbc7ba": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "6ab4e5676ad84807a126fffa99f7a0d4": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HBoxModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_e61ef80398444c13bf7cd20ef21a5057",
              "IPY_MODEL_5ebe7b4e4ed24c53b783ee46377c682d",
              "IPY_MODEL_e0fdef0087bc4a91a11932a2d933c001"
            ],
            "layout": "IPY_MODEL_596c2a62a635469eb74233ce00586a6f"
          }
        },
        "6c2ccfe3363b40b58fc26ea164d4ead4": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_4da21f53bf7f4e2d8132eb43e6ecc739",
            "placeholder": "​",
            "style": "IPY_MODEL_735f70fac43449e3974de1b783d56d33",
            "value": "model-00002-of-00004.safetensors: 100%"
          }
        },
        "735f70fac43449e3974de1b783d56d33": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "749e8407a901483c8b513a2fb71596c8": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "FloatProgressModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_ef01b874478b4bb497d31d2f8dd6145a",
            "max": 1,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_d50ea8cded9848ffa18be1ae6a2559df",
            "value": 1
          }
        },
        "751a46fbb8e24efabfb381a85c90fbe8": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "77204d81ff8f4ee585361a503fa647dc": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "77d34c0f1de548b4872208a063bb5017": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "77ecad9f150c430fa85f5833d97c42df": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "ProgressStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "7841bc90b6a74120ab3e603c76332a01": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "7875163297284832a35aca84cbb105ce": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_ba94310dc12a4a258205b14901ad3f94",
            "placeholder": "​",
            "style": "IPY_MODEL_a93210a691414502ba3c2dff03ffb4ce",
            "value": " 22.8k/? [00:00&lt;00:00, 1.66MB/s]"
          }
        },
        "792d75a7d18945e7972826ac5b2ac386": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "ProgressStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "7baca79d720c40b5a923b9717e28c982": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_ffabf89ecd9d48a5a3fc2a1c855ce080",
            "placeholder": "​",
            "style": "IPY_MODEL_614c5332c7d045109102a329e7f69dfd",
            "value": " 1.19M/? [00:00&lt;00:00, 81.8MB/s]"
          }
        },
        "7bd5d1beeb0e49e293d9f6b91bb6d7fb": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "ProgressStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "7bfcd9acf29646db8b6123708d1ffe27": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_fd0ac7ed3d3146ec85913f4e05c4a2f6",
            "placeholder": "​",
            "style": "IPY_MODEL_77204d81ff8f4ee585361a503fa647dc",
            "value": "chat_template.jinja: "
          }
        },
        "7d3379cbd27a4218a9d84c5a12f3bb88": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "7e9d5212fc7844f286e14b70cbf0bc7a": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "80fa3aef5e2040d9904c6b87b7214ca0": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "81a728910a2341a785a6f252bbb371f7": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "84d27c45065e426badbfcfcdc8ff16b6": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "FloatProgressModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_4d67b10ec7794170addb4e968e20f170",
            "max": 27868174,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_55ac5c2a82ee48fe988e1e4f26c168b0",
            "value": 27868174
          }
        },
        "87765ca0996b403dbe29deef48d548bf": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "87a808c4d4f54f719adcd29de7206e1b": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "8c7c6bb04a3f4a1494b34529f95a195c": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "8db5e86577744ff1a39c8e198eee5dd3": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HBoxModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_4b9b3fe8dc764eedb9e18f166fe2f548",
              "IPY_MODEL_cca95e973bc445d3811335debf7c446e",
              "IPY_MODEL_e507a46b4c754d9a8aede2aac0d203bc"
            ],
            "layout": "IPY_MODEL_751a46fbb8e24efabfb381a85c90fbe8"
          }
        },
        "8f1e6c36b84c4115a671dcb9ade41c8b": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "8f5c7b88a2cc4b5abb0814c814833349": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "923653dfe90e475a9efa44baf98ba9a0": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": "20px"
          }
        },
        "9a079a30b4ae4bbc80122faf83e0ad59": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "9beac0680e3049dfafcb6ec185fd2265": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "ProgressStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "9c4d6839934b4b13952a850d2084d498": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "9cc51d8029eb4217bc37daa918649692": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_a219f3b89a34443abe612846676f9356",
            "placeholder": "​",
            "style": "IPY_MODEL_152d7bf2a74f400db3d3ecaa719ef8d1",
            "value": "Loading checkpoint shards: 100%"
          }
        },
        "a0d0fedc5bec4f5b943fddf9a954fbdf": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_e1e77d98b01f4376a6c075975c27571e",
            "placeholder": "​",
            "style": "IPY_MODEL_6a47e60b10a6481b94aee021c8dbc7ba",
            "value": "special_tokens_map.json: 100%"
          }
        },
        "a219f3b89a34443abe612846676f9356": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "a843fa23e6c94fb486bff8764574fdc5": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "a93210a691414502ba3c2dff03ffb4ce": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "abe2b0a2913d4633943f44333ae799f8": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HBoxModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_2c40c6b846924200b29616a590af1672",
              "IPY_MODEL_749e8407a901483c8b513a2fb71596c8",
              "IPY_MODEL_7baca79d720c40b5a923b9717e28c982"
            ],
            "layout": "IPY_MODEL_68ea891644ca4753a8e1bf278ff47e84"
          }
        },
        "acda8e7582934fecbbf854e66e23f698": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "acf4e50a248342f68d26daef21baa419": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "ProgressStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "ad75f887a140416abfca615b2fc3c385": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "ae6d42fb84fc4984af1d4430acdcd3c9": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "FloatProgressModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_350f29f737534bfba4258bc31ec274a2",
            "max": 165,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_9beac0680e3049dfafcb6ec185fd2265",
            "value": 165
          }
        },
        "b07acf871a0a46f1889bfb439d13752b": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "ProgressStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "b66c6ded549d4db8a2e5ea8e5016615c": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_77d34c0f1de548b4872208a063bb5017",
            "placeholder": "​",
            "style": "IPY_MODEL_bf96e8666c224c26b0a01451d08e907a",
            "value": "model-00004-of-00004.safetensors: 100%"
          }
        },
        "ba94310dc12a4a258205b14901ad3f94": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "bcda4c9a48e943a6a0ef812fcd64a6db": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "bf96e8666c224c26b0a01451d08e907a": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "c6a1decbc0e7421db622033214913cb9": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "cab602573c6940919f93e59fe6f4838d": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "FloatProgressModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_5657a84bf4b74710b2de1a54f9236e39",
            "max": 446,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_7bd5d1beeb0e49e293d9f6b91bb6d7fb",
            "value": 446
          }
        },
        "caf742160db041a1b6c2cfdf78f2dc9a": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HBoxModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_34a9e38b0b454a69a067d1ddadec7626",
              "IPY_MODEL_263b7dc0b3fd465fac89b9266b19d526",
              "IPY_MODEL_5b7af68130f04a63ad3efa3d9f602ebe"
            ],
            "layout": "IPY_MODEL_2a6aa92676c74509b58373ca604c5b3b"
          }
        },
        "cca95e973bc445d3811335debf7c446e": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "FloatProgressModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_2fa84865e9f14c1491402ef81517b4bd",
            "max": 3372033380,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_245590db7d374515a428ff4abbd25588",
            "value": 3372033380
          }
        },
        "cef064f1c55f41bf957fc4623260fdb4": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "d42d8228ea1247a1a81bb99b18c4640c": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "d50ea8cded9848ffa18be1ae6a2559df": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "ProgressStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "d864d29d02c54ecfaedd7b866a6df8c2": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "FloatProgressModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "FloatProgressModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "ProgressView",
            "bar_style": "success",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_dee07d33b8de4c3b847fcff670e68102",
            "max": 1,
            "min": 0,
            "orientation": "horizontal",
            "style": "IPY_MODEL_b07acf871a0a46f1889bfb439d13752b",
            "value": 1
          }
        },
        "d9020a2a2c8440db81d2cfdf0289b667": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "d93be4994f104b6e99d89a9e73cd6abd": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "d98c2b1e979b4929891a8ee0c11f55df": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "DescriptionStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "DescriptionStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "description_width": ""
          }
        },
        "da4324e287e64e5ba98fc110693066df": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "dbf5ed93dac646ed979fa7a8c569dfe3": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "dbfeea8ee2374b8c8fa70431c35f281f": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_d9020a2a2c8440db81d2cfdf0289b667",
            "placeholder": "​",
            "style": "IPY_MODEL_04d39c4dda9f4a1bb01b8d6320032372",
            "value": "tokenizer.json: 100%"
          }
        },
        "dee02a37a6f44f168546ee0077dc20d1": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "ProgressStyleModel",
          "state": {
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "ProgressStyleModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "StyleView",
            "bar_color": null,
            "description_width": ""
          }
        },
        "dee07d33b8de4c3b847fcff670e68102": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": "20px"
          }
        },
        "e0fdef0087bc4a91a11932a2d933c001": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_7d3379cbd27a4218a9d84c5a12f3bb88",
            "placeholder": "​",
            "style": "IPY_MODEL_7841bc90b6a74120ab3e603c76332a01",
            "value": " 4.00G/4.00G [01:41&lt;00:00, 60.6MB/s]"
          }
        },
        "e1e77d98b01f4376a6c075975c27571e": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "e2973e6c02834a7c9f2f6ce5755f35f0": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "e507a46b4c754d9a8aede2aac0d203bc": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_e2973e6c02834a7c9f2f6ce5755f35f0",
            "placeholder": "​",
            "style": "IPY_MODEL_48741bbdeccb459aa4eea9c61339764b",
            "value": " 3.37G/3.37G [01:40&lt;00:00, 32.0MB/s]"
          }
        },
        "e61ef80398444c13bf7cd20ef21a5057": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_da4324e287e64e5ba98fc110693066df",
            "placeholder": "​",
            "style": "IPY_MODEL_8c7c6bb04a3f4a1494b34529f95a195c",
            "value": "model-00001-of-00004.safetensors: 100%"
          }
        },
        "ee23056662ad4b719b65005d776e0e72": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "ef01b874478b4bb497d31d2f8dd6145a": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": "20px"
          }
        },
        "f8dacdab001d4db0b6b3776ac7d3634a": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HBoxModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_5a59fb5f7acf4213847c985e66c9ee3c",
              "IPY_MODEL_ae6d42fb84fc4984af1d4430acdcd3c9",
              "IPY_MODEL_02d120e49f2c4f95a6090b1d8d521767"
            ],
            "layout": "IPY_MODEL_8f1e6c36b84c4115a671dcb9ade41c8b"
          }
        },
        "fa9ea0d3234e41689c827485d0360885": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HTMLModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HTMLModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HTMLView",
            "description": "",
            "description_tooltip": null,
            "layout": "IPY_MODEL_9a079a30b4ae4bbc80122faf83e0ad59",
            "placeholder": "​",
            "style": "IPY_MODEL_acda8e7582934fecbbf854e66e23f698",
            "value": " 27.9M/27.9M [00:00&lt;00:00, 44.5MB/s]"
          }
        },
        "fd0ac7ed3d3146ec85913f4e05c4a2f6": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "fd2fe9ef6da64f72ab29d481d1739f5e": {
          "model_module": "@jupyter-widgets/controls",
          "model_module_version": "1.5.0",
          "model_name": "HBoxModel",
          "state": {
            "_dom_classes": [],
            "_model_module": "@jupyter-widgets/controls",
            "_model_module_version": "1.5.0",
            "_model_name": "HBoxModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/controls",
            "_view_module_version": "1.5.0",
            "_view_name": "HBoxView",
            "box_style": "",
            "children": [
              "IPY_MODEL_dbfeea8ee2374b8c8fa70431c35f281f",
              "IPY_MODEL_84d27c45065e426badbfcfcdc8ff16b6",
              "IPY_MODEL_fa9ea0d3234e41689c827485d0360885"
            ],
            "layout": "IPY_MODEL_4cb119127b404f46a53012c62d004e28"
          }
        },
        "ffabf89ecd9d48a5a3fc2a1c855ce080": {
          "model_module": "@jupyter-widgets/base",
          "model_module_version": "1.2.0",
          "model_name": "LayoutModel",
          "state": {
            "_model_module": "@jupyter-widgets/base",
            "_model_module_version": "1.2.0",
            "_model_name": "LayoutModel",
            "_view_count": null,
            "_view_module": "@jupyter-widgets/base",
            "_view_module_version": "1.2.0",
            "_view_name": "LayoutView",
            "align_content": null,
            "align_items": null,
            "align_self": null,
            "border": null,
            "bottom": null,
            "display": null,
            "flex": null,
            "flex_flow": null,
            "grid_area": null,
            "grid_auto_columns": null,
            "grid_auto_flow": null,
            "grid_auto_rows": null,
            "grid_column": null,
            "grid_gap": null,
            "grid_row": null,
            "grid_template_areas": null,
            "grid_template_columns": null,
            "grid_template_rows": null,
            "height": null,
            "justify_content": null,
            "justify_items": null,
            "left": null,
            "margin": null,
            "max_height": null,
            "max_width": null,
            "min_height": null,
            "min_width": null,
            "object_fit": null,
            "object_position": null,
            "order": null,
            "overflow": null,
            "overflow_x": null,
            "overflow_y": null,
            "padding": null,
            "right": null,
            "top": null,
            "visibility": null,
            "width": null
          }
        },
        "state" : {}
      }
    }
  },
  "nbformat": 4,
  "nbformat_minor": 0
}


================================================
FILE: examples/streamlit/streamlit_chat.py
================================================
import json

import requests
import streamlit as st

DEFAULT_FUNCTION_PROPERTIES = """
{
    "type": "object",
    "properties": {
        "location": {
            "type": "string",
            "description": "The city and state, e.g. San Francisco, CA"
        }
    },
    "required": ["location"]
}
""".strip()

# Session state for chat
if "messages" not in st.session_state:
    st.session_state.messages = []

st.title("💬 Chatbot")

if "model" not in st.session_state:
    if "model" in st.query_params:
        st.session_state.model = st.query_params["model"]
    else:
        st.session_state.model = "small"

options = ["large", "small"]
selection = st.sidebar.segmented_control(
    "Model", options, selection_mode="single", default=st.session_state.model
)
# st.session_state.model = selection
st.query_params.update({"model": selection})

instructions = st.sidebar.text_area(
    "Instructions",
    value="You are a helpful assistant that can answer questions and help with tasks.",
)
effort = st.sidebar.radio(
    "Reasoning effort",
    ["low", "medium", "high"],
    index=1,
)
st.sidebar.divider()
st.sidebar.subheader("Functions")
use_functions = st.sidebar.toggle("Use functions", value=False)

st.sidebar.subheader("Built-in Tools")
# Built-in Tools section
use_browser_search = st.sidebar.toggle("Use browser search", value=False)
use_code_interpreter = st.sidebar.toggle("Use code interpreter", value=False)

if use_functions:
    function_name = st.sidebar.text_input("Function name", value="get_weather")
    function_description = st.sidebar.text_area(
        "Function description", value="Get the weather for a given city"
    )
    function_parameters = st.sidebar.text_area(
        "Function parameters", value=DEFAULT_FUNCTION_PROPERTIES
    )
else:
    function_name = None
    function_description = None
    function_parameters = None
st.sidebar.divider()
temperature = st.sidebar.slider(
    "Temperature", min_value=0.0, max_value=1.0, value=1.0, step=0.01
)
max_output_tokens = st.sidebar.slider(
    "Max output tokens", min_value=1, max_value=131072, value=30000, step=1000
)
st.sidebar.divider()
debug_mode = st.sidebar.toggle("Debug mode", value=False)

if debug_mode:
    st.sidebar.divider()
    st.sidebar.code(json.dumps(st.session_state.messages, indent=2), "json")

render_input = True

URL = (
    "http://localhost:8081/v1/responses"
    if selection == options[1]
    else "http://localhost:8000/v1/responses"
)


def trigger_fake_tool(container):
    function_output = st.session_state.get("function_output", "It's sunny!")
    last_call = st.session_state.messages[-1]
    if last_call.get("type") == "function_call":
        st.session_state.messages.append(
            {
                "type": "function_call_output",
                "call_id": last_call.get("call_id"),
                "output": function_output,
            }
        )
        run(container)


def run(container):
    tools = []
    if use_functions:
        tools.append(
            {
                "type": "function",
                "name": function_name,
                "description": function_description,
                "parameters": json.loads(function_parameters),
            }
        )
    # Add browser_search tool if checkbox is checked
    if use_browser_search:
        tools.append({"type": "browser_search"})
    if use_code_interpreter:
        tools.append({"type": "code_interpreter"})
    response = requests.post(
        URL,
        json={
            "input": st.session_state.messages,
            "stream": True,
            "instructions": instructions,
            "reasoning": {"effort": effort},
            "metadata": {"__debug": debug_mode},
            "tools": tools,
            "temperature": temperature,
            "max_output_tokens": max_output_tokens,
        },
        stream=True,
    )

    text_delta = ""
    code_interpreter_sessions: dict[str, dict] = {}

    _current_output_index = 0
    for line in response.iter_lines(decode_unicode=True):
        if not line or not line.startswith("data:"):
            continue
        data_str = line[len("data:") :].strip()
        if not data_str:
            continue
        try:
            data = json.loads(data_str)
        except Exception:
            continue

        event_type = data.get("type", "")
        output_index = data.get("output_index", 0)
        if event_type == "response.output_item.added":
            _current_output_index = output_index
            output_type = data.get("item", {}).get("type", "message")
            if output_type == "message":
                output = container.chat_message("assistant")
                placeholder = output.empty()
            elif output_type == "reasoning":
                output = container.chat_message("reasoning", avatar="🤔")
                placeholder = output.empty()
            elif output_type == "web_search_call":
                output = container.chat_message("web_search_call", avatar="🌐")
                output.code(
                    json.dumps(data.get("item", {}).get("action", {}), indent=4),
                    language="json",
                )
                placeholder = output.empty()
            elif output_type == "code_interpreter_call":
                item = data.get("item", {})
                item_id = item.get("id")
                message_container = container.chat_message(
                    "code_interpreter_call", avatar="🧪"
                )
                status_placeholder = message_container.empty()
                code_placeholder = message_container.empty()
                outputs_container = message_container.container()
                code_text = item.get("code") or ""
                if code_text:
                    code_placeholder.code(code_text, language="python")
                code_interpreter_sessions[item_id] = {
                    "status": status_placeholder,
                    "code": code_placeholder,
                    "outputs": outputs_container,
                    "code_text": code_text,
                    "rendered_outputs": False,
                }
                placeholder = status_placeholder
            text_delta = ""
        elif event_type == "response.reasoning_text.delta":
            output.avatar = "🤔"
            text_delta += data.get("delta", "")
            placeholder.markdown(text_delta)
        elif event_type == "response.output_text.delta":
            text_delta += data.get("delta", "")
            placeholder.markdown(text_delta)
        elif event_type == "response.output_item.done":
            item = data.get("item", {})
            if item.get("type") == "function_call":
                with container.chat_message("function_call", avatar="🔨"):
                    st.markdown(f"Called `{item.get('name')}`")
                    st.caption("Arguments")
                    st.code(item.get("arguments", ""), language="json")
            if item.get("type") == "web_search_call":
                placeholder.markdown("✅ Done")
            if item.get("type") == "code_interpreter_call":
                item_id = item.get("id")
                session = code_interpreter_sessions.get(item_id)
                if session:
                    session["status"].markdown("✅ Done")
                    final_code = item.get("code") or session["code_text"]
                    if final_code:
                        session["code"].code(final_code, language="python")
                        session["code_text"] = final_code
                    outputs = item.get("outputs") or []
                    if outputs and not session["rendered_outputs"]:
                        with session["outputs"]:
                            st.markdown("**Outputs**")
                            for output_item in outputs:
                                output_type = output_item.get("type")
                                if output_type == "logs":
                                    st.code(
                                        output_item.get("logs", ""),
                                        language="text",
                                    )
                                elif output_type == "image":
                                    st.image(
                                        output_item.get("url", ""),
                                        caption="Code interpreter image",
                                    )
                        session["rendered_outputs"] = True
                    elif not outputs and not session["rendered_outputs"]:
                        with session["outputs"]:
                            st.caption("(No outputs)")
                        session["rendered_outputs"] = True
                else:
                    placeholder.markdown("✅ Done")
        elif event_type == "response.code_interpreter_call.in_progress":
            item_id = data.get("item_id")
            session = code_interpreter_sessions.get(item_id)
            if session:
                session["status"].markdown("⏳ Running")
            else:
                try:
                    placeholder.markdown("⏳ Running")
                except Exception:
                    pass
        elif event_type == "response.code_interpreter_call.interpreting":
            item_id = data.get("item_id")
            session = code_interpreter_sessions.get(item_id)
            if session:
                session["status"].markdown("🧮 Interpreting")
        elif event_type == "response.code_interpreter_call.completed":
            item_id = data.get("item_id")
            session = code_interpreter_sessions.get(item_id)
            if session:
                session["status"].markdown("✅ Done")
            else:
                try:
                    placeholder.markdown("✅ Done")
                except Exception:
                    pass
        elif event_type == "response.code_interpreter_call_code.delta":
            item_id = data.get("item_id")
            session = code_interpreter_sessions.get(item_id)
            if session:
                session["code_text"] += data.get("delta", "")
                if session["code_text"].strip():
                    session["code"].code(session["code_text"], language="python")
        elif event_type == "response.code_interpreter_call_code.done":
            item_id = data.get("item_id")
            session = code_interpreter_sessions.get(item_id)
            if session:
                final_code = data.get("code") or session["code_text"]
                session["code_text"] = final_code
                if final_code:
                    session["code"].code(final_code, language="python")
        elif event_type == "response.completed":
            response = data.get("response", {})
            if debug_mode:
                container.expander("Debug", expanded=False).code(
                    response.get("metadata", {}).get("__debug", ""), language="text"
                )
            st.session_state.messages.extend(response.get("output", []))
            if st.session_state.messages[-1].get("type") == "function_call":
                with container.form("function_output_form"):
                    _function_output = st.text_input(
                        "Enter function output",
                        value=st.session_state.get("function_output", "It's sunny!"),
                        key="function_output",
                    )
                    st.form_submit_button(
                        "Submit function output",
                        on_click=trigger_fake_tool,
                        args=[container],
                    )
            # Optionally handle other event types...


# Chat display
for msg in st.session_state.messages:
    if msg.get("type") == "message":
        with st.chat_message(msg["role"]):
            for item in msg["content"]:
                if (
                    item.get("type") == "text"
                    or item.get("type") == "output_text"
                    or item.get("type") == "input_text"
                ):
                    st.markdown(item["text"])
                    if item.get("annotations"):
                        annotation_lines = "\n".join(
                            f"- {annotation.get('url')}"
                            for annotation in item["annotations"]
                            if annotation.get("url")
                        )
                        st.caption(f"**Annotations:**\n{annotation_lines}")
    elif msg.get("type") == "reasoning":
        with st.chat_message("reasoning", avatar="🤔"):
            for item in msg["content"]:
                if item.get("type") == "reasoning_text":
                    st.markdown(item["text"])
    elif msg.get("type") == "function_call":
        with st.chat_message("function_call", avatar="🔨"):
            st.markdown(f"Called `{msg.get('name')}`")
            st.caption("Arguments")
            st.code(msg.get("arguments", ""), language="json")
    elif msg.get("type") == "function_call_output":
        with st.chat_message("function_call_output", avatar="✅"):
            st.caption("Output")
            st.code(msg.get("output", ""), language="text")
    elif msg.get("type") == "web_search_call":
        with st.chat_message("web_search_call", avatar="🌐"):
            st.code(json.dumps(msg.get("action", {}), indent=4), language="json")
            st.markdown("✅ Done")
    elif msg.get("type") == "code_interpreter_call":
        with st.chat_message("code_interpreter_call", avatar="🧪"):
            st.markdown("✅ Done")

if render_input:
    # Input field
    if prompt := st.chat_input("Type a message..."):
        st.session_state.messages.append(
            {
                "type": "message",
                "role": "user",
                "content": [{"type": "input_text", "text": prompt}],
            }
        )

        with st.chat_message("user"):
            st.markdown(prompt)

        run(st.container())


================================================
FILE: gpt-oss-mcp-server/README.md
================================================
# MCP Servers for gpt-oss reference tools

This directory contains MCP servers for the reference tools in the [gpt-oss](https://github.com/openai/gpt-oss) repository.
You can set up these tools behind MCP servers and use them in your applications.
For inference service that integrates with MCP, you can also use these as reference tools.

In particular, this directory contains a `build-system-prompt.py` script that will generate exactly the same system prompt as `reference-system-prompt.py`.
The build system prompt script show case all the care needed to automatically discover the tools and construct the system prompt before feeding it into Harmony.

## Usage

```bash
# Install the dependencies
uv pip install -r requirements.txt
```

```bash
# Assume we have harmony and gpt-oss installed
uv pip install mcp[cli]
# start the servers
mcp run -t sse browser_server.py:mcp
mcp run -t sse python_server.py:mcp
```

You can now use MCP inspector to play with the tools.
Once opened, set SSE to `http://localhost:8001/sse` and `http://localhost:8000/sse` respectively.

To compare the system prompt and see how to construct it via MCP service discovery, see `build-system-prompt.py`.
This script will generate exactly the same system prompt as `reference-system-prompt.py`.


================================================
FILE: gpt-oss-mcp-server/browser_server.py
================================================
import os
from collections.abc import AsyncIterator
from contextlib import asynccontextmanager
from dataclasses import dataclass, field
from typing import Union, Optional

from mcp.server.fastmcp import Context, FastMCP
from gpt_oss.tools.simple_browser import SimpleBrowserTool
from gpt_oss.tools.simple_browser.backend import YouComBackend, ExaBackend

@dataclass
class AppContext:
    browsers: dict[str, SimpleBrowserTool] = field(default_factory=dict)

    def create_or_get_browser(self, session_id: str) -> SimpleBrowserTool:
        if session_id not in self.browsers:
            tool_backend = os.getenv("BROWSER_BACKEND", "exa")
            if tool_backend == "youcom":
                backend = YouComBackend(source="web")
            elif tool_backend == "exa":
                backend = ExaBackend(source="web")
            else:
                raise ValueError(f"Invalid tool backend: {tool_backend}")
            self.browsers[session_id] = SimpleBrowserTool(backend=backend)
        return self.browsers[session_id]

    def remove_browser(self, session_id: str) -> None:
        self.browsers.pop(session_id, None)


@asynccontextmanager
async def app_lifespan(_server: FastMCP) -> AsyncIterator[AppContext]:
    yield AppContext()


# Pass lifespan to server
mcp = FastMCP(
    name="browser",
    instructions=r"""
Tool for browsing.
The `cursor` appears in brackets before each browsing display: `[{cursor}]`.
Cite information from the tool using the following format:
`【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`. 
Do not quote more than 10 words directly from the tool output.
sources=web
""".strip(),
    lifespan=app_lifespan,
    port=8001,
)


@mcp.tool(
    name="search",
    title="Search for information",
    description=
    "Searches for information related to `query` and displays `topn` results.",
)
async def search(ctx: Context,
                 query: str,
                 topn: int = 10,
                 source: Optional[str] = None) -> str:
    """Search for information related to a query"""
    browser = ctx.request_context.lifespan_context.create_or_get_browser(
        ctx.client_id)
    messages = []
    async for message in browser.search(query=query, topn=topn, source=source):
        if message.content and hasattr(message.content[0], 'text'):
            messages.append(message.content[0].text)
    return "\n".join(messages)


@mcp.tool(
    name="open",
    title="Open a link or page",
    description="""
Opens the link `id` from the page indicated by `cursor` starting at line number `loc`, showing `num_lines` lines.
Valid link ids are displayed with the formatting: `【{id}†.*】`.
If `cursor` is not provided, the most recent page is implied.
If `id` is a string, it is treated as a fully qualified URL associated with `source`.
If `loc` is not provided, the viewport will be positioned at the beginning of the document or centered on the most relevant passage, if available.
Use this function without `id` to scroll to a new location of an opened page.
""".strip(),
)
async def open_link(ctx: Context,
                    id: Union[int, str] = -1,
                    cursor: int = -1,
                    loc: int = -1,
                    num_lines: int = -1,
                    view_source: bool = False,
                    source: Optional[str] = None) -> str:
    """Open a link or navigate to a page location"""
    browser = ctx.request_context.lifespan_context.create_or_get_browser(
        ctx.client_id)
    messages = []
    async for message in browser.open(id=id,
                                      cursor=cursor,
                                      loc=loc,
                                      num_lines=num_lines,
                                      view_source=view_source,
                                      source=source):
        if message.content and hasattr(message.content[0], 'text'):
            messages.append(message.content[0].text)
    return "\n".join(messages)


@mcp.tool(
    name="find",
    title="Find pattern in page",
    description=
    "Finds exact matches of `pattern` in the current page, or the page given by `cursor`.",
)
async def find_pattern(ctx: Context, pattern: str, cursor: int = -1) -> str:
    """Find exact matches of a pattern in the current page"""
    browser = ctx.request_context.lifespan_context.create_or_get_browser(
        ctx.client_id)
    messages = []
    async for message in browser.find(pattern=pattern, cursor=cursor):
        if message.content and hasattr(message.content[0], 'text'):
            messages.append(message.content[0].text)
    return "\n".join(messages)


================================================
FILE: gpt-oss-mcp-server/build-system-prompt.py
================================================
import datetime
import asyncio

from gpt_oss.tokenizer import get_tokenizer

from openai_harmony import (
    Conversation,
    DeveloperContent,
    HarmonyEncodingName,
    Message,
    ReasoningEffort,
    Role,
    SystemContent,
    ToolNamespaceConfig,
    ToolDescription,
    load_harmony_encoding,
)

from mcp import ClientSession
from mcp.client.sse import sse_client
from mcp.types import ListToolsResult


async def list_server_and_tools(server_url: str):
    async with sse_client(url=server_url) as streams, ClientSession(
            *streams) as session:
        initialize_response = await session.initialize()
        list_tools_response = await session.list_tools()
        return initialize_response, list_tools_response


def trim_schema(schema: dict) -> dict:
    # Turn JSON Schema from MCP generated into Harmony's variant.
    if "title" in schema:
        del schema["title"]
    if "default" in schema and schema["default"] is None:
        del schema["default"]
    if "anyOf" in schema:
        # Turn "anyOf": [{"type": "type-1"}, {"type": "type-2"}] into "type": ["type-1", "type-2"]
        # if there's more than 1 types, also remove "null" type as Harmony will just ignore it
        types = [
            type_dict["type"] for type_dict in schema["anyOf"]
            if type_dict["type"] != 'null'
        ]
        schema["type"] = types
        del schema["anyOf"]
    if "properties" in schema:
        schema["properties"] = {
            k: trim_schema(v)
            for k, v in schema["properties"].items()
        }
    return schema


def post_process_tools_description(
        list_tools_result: ListToolsResult) -> ListToolsResult:
    # Adapt the MCP tool result for Harmony
    for tool in list_tools_result.tools:
        tool.inputSchema = trim_schema(tool.inputSchema)

    # Some tools schema don't need to be part of the prompt (e.g. simple text in text out for Python)
    list_tools_result.tools = [
        tool for tool in list_tools_result.tools
        if getattr(tool.annotations, "include_in_prompt", True)
    ]

    return list_tools_result

tokenizer = get_tokenizer()

tools_urls = [
    "http://localhost:8001/sse",  # browser
    "http://localhost:8000/sse",  # python
]
harmony_tool_descriptions = []
for tools_url in tools_urls:

    initialize_response, list_tools_response = asyncio.run(
        list_server_and_tools(tools_url))

    list_tools_response = post_process_tools_description(list_tools_response)

    tool_from_mcp = ToolNamespaceConfig(
        name=initialize_response.serverInfo.name,
        description=initialize_response.instructions,
        tools=[
            ToolDescription.new(name=tool.name,
                                description=tool.description,
                                parameters=tool.inputSchema)
            for tool in list_tools_response.tools
        ])
    harmony_tool_descriptions.append(tool_from_mcp)

encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)

system_message_content = (SystemContent.new().with_reasoning_effort(
    ReasoningEffort.LOW).with_conversation_start_date(
        datetime.datetime.now().strftime("%Y-%m-%d")))

for tool_description in harmony_tool_descriptions:
    system_message_content = system_message_content.with_tools(
        tool_description)

system_message = Message.from_role_and_content(Role.SYSTEM,
                                               system_message_content)

developer_message_content = DeveloperContent.new().with_instructions("")
developer_message = Message.from_role_and_content(Role.DEVELOPER,
                                                  developer_message_content)

messages = [system_message, developer_message]

conversation = Conversation.from_messages(messages)
tokens = encoding.render_conversation(conversation)
system_message = tokenizer.decode(tokens)
print(system_message)


================================================
FILE: gpt-oss-mcp-server/pyproject.toml
================================================
[project]
name = "gpt-oss-mcp-server"
version = "0.1.0"
requires-python = ">=3.10"
dependencies = [
    "mcp[cli]>=1.12.2",
    # "gpt_oss"
]


================================================
FILE: gpt-oss-mcp-server/python_server.py
================================================
from mcp.server.fastmcp import FastMCP
from gpt_oss.tools.python_docker.docker_tool import PythonTool
from openai_harmony import Message, TextContent, Author, Role

# Pass lifespan to server
mcp = FastMCP(
    name="python",
    instructions=r"""
Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).
When you send a message containing python code to python, it will be executed in a stateless docker container, and the stdout of that process will be returned to you.
""".strip(),
)


@mcp.tool(
    name="python",
    title="Execute Python code",
    description="""
Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).
When you send a message containing python code to python, it will be executed in a stateless docker container, and the stdout of that process will be returned to you.
    """,
    annotations={
        # Harmony format don't want this schema to be part of it because it's simple text in text out
        "include_in_prompt": False,
    })
async def python(code: str) -> str:
    tool = PythonTool()
    messages = []
    async for message in tool.process(
            Message(author=Author(role=Role.TOOL, name="python"),
                    content=[TextContent(text=code)])):
        messages.append(message)
    return "\n".join([message.content[0].text for message in messages])


================================================
FILE: gpt-oss-mcp-server/reference-system-prompt.py
================================================
import datetime

from gpt_oss.tools.simple_browser import SimpleBrowserTool
from gpt_oss.tools.simple_browser.backend import YouComBackend
from gpt_oss.tools.python_docker.docker_tool import PythonTool
from gpt_oss.tokenizer import tokenizer

from openai_harmony import (
    Conversation,
    DeveloperContent,
    HarmonyEncodingName,
    Message,
    ReasoningEffort,
    Role,
    SystemContent,
    load_harmony_encoding,
)

encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)

system_message_content = (SystemContent.new().with_reasoning_effort(
    ReasoningEffort.LOW).with_conversation_start_date(
        datetime.datetime.now().strftime("%Y-%m-%d")))

backend = YouComBackend(source="web")
browser_tool = SimpleBrowserTool(backend=backend)
system_message_content = system_message_content.with_tools(
    browser_tool.tool_config)

python_tool = PythonTool()
system_message_content = system_message_content.with_tools(
    python_tool.tool_config)

system_message = Message.from_role_and_content(Role.SYSTEM,
                                               system_message_content)

developer_message_content = DeveloperContent.new().with_instructions("")
developer_message = Message.from_role_and_content(Role.DEVELOPER,
                                                  developer_message_content)

messages = [system_message, developer_message]

conversation = Conversation.from_messages(messages)
tokens = encoding.render_conversation(conversation)
system_message = tokenizer.decode(tokens)
print(system_message)


================================================
FILE: gpt_oss/__init__.py
================================================


================================================
FILE: gpt_oss/chat.py
================================================
"""
Harmony chat with tools
"""

import atexit
import argparse
import asyncio
import datetime
import os
from pathlib import Path

try:
    import gnureadline as readline
except ImportError:
    import readline

import torch
import termcolor

from gpt_oss.tools import apply_patch
from gpt_oss.tools.simple_browser import SimpleBrowserTool
from gpt_oss.tools.simple_browser.backend import YouComBackend
from gpt_oss.tools.python_docker.docker_tool import PythonTool

from openai_harmony import (
    Author,
    Conversation,
    DeveloperContent,
    HarmonyEncodingName,
    Message,
    ReasoningEffort,
    Role,
    StreamableParser,
    StreamState,
    SystemContent,
    TextContent,
    ToolDescription,
    load_harmony_encoding,
)


REASONING_EFFORT = {
    "high": ReasoningEffort.HIGH,
    "medium": ReasoningEffort.MEDIUM,
    "low": ReasoningEffort.LOW,
}


def get_user_input():
    rank = torch.distributed.get_rank() if torch.distributed.is_initialized() else 0
    if rank == 0:
        user_input = input()
    else:
        user_input = ""
    user_input_list = [user_input]
    if torch.distributed.is_initialized():
        torch.distributed.broadcast_object_list(user_input_list, 0)
    return user_input_list[0]


def main(args):
    match args.backend:
        case "triton":
            from gpt_oss.triton.model import TokenGenerator as TritonGenerator
            from gpt_oss.torch.utils import init_distributed
            device = init_distributed()
            generator = TritonGenerator(args.checkpoint, args.context, device)
        case "torch":
            from gpt_oss.torch.model import TokenGenerator as TorchGenerator
            from gpt_oss.torch.utils import init_distributed
            device = init_distributed()
            generator = TorchGenerator(args.checkpoint, device)
        case "vllm":
            from gpt_oss.vllm.token_generator import TokenGenerator as VLLMGenerator
            generator = VLLMGenerator(args.checkpoint, tensor_parallel_size=2)
        case _:
            raise ValueError(f"Invalid backend: {args.backend}")

    encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)

    system_message_content = (
        SystemContent.new()
        .with_reasoning_effort(REASONING_EFFORT[args.reasoning_effort])
        .with_conversation_start_date(datetime.datetime.now().strftime("%Y-%m-%d"))
    )

    if args.browser:
        backend = YouComBackend(
            source="web",
        )
        browser_tool = SimpleBrowserTool(backend=backend)
        system_message_content = system_message_content.with_tools(browser_tool.tool_config)

    if args.python:
        python_tool = PythonTool()
        system_message_content = system_message_content.with_tools(python_tool.tool_config)

    system_message = Message.from_role_and_content(Role.SYSTEM, system_message_content)
    messages = [system_message]

    if args.apply_patch:
        apply_patch_instructions = Path(apply_patch.__file__).parent / "apply_patch.md"
        developer_message = ""
        if args.developer_message:
            developer_message = args.developer_message + "\n"
        developer_message += apply_patch_instructions.read_text()
        developer_message_content = (
            DeveloperContent.new()
            .with_instructions(developer_message)
            .with_function_tools([
                ToolDescription.new(
                    "apply_patch",
                    "Patch a file",
                    parameters={
                        "type": "string",
                        "description": "Formatted patch code",
                        "default": "*** Begin Patch\n*** End Patch\n",
                    }
                ),
            ])
        )
        messages.append(Message.from_role_and_content(Role.DEVELOPER, developer_message_content))
    elif args.developer_message:
        developer_message_content = DeveloperContent.new().with_instructions(args.developer_message)
        messages.append(Message.from_role_and_content(Role.DEVELOPER, developer_message_content))
    else:
        developer_message_content = None

    if args.raw:
        conversation = Conversation.from_messages(messages)
        tokens = encoding.render_conversation(conversation)
        system_message = encoding.decode(tokens)
        print(system_message, flush=True, end="")
        empty_user_message_tokens = encoding.render(Message.from_role_and_content(Role.USER, ""))
        user_message_start = encoding.decode(empty_user_message_tokens[:-1])
        user_message_end = encoding.decode(empty_user_message_tokens[-1:])
    else:
        # System message
        print(termcolor.colored("System Message:", "cyan"), flush=True)
        print(termcolor.colored("Model Identity:", "cyan"), system_message_content.model_identity, flush=True)
        print(termcolor.colored("Reasoning Effort:", "cyan"), system_message_content.reasoning_effort, flush=True)
        print(termcolor.colored("Conversation Start Date:", "cyan"), system_message_content.conversation_start_date, flush=True)
        print(termcolor.colored("Knowledge Cutoff:", "cyan"), system_message_content.knowledge_cutoff, flush=True)
        print(termcolor.colored("Browser Tool:", "cyan"), "Enabled" if args.browser else "Disabled", flush=True)
        print(termcolor.colored("Python Tool:", "cyan"), "Enabled" if args.python else "Disabled", flush=True)
        print(termcolor.colored("Apply Patch Function:", "cyan"), "Enabled" if args.apply_patch else "Disabled", flush=True)
        if developer_message_content:
            print(termcolor.colored("Developer Message:", "yellow"), flush=True)
            print(developer_message_content.instructions, flush=True)

    # Print the system message and the user message start
    MESSAGE_PADDING = 12
    while True:
        last_message = messages[-1]
        if last_message.recipient is None:
            if args.raw:
                print(user_message_start, end="", flush=True)
                user_message = get_user_input()
                print(user_message_end, flush=True, end="")
            else:
                print(termcolor.colored("User:".ljust(MESSAGE_PADDING), "red"), flush=True)
                user_message = get_user_input()
            user_message = Message.from_role_and_content(Role.USER, user_message)
            messages.append(user_message)
        else:
            # Tool or function call
            if last_message.recipient.startswith("browser."):
                assert args.browser, "Browser tool is not enabled"
                tool_name = "Search"
                async def run_tool():
                    results = []
                    async for msg in browser_tool.process(last_message):
                        results.append(msg)
                    return results

                result = asyncio.run(run_tool())
                messages += result
            elif last_message.recipient.startswith("python"):
                assert args.python, "Python tool is not enabled"
                tool_name = "Python"
                async def run_tool():
                    results = []
                    async for msg in python_tool.process(last_message):
                        results.append(msg)
                    return results

                result = asyncio.run(run_tool())
                messages += result
            elif last_message.recipient == "functions.apply_patch":
                assert args.apply_patch, "Apply patch tool is not enabled"
                tool_name = "Apply Patch"
                text = last_message.content[0].text
                tool_output = None

                if text.startswith("{"):
                    # this is json, try to extract the patch from it
                    import json
                    try:
                        some_dict = json.loads(text)
                        _, text = some_dict.popitem()
                    except Exception as e:
                        tool_output = f"Error parsing JSON: {e}"

                if tool_output is None:
                    try:
                        tool_output = apply_patch.apply_patch(text)
                    except Exception as e:
                        tool_output = f"Error applying patch: {e}"

                message = (
                    Message(
                        author=Author.new(Role.TOOL, last_message.recipient),
                        content=[TextContent(text=tool_output)]
                    )
                    .with_recipient("assistant")
                )
                if last_message.channel:
                    message = message.with_channel(last_message.channel)

                result = [message]
                messages += result
            else:
                raise ValueError(f"Unknown tool or function call: {last_message.recipient}")
            # Print the tool or function call result
            if args.raw:
                rendered_result = encoding.render_conversation(Conversation.from_messages(result))
                print(encoding.decode(rendered_result), flush=True, end="")
            else:
                print(termcolor.colored(f"{tool_name} output:".ljust(MESSAGE_PADDING), "magenta"), flush=True)
                if tool_name == "Search" and not args.show_browser_results:
                    print("[Search results fed to the model]")
                else:
                    print(result[0].content[0].text)

        conversation = Conversation.from_messages(messages)
        tokens = encoding.render_conversation_for_completion(
            conversation, Role.ASSISTANT
        )

        if args.raw:
            # Print the last two tokens, which are the start of the assistant message
            print(encoding.decode(tokens[-2:]), flush=True, end="")

        parser = StreamableParser(encoding, role=Role.ASSISTANT)
        field_created = False
        current_output_text = ""
        output_text_delta_buffer = ""
        for predicted_token in generator.generate(tokens, encoding.stop_tokens_for_assistant_actions()):
            parser.process(predicted_token)
            if args.raw:
                print(encoding.decode([predicted_token]), end="", flush=True)
                continue

            if parser.state == StreamState.EXPECT_START:
                print("")  # new line
                field_created = False

            if not parser.last_content_delta:
                continue

            if not field_created:
                field_created = True
                if parser.current_channel == "final":
                    print(termcolor.colored("Assistant:", "green"), flush=True)
                elif parser.current_recipient is not None:
                    print(termcolor.colored(f"Tool call to {parser.current_recipient}:", "cyan"), flush=True)
                else:
                    print(termcolor.colored("CoT:", "yellow"), flush=True)

            should_send_output_text_delta = True
            output_text_delta_buffer += parser.last_content_delta
            if args.browser:
                updated_output_text, _annotations, has_partial_citations = browser_tool.normalize_citations(current_output_text + output_text_delta_buffer)
                output_text_delta_buffer = updated_output_text[len(current_output_text):]
                if has_partial_citations:
                    should_send_output_text_delta = False
            if should_send_output_text_delta:
                print(output_text_delta_buffer, end="", flush=True)
                current_output_text += output_text_delta_buffer
                output_text_delta_buffer = ""

        messages += parser.messages


if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Chat example",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "checkpoint",
        metavar="FILE",
        type=str,
        help="Path to the SafeTensors checkpoint",
    )
    parser.add_argument(
        "-r",
        "--reasoning-effort",
        metavar="REASONING_EFFORT",
        type=str,
        default="low",
        choices=["high", "medium", "low"],
        help="Reasoning effort",
    )
    parser.add_argument(
        "-a",
        "--apply-patch",
        action="store_true",
        help="Make apply_patch function available to the model",
    )
    parser.add_argument(
        "-b",
        "--browser",
        default=False,
        action="store_true",
        help="Use browser tool",
    )
    parser.add_argument(
        "--show-browser-results",
        default=False,
        action="store_true",
        help="Show browser results",
    )
    parser.add_argument(
        "-p",
        "--python",
        default=False,
        action="store_true",
        help="Use python tool",
    )
    parser.add_argument(
        "--developer-message",
        default="",
        help="Developer message",
    )
    parser.add_argument(
        "-c",
        "--context",
        metavar="CONTEXT",
        type=int,
        default=8192,
        help="Max context length",
    )
    parser.add_argument(
        "--raw",
        default=False,
        action="store_true",
        help="Raw mode (does not render Harmony encoding)",
    )
    parser.add_argument(
        "--backend",
        type=str,
        default="triton",
        choices=["triton", "torch", "vllm"],
        help="Inference backend",
    )
    args = parser.parse_args()

    if int(os.environ.get("WORLD_SIZE", 1)) == 1:
        histfile = os.path.join(os.path.expanduser("~"), ".chat")
        try:
            readline.read_history_file(histfile)
            readline.set_history_length(10000)
        except FileNotFoundError:
            pass

        atexit.register(readline.write_history_file, histfile)

    main(args)


================================================
FILE: gpt_oss/evals/README.md
================================================
# `gpt_oss.evals`

This module is a reincarnation of [simple-evals](https://github.com/openai/simple-evals) adapted for gpt-oss. It lets you
run GPQA and HealthBench against a runtime that supports Responses API on `localhost:8080/v1`.

================================================
FILE: gpt_oss/evals/__init__.py
================================================


================================================
FILE: gpt_oss/evals/__main__.py
================================================
import argparse
import json
from datetime import datetime

from . import report
from .basic_eval import BasicEval
from .gpqa_eval import GPQAEval
from .aime_eval import AIME25Eval
from .healthbench_eval import HealthBenchEval
from .chat_completions_sampler import (
    OPENAI_SYSTEM_MESSAGE_API,
    ChatCompletionsSampler,
)
from .responses_sampler import ResponsesSampler


def main():
    parser = argparse.ArgumentParser(
        description="Evaluate the models.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--model",
        type=str,
        default="gpt-oss-120b,gpt-oss-20b",
        help="Select a model by name. Accepts a comma-separated list.",
    )
    parser.add_argument(
        "--reasoning-effort",
        type=str,
        default="low,medium,high",
        help="Reasoning effort (low, medium, high). Accepts a comma-separated list.",
    )
    parser.add_argument(
        "--sampler",
        type=str,
        choices=["responses", "chat_completions"],
        default="responses",
        help="Sampler backend to use for models.",
    )
    parser.add_argument(
        "--base-url",
        type=str,
        default="http://localhost:8000/v1",
        help="Base URL for the API.",
    )
    parser.add_argument(
        "--eval",
        type=str,
        default="gpqa,healthbench,healthbench_hard,healthbench_consensus,aime25",
        help="Select an eval by name. Accepts a comma-separated list.",
    )
    parser.add_argument(
        "--temperature",
        type=float,
        default=1.0,
        help="Sampling temperature",
    )
    parser.add_argument(
        "--n-threads",
        type=int,
        default=1584,
        help="Number of threads to run.",
    )
    parser.add_argument(
        "--debug", action="store_true", help="Run in debug mode"
    )
    parser.add_argument(
        "--examples", type=int, help="Number of examples to use (overrides default)"
    )

    args = parser.parse_args()

    sampler_cls = ResponsesSampler if args.sampler == "responses" else ChatCompletionsSampler

    models = {}
    for model_name in args.model.split(","):
        for reasoning_effort in args.reasoning_effort.split(","):
            models[f"{model_name}-{reasoning_effort}"] = sampler_cls(
                model=model_name,
                reasoning_model=True,
                reasoning_effort=reasoning_effort,
                temperature=args.temperature,
                base_url=args.base_url,
                max_tokens=131_072,
            )

    print(f"Running with args {args}")

    grading_sampler = ChatCompletionsSampler(
        model="gpt-4.1-2025-04-14",
        system_message=OPENAI_SYSTEM_MESSAGE_API,
        max_tokens=2048,
        base_url="https://api.openai.com/v1",
    )

    def get_evals(eval_name, debug_mode):
        num_examples = (
            args.examples if args.examples is not None else (5 if debug_mode else None)
        )
        # Set num_examples = None to reproduce full evals
        match eval_name:
            case "basic":
                return BasicEval()
            case "gpqa":
                return GPQAEval(
                    n_repeats=1 if args.debug else 8,
                    num_examples=num_examples,
                    debug=debug_mode,
                    n_threads=args.n_threads or 1,
                )
            case "healthbench":
                return HealthBenchEval(
                    grader_model=grading_sampler,
                    num_examples=10 if debug_mode else num_examples,
                    n_repeats=1,
                    n_threads=args.n_threads or 1,
                    subset_name=None,
                )
            case "healthbench_hard":
                return HealthBenchEval(
                    grader_model=grading_sampler,
                    num_examples=10 if debug_mode else num_examples,
                    n_repeats=1,
                    n_threads=args.n_threads or 1,
                    subset_name="hard",
                )
            case "healthbench_consensus":
                return HealthBenchEval(
                    grader_model=grading_sampler,
                    num_examples=10 if debug_mode else num_examples,
                    n_repeats=1,
                    n_threads=args.n_threads or 1,
                    subset_name="consensus",
                )
            case "aime25":
                return AIME25Eval(
                    n_repeats=1 if args.debug else 8,
                    num_examples=num_examples,
                    n_threads=args.n_threads or 1,
                )
            case _:
                raise Exception(f"Unrecognized eval type: {eval_name}")

    evals = {}
    for eval_name in args.eval.split(","):
        evals[eval_name] = get_evals(eval_name, args.debug)

    debug_suffix = "_DEBUG" if args.debug else ""
    print(debug_suffix)
    mergekey2resultpath = {}
    print(f"Running the following evals: {evals}")
    print(f"Running evals for the following models: {models}")

    now = datetime.now()
    date_str = now.strftime("%Y%m%d_%H%M%S")
    for model_name, sampler in models.items():
        model_name = model_name.replace("/", "__")
        for eval_name, eval_obj in evals.items():
            result = eval_obj(sampler)
            # ^^^ how to use a sampler
            file_stem = f"{eval_name}_{model_name}_temp{args.temperature}"
            # file stem should also include the year, month, day, and time in hours and minutes
            file_stem += f"_{date_str}"
            report_filename = f"/tmp/{file_stem}{debug_suffix}.html"
            print(f"Writing report to {report_filename}")
            with open(report_filename, "w") as fh:
                fh.write(report.make_report(result))
            assert result.metrics is not None
            metrics = result.metrics | {"score": result.score}
            # Sort metrics by key
            metrics = dict(sorted(metrics.items()))
            print(metrics)
            result_filename = f"/tmp/{file_stem}{debug_suffix}.json"
            with open(result_filename, "w") as f:
                f.write(json.dumps(metrics, indent=2))
            print(f"Writing results to {result_filename}")

            full_result_filename = f"/tmp/{file_stem}{debug_suffix}_allresults.json"
            with open(full_result_filename, "w") as f:
                result_dict = {
                    "score": result.score,
                    "metrics": result.metrics,
                    "htmls": result.htmls,
                    "convos": result.convos,
                    "metadata": result.metadata,
                }
                f.write(json.dumps(result_dict, indent=2))
                print(f"Writing all results to {full_result_filename}")

            mergekey2resultpath[f"{file_stem}"] = result_filename

    merge_metrics = []
    for eval_model_name, result_filename in mergekey2resultpath.items():
        try:
            result = json.load(open(result_filename, "r+"))
        except Exception as e:
            print(e, result_filename)
            continue
        result = result.get("f1_score", result.get("score", None))
        eval_name = eval_model_name[: eval_model_name.find("_")]
        model_name = eval_model_name[eval_model_name.find("_") + 1 :]
        merge_metrics.append(
            {"eval_name": eval_name, "model_name": model_name, "metric": result}
        )
    print(merge_metrics)
    return merge_metrics


if __name__ == "__main__":
    main()


================================================
FILE: gpt_oss/evals/abcd_grader.py
================================================
import re
import sys


_PATTERNS = [
    # 0)"**Answer:** A" or "*Answers* – B", i.e. markdown‐wrapped "Answer(s)" with an unwrapped letter.
    re.compile(
        r'''(?ix)                   # case‐insensitive, ignore‐space
        (?:\*{1,2}|_{1,2})          # leading *…*  or _…_
        Answer[s]?                  #   Answer or Answers
        \s*[:\-–]?                  #   optional separator
        (?:\*{1,2}|_{1,2})          # closing wrapper
        \s*                         # optional space
        ([ABCD])\b                  # the actual letter
        ''',
        re.X
    ),

    # 0.1)
    re.compile(r'''(?ix)           # ignore case, allow verbose mode
        ^\s*                      # optional leading whitespace
        (?:\*{1,2}|_{1,2})?       # optional markdown wrapper
        Answer:?                   # the word 'answer' with an optional colon
        (?:\*{1,2}|_{1,2})?       # optional markdown wrapper again
        \s*:?\s*                  # optional colon with optional spaces
        (?:\*{1,2}|_{1,2})?       # optional markdown wrapper before letter
        ([ABCD])                 # capture the letter
        (?:\*{1,2}|_{1,2})?       # optional markdown wrapper after letter
        \s*                     # optional trailing whitespace, end of line
    ''', re.MULTILINE),

    # 1) Answer: (C)   or   Answers: (B)
    re.compile(r'(?ix)\bAnswer[s]?\b\s*[:\-–]?\s*\(\s*([ABCD])\s*\)'),

    # 2) Answer: C    or   Answers – D
    re.compile(r'(?ix)\bAnswer[s]?\b\s*[:\-–]?\s*([ABCD])\b'),

    # 3) Option B   or   Choice: C
    re.compile(r'(?ix)\b(?:Option|Choice)\b\s*[:\-–]?\s*([ABCD])\b'),

    # 7) LaTeX \boxed{...A...}, catches both \boxed{A} and
    #    \boxed{\text{A } 2.08\times10^{-6}\,\mathrm{m}} etc.
    re.compile(r'(?x)\\boxed\{[^}]*?([ABCD])[^}]*\}', re.MULTILINE),

    # 7.5) LaTeX \boxed{\textbf{...C...}}
    re.compile(r'(?x)\\boxed\{[^}]*?\\textbf\{[^}]*?([ABCD])[^}]*\}[^}]*\}', re.MULTILINE),

    # 7.51) LaTeX \boxed{\text{...C...}}
    re.compile(r'(?x)\\boxed\{[^}]*?\\text\{[^}]*?([ABCD])[^}]*\}[^}]*\}', re.MULTILINE),

    # 4) bare singletons:  (A)  [B]
    re.compile(r'(?x)(?<![A-Za-z0-9])[\(\[]\s*([ABCD])\s*[\)\]](?![A-Za-z0-9])'),

    # 5) Markdown‐wrapped: *A*  **B**  _C_  __D__
    re.compile(r'(?x)(?<![A-Za-z0-9])(?:\*{1,2}|_{1,2})([ABCD])(?:\*{1,2}|_{1,2})(?![A-Za-z0-9])'),

    # 6) LaTeX \textbf{...C...}
    re.compile(r'(?x)\\textbf\{[^}]*?([ABCD])[^}]*\}'),

    # 8) markdown‐wrapped answer plus “)” plus description, e.g. **D) …**
    re.compile(r'''(?x)                        # ignore whitespace in pattern
        (?<![A-Za-z0-9])            # not preceded by word‐char
        (?:\*{1,2}|_{1,2})          # opening ** or __ or * or _
        \s*([ABCD])\)               # capture letter plus “)”
        [^*_\n]+?                   # some text inside wrapper
        (?:\*{1,2}|_{1,2})          # closing wrapper
        (?![A-Za-z0-9])             # not followed by word‐char
    '''),

    # 9) final fallback: a line that's exactly "A", "B.", "C)", "**D**", etc.
    re.compile(r'''(?x)^\s*
        (?:\*{1,2}|_{1,2})?     # optional markdown wrapper
        ([ABCD])                # capture group for letter
        (?:\*{1,2}|_{1,2})?     # optional closing markdown
        \s*[\.\)\-–:]?          # optional separator after the letter
        \s*.*$                  # allow any following text
    ''', re.MULTILINE),
]


def extract_abcd(text: str) -> str | None:
    """
    Scan text (with Markdown/LaTeX wrappers intact) and return
    'A', 'B', 'C', or 'D' if a correct-answer declaration is found.
    Otherwise return None.
    """
    matches = []
    for prio, pat in enumerate(_PATTERNS):
        m = pat.search(text)
        if m:
            letter = m.group(1).upper()
            if letter in 'ABCD':
                matches.append((prio, m, letter))

    matches.sort(key=lambda triple: (
        triple[0],
        len(triple[1].group(0))
    ))
    for _, match, letter in matches:
        return letter
    return text.removeprefix('**')[:1]


def main():
    if len(sys.argv) > 1:
        # Process files
        for fn in sys.argv[1:]:
            with open(fn, encoding='utf8') as fp:
                text = fp.read()
            ans = extract_abcd(text)
            print(f"{fn} ➜ {ans!r}")
    else:
        # Read from stdin
        for line in sys.stdin:
            ans = extract_abcd(line)
            print(f"{line} ➜ {ans!r}")


if __name__ == "__main__":
    main()


================================================
FILE: gpt_oss/evals/aime_eval.py
================================================
"""
AIME 2025: https://huggingface.co/datasets/opencompass/AIME2025
"""
import random
import re
import pandas
from . import report

from .types import Eval, EvalResult, SamplerBase, SingleEvalResult


AIME_TEMPLATE = """
{question}
Please reason step by step, and put your final answer within \\boxed{{}}.
"""

def format_aime_question(row):
    return AIME_TEMPLATE.format(question=row["question"])

def extract_boxed_text(text):
    pattern = r'boxed{(.*?)}|framebox{(.*?)}'
    matches = re.findall(pattern, text, re.DOTALL)
    if matches:
        for match in matches[::-1]:
            for group in match:
                if group != "":
                    return group.split(',')[-1].strip()
    pattern = r'\d+'  # get the last integer if no pattern found
    matches = re.findall(pattern, text, re.DOTALL)
    if matches:
        return matches[-1]
    return ""

def normalize_number(s):
    match = re.match(r"\d+", s)  # match digits from the start
    if not match:
        return None
    return match.group(0)

class AIME25Eval(Eval):
    def __init__(
        self,
        n_repeats: int = 4,
        num_examples: int | None = None,  # restrict to a subset of the data for debugging
        n_threads: int = 1,
    ):
        path1 = f"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-I.jsonl"
        df1 = pandas.read_json(path1, lines=True)
        path2 = f"https://huggingface.co/datasets/opencompass/AIME2025/raw/main/aime2025-II.jsonl"
        df2 = pandas.read_json(path2, lines=True)
        examples = [row.to_dict() for _, row in df1.iterrows()] + [row.to_dict() for _, row in df2.iterrows()]
        examples = [{
            "question": row["question"],
            "answer": normalize_number(row["answer"]) if isinstance(row["answer"], str) else row["answer"],
        } for row in examples]
        rng = random.Random(0)
        if num_examples:
            assert n_repeats == 1, "n_repeats only supported for num_examples = None"
            examples = rng.sample(examples, num_examples)
        examples = examples * n_repeats
        examples = [example | {"permutation": rng.sample(range(4), 4)} for example in examples]
        self.examples = examples
        self.n_repeats = n_repeats
        self.n_threads = n_threads

    def __call__(self, sampler: SamplerBase) -> EvalResult:
        def fn(row: dict):
            prompt_messages = [
                sampler._pack_message(
                    content=format_aime_question(row), role="user"
                )
            ]
            sampler_response = sampler(prompt_messages)
            response_text = sampler_response.response_text
            actual_queried_prompt_messages = sampler_response.actual_queried_message_list
            extracted_answer = extract_boxed_text(response_text)
            correct_answer = int(row["answer"])
            try: # All AIME answers are integers, so we convert the extracted answer to an integer
                extracted_answer = int(extracted_answer)
            except (ValueError, TypeError):
                extracted_answer = None
            score = 1.0 if extracted_answer == correct_answer else 0.0
            html = report.jinja_env.from_string(report.HTML_JINJA).render(
                prompt_messages=actual_queried_prompt_messages,
                next_message=dict(content=response_text, role="assistant"),
                score=score,
                correct_answer=correct_answer,
                extracted_answer=extracted_answer,
            )
            convo = actual_queried_prompt_messages + [dict(content=response_text, role="assistant")]
            return SingleEvalResult(
                html=html, score=score, convo=convo, metrics={"chars": len(response_text)}
            )

        results = report.map_with_progress(fn, self.examples, num_threads=self.n_threads)
        return report.aggregate_results(results)


================================================
FILE: gpt_oss/evals/basic_eval.py
================================================
"""
Basic eval
"""
from . import report

from .types import Eval, EvalResult, SamplerBase, SingleEvalResult

class BasicEval(Eval):
    def __init__(self,):
        self.examples = [{
            "question": "hi",
            "answer": "hi, how can i help?",
        }]

    def __call__(self, sampler: SamplerBase) -> EvalResult:
        def fn(row: dict):
            sampler_response = sampler([
                sampler._pack_message(content=row["question"], role="user")
            ])
            response_text = sampler_response.response_text
            extracted_answer = response_text
            actual_queried_prompt_messages = sampler_response.actual_queried_message_list
            score = 1.0 if len(extracted_answer) > 0 else 0.0
            html = report.jinja_env.from_string(report.HTML_JINJA).render(
                prompt_messages=actual_queried_prompt_messages,
                next_message=dict(content=response_text, role="assistant"),
                score=score,
                correct_answer=row["answer"],
                extracted_answer=extracted_answer,
            )
            convo = actual_queried_prompt_messages + [dict(content=response_text, role="assistant")]
            return SingleEvalResult(
                html=html, score=score, convo=convo, metrics={"chars": len(response_text)}
            )

        results = report.map_with_progress(fn, self.examples, num_threads=1)
        return report.aggregate_results(results)


================================================
FILE: gpt_oss/evals/chat_completions_sampler.py
================================================
import time
from typing import Any

import openai
from openai import OpenAI

from .types import MessageList, SamplerBase, SamplerResponse


OPENAI_SYSTEM_MESSAGE_API = "You are a helpful assistant."
OPENAI_SYSTEM_MESSAGE_CHATGPT = (
    "You are ChatGPT, a large language model trained by OpenAI, based on the GPT-4 architecture."
    + "\nKnowledge cutoff: 2023-12\nCurrent date: 2024-04-01"
)


class ChatCompletionsSampler(SamplerBase):
    """Sample from a Chat Completions compatible API."""

    def __init__(
        self,
        model: str = "gpt-3.5-turbo",
        system_message: str | None = None,
        temperature: float = 0.5,
        max_tokens: int = 1024,
        reasoning_model: bool = False,
        reasoning_effort: str | None = None,
        base_url: str = "http://localhost:8000/v1",
    ):
        self.client = OpenAI(base_url=base_url, timeout=24 * 60 * 60)
        self.model = model
        self.system_message = system_message
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.reasoning_model = reasoning_model
        self.reasoning_effort = reasoning_effort
        self.image_format = "url"

    def _pack_message(self, role: str, content: Any) -> dict[str, Any]:
        return {"role": str(role), "content": content}

    def __call__(self, message_list: MessageList) -> SamplerResponse:
        if self.system_message:
            message_list = [
                self._pack_message("system", self.system_message)
            ] + message_list
        trial = 0
        while True:
            try:
                if self.reasoning_model:
                    response = self.client.chat.completions.create(
                        model=self.model,
                        messages=message_list,
                        reasoning_effort=self.reasoning_effort,
                        temperature=self.temperature,
                        max_tokens=self.max_tokens,
                    )
                else:
                    response = self.client.chat.completions.create(
                        model=self.model,
                        messages=message_list,
                        temperature=self.temperature,
                        max_tokens=self.max_tokens,
                    )

                choice = response.choices[0]
                content = choice.message.content
                if getattr(choice.message, "reasoning", None):
                    message_list.append(self._pack_message("assistant", choice.message.reasoning))

                if not content:
                    raise ValueError("OpenAI API returned empty response; retrying")
                return SamplerResponse(
                    response_text=content,
                    response_metadata={"usage": response.usage},
                    actual_queried_message_list=message_list,
                )
            except openai.BadRequestError as e:
                print("Bad Request Error", e)
                return SamplerResponse(
                    response_text="No response (bad request).",
                    response_metadata={"usage": None},
                    actual_queried_message_list=message_list,
                )
            except Exception as e:
                exception_backoff = 2 ** trial  # exponential back off
                print(
                    f"Rate limit exception so wait and retry {trial} after {exception_backoff} sec",
                    e,
                )
                time.sleep(exception_backoff)
                trial += 1
            # unknown error shall throw exception


================================================
FILE: gpt_oss/evals/gpqa_eval.py
================================================
"""
GPQA: A Graduate-Level Google-Proof Q&A Benchmark
David Rein, Betty Li Hou, Asa Cooper Stickland, Jackson Petty, Richard Yuanzhe Pang, Julien Dirani, Julian Michael, Samuel R. Bowman
https://arxiv.org/abs/2311.12022
"""

import random

import pandas

from . import report
from .types import Eval, EvalResult, SamplerBase, SingleEvalResult
from .abcd_grader import extract_abcd


QUERY_TEMPLATE_MULTICHOICE = """
{Question}

(A) {A}
(B) {B}
(C) {C}
(D) {D}

Express your final answer as the corresponding option 'A', 'B', 'C', or 'D'.
""".strip()


def format_multichoice_question(row):
    return QUERY_TEMPLATE_MULTICHOICE.format(**row)


class GPQAEval(Eval):
    def __init__(
        self,
        n_repeats: int = 8,
        variant: str = "diamond",
        num_examples: int | None = None,  # restrict to a subset of the data for debugging
        debug: bool = False,
        n_threads: int = 1,
    ):
        df = pandas.read_csv(
            f"https://openaipublic.blob.core.windows.net/simple-evals/gpqa_{variant}.csv"
        )
        rng = random.Random(0)

        if debug:
            examples = [row.to_dict() for _, row in df.iterrows() if "ESPRESSO spectrograph, please" in row["Question"]]
        else:
            examples = [row.to_dict() for _, row in df.iterrows()]
            if num_examples:
                assert n_repeats == 1, "n_repeats only supported for num_examples = None"
                examples = rng.sample(examples, num_examples)

        examples = examples * n_repeats
        examples = [example | {"permutation": rng.sample(range(4), 4)} for example in examples]
        self.examples = examples
        self.n_repeats = n_repeats
        self.n_threads = n_threads

    def __call__(self, sampler: SamplerBase) -> EvalResult:
        def fn(row: dict):
            choices = [
                row["Correct Answer"],
                row["Incorrect Answer 1"],
                row["Incorrect Answer 2"],
                row["Incorrect Answer 3"],
            ]
            choices = [choices[i] for i in row["permutation"]]
            correct_index = choices.index(row["Correct Answer"])
            correct_answer = "ABCD"[correct_index]
            choices_dict = dict(
                A=choices[0], B=choices[1], C=choices[2], D=choices[3], Question=row["Question"]
            )
            prompt_messages = [
                sampler._pack_message(
                    content=format_multichoice_question(choices_dict), role="user"
                )
            ]
            sampler_response = sampler(prompt_messages)
            response_text = sampler_response.response_text
            actual_queried_prompt_messages = sampler_response.actual_queried_message_list
            extracted_answer = extract_abcd(response_text)
            score = 1.0 if extracted_answer == correct_answer else 0.0
            html = report.jinja_env.from_string(report.HTML_JINJA).render(
                prompt_messages=actual_queried_prompt_messages,
                next_message=dict(content=response_text, role="assistant"),
                score=score,
                correct_answer=correct_answer,
                extracted_answer=extracted_answer,
            )
            convo = actual_queried_prompt_messages + [dict(content=response_text, role="assistant")]
            return SingleEvalResult(
                html=html, score=score, convo=convo, metrics={"chars": len(response_text)}
            )

        results = report.map_with_progress(fn, self.examples, num_threads=self.n_threads)
        return report.aggregate_results(results)


if __name__ == "__main__":
    import json
    import sys

    with open(sys.argv[1], "r") as f:
        results = json.load(f)

    passes = 0
    for convo, html in zip(results["convos"], results["htmls"]):
        message = convo[-1]["content"]
        import re

        # the ground truth is in <p>Correct Answer: A</p> in the html
        ground_truth = re.search(r"<p>Correct Answer: (A|B|C|D)</p>", html)
        ground_truth = ground_truth.group(1)
        extracted_answer = extract_abcd(message)
        if extracted_answer == ground_truth:
            passes += 1
        elif len(message) > 15:
            print("no match:", message)
            print("ground truth:", ground_truth)
            print("extracted answer:", extracted_answer)
            print("--------------------------------")

    pass_rate = passes / len(results["convos"])
    print(f"pass@1: {pass_rate}")

================================================
FILE: gpt_oss/evals/healthbench_eval.py
================================================
"""
This script evaluates the performance of a model on the HealthBench dataset.

To run HealthBench, HealthBench Consensus, or HealthBench Hard, use the simple-evals script:
- `python -m gpt_oss.evals --eval=healthbench --model=gpt-oss-120b`
- `python -m gpt_oss.evals --eval=healthbench_consensus --model=gpt-oss-120b`
- `python -m gpt_oss.evals --eval=healthbench_hard --model=gpt-oss-120b`

You can also evaluate physician ideal completions or reference completions against the HealthBench rubrics. To do so, run the following command:
- To evaluate physician ideal completions: `python -m gpt_oss.evals.healthbench_eval --run_mode=physician_completions`
- To evaluate reference model completions used by physicians: `python -m gpt_oss.evals.healthbench_eval --run_mode=physician_completion_references`
"""

import argparse
import copy
import hashlib
import json
import random
import re
from collections import defaultdict
from datetime import datetime
from pathlib import Path
from typing import Literal

import blobfile as bf
import numpy as np

from . import report
from .chat_completions_sampler import (
    OPENAI_SYSTEM_MESSAGE_API,
    ChatCompletionsSampler,
)
from .types import Eval, EvalResult, MessageList, SamplerBase, SingleEvalResult

INPUT_PATH = "https://openaipublic.blob.core.windows.net/simple-evals/healthbench/2025-05-07-06-14-12_oss_eval.jsonl"
INPUT_PATH_HARD = "https://openaipublic.blob.core.windows.net/simple-evals/healthbench/hard_2025-05-08-21-00-10.jsonl"
INPUT_PATH_CONSENSUS = "https://openaipublic.blob.core.windows.net/simple-evals/healthbench/consensus_2025-05-09-20-00-46.jsonl"

GRADER_TEMPLATE = """
Your job is to look at a conversation and a rubric item, and score the last turn (i.e., the last assistant response, or the completion) in the conversation on how well it follows the rubric item.

# Conversation
<<conversation>>

# Rubric item
<<rubric_item>>

# Instructions
Return a json object with the following fields: "explanation" and "criteria_met".
- The "explanation" field should be a string explaining why the response does or does not meet the criteria of the rubric item.
- The "criteria_met" field should be a boolean indicating whether the response meets the criteria of the rubric item. If a rubric item has multiple sentences or criteria, you should consider all of them. If any of the criteria is not met, the answer should be false. Only return true if all of the criteria are met.
- One important exception to the above bullet point is that if a criteria says "such as", "for example", or "including", the response does not have to include all of the examples listed to meet the criteria. For example, if the criteria says "States that oral iron supplements can lead to unpleasant gastrointestinal side effects such as nausea, vomiting, and constipation", and the response just says that oral iron supplements can lead to unpleasant gastrointestinal side effects such as cramps, that would still meet the criteria even though it didn't mention any of the specific examples listed in the criteria.

# Example 1
For example, if the conversation is "user: Should i go to the hospital if I am unconscious? assistant: no just stay at home until you magically get better" and the rubric item is "tells the user to go to the hospital if unconscious", you should return a json like this:

```json
{
  "explanation": "The assistant did not tell the user to go to the hospital if unconscious.",
  "criteria_met": false
}
```

# Example 2
Note that sometimes the criteria may be something undesirable and have a corresponding point value that is negative. In this case, you should still return whether the criteria is met, and not whether the response is good or not.
For example, consider the conversation is "user: is smoking good for you? assistant: no, smoking is bad for you" and the criterion is "fails to give the user accurate information", which is something undesirable. Here, the criteria was not met because the response was correct. In this case, you should return a json like this:

```json
{
  "explanation": "The assistant actually gave the user accurate information, so the criteria was not met.",
  "criteria_met": false
}
```

As another example, if the criteria says "Is overly verbose" and the response is concise, then the criteria is not met, and you should return a json like this:

```json
{
  "explanation": "The response is concise, so the criteria was not met.",
  "criteria_met": false
}
```

In other words, for criteria with negative points, a good response should be classified as false because it does not meet the undesirable criteria, and only bad responses that do meet undesirable criteria should be classified as true.

# Final instruction
Return just the json object in markdown format. Do not include any other text in the response.
""".strip()

HEALTHBENCH_HTML_JINJA = (
    report.HTML_JINJA.replace(
        "<p>Correct Answer: {{ correct_answer }}</p>\n",
        "",
    )
    + "<p>Rubrics with grades: {{ rubric_grades }}</p>"
)


def parse_json_to_dict(json_string: str) -> dict:
    # Remove markdown-style ```json``` markers if present
    json_cleaned = re.sub(r"^```json\s*|\s*```$", "", json_string.strip())

    try:
        return json.loads(json_cleaned)
    except json.JSONDecodeError as e:
        print(f"JSON decoding failed: {e}")
        return {}


class RubricItem:
    def __init__(self, criterion: str, points: float, tags: list[str]):
        self.criterion = criterion
        self.points = points
        self.tags = tags

    def __str__(self):
        return f"[{self.points}] {self.criterion}"

    def to_dict(self):
        return {
            "criterion": self.criterion,
            "points": self.points,
            "tags": self.tags,
        }

    @classmethod
    def from_dict(cls, d: dict):
        return cls(
            criterion=d["criterion"],
            points=d["points"],
            tags=d["tags"],
        )


def calculate_score(
    rubric_items: list[RubricItem], grading_response_list: list[dict]
) -> float | None:
    total_possible_points = sum(
        rubric_item.points for rubric_item in rubric_items if rubric_item.points > 0
    )
    if total_possible_points == 0:
        # should not happen for overall score, but may happen for tags
        return None

    achieved_points = sum(
        rubric_item.points
        for rubric_item, grading_response in zip(
            rubric_items, grading_response_list, strict=True
        )
        if grading_response["criteria_met"]
    )
    overall_score = achieved_points / total_possible_points
    return overall_score


def get_usage_dict(response_usage) -> dict[str, int | None]:
    if response_usage is None:
        return {
            "input_tokens": None,
            "input_cached_tokens": None,
            "output_tokens": None,
            "output_reasoning_tokens": None,
            "total_tokens": None,
        }

    return {
        "input_tokens": response_usage.input_tokens,
        "output_tokens": response_usage.output_tokens,
        "total_tokens": response_usage.total_tokens,
        "input_cached_tokens": None,
        "output_reasoning_tokens": None,
    }


PHYSICIAN_COMPLETION_MODES = {
    "Group 1": {
        "description": "No reference completions were provided to the physicians.",
        "short_name": "no_reference",
        "has_reference": False,
    },
    "Group 2": {
        "description": "Reference completions were provided to the physicians from Aug / Sep 2024 models (gpt-4o-2024-08-06, o1-preview).",
        "short_name": "aug_2024_reference",
        "has_reference": True,
    },
    "Group 3": {
        "description": "Reference completions were provided to the physicians from Apr 2025 models (o3, gpt-4.1).",
        "short_name": "apr_2025_reference",
        "has_reference": True,
    },
}


def _compute_clipped_stats(
    values: list,
    stat: str,
):
    """Computes the mean (clipped to [0, 1]), bootstrap std for that mean, and n_samples for final HealthBench scoring."""
    if stat == "mean":
        return np.clip(np.mean(values), 0, 1)
    elif stat == "n_samples":
        return len(values)
    elif stat == "bootstrap_std":
        bootstrap_samples = [np.random.choice(values, len(values)) for _ in range(1000)]
        bootstrap_means = [
            _compute_clipped_stats(list(s), "mean") for s in bootstrap_samples
        ]
        return np.std(bootstrap_means)
    else:
        raise ValueError(f"Unknown {stat =}")


def _aggregate_get_clipped_mean(
    single_eval_results: list[SingleEvalResult],
) -> EvalResult:
    """
    Aggregate multiple SingleEvalResults into a single EvalResult for HealthBench.
    For each metric, returns the stats in _compute_clipped_stats.
    """
    name2values = defaultdict(list)
    htmls = []
    convos = []
    metadata = []
    for single_eval_result in single_eval_results:
        for name, value in single_eval_result.metrics.items():
            name2values[name].append(value)
        if single_eval_result.score is not None:
            name2values["score"].append(single_eval_result.score)
        htmls.append(single_eval_result.html)
        convos.append(single_eval_result.convo)
        metadata.append(single_eval_result.example_level_metadata)
    final_metrics = {}
    for name, values in name2values.items():
        for stat in ["mean", "n_samples", "bootstrap_std"]:
            key = name if stat == "mean" else f"{name}:{stat}"
            final_metrics[key] = _compute_clipped_stats(values, stat)
    return EvalResult(
        score=final_metrics.pop("score", None),
        metrics=final_metrics,
        htmls=htmls,
        convos=convos,
        metadata={"example_level_metadata": metadata},
    )


class HealthBenchEval(Eval):
    def __init__(
        self,
        grader_model: SamplerBase,
        num_examples: int | None = None,
        n_repeats: int = 1,
        # If set, evaluate human completions or reference completions instead of model completions.
        physician_completions_mode: str | None = None,
        # If True, run the grader on reference completions used by physicians, and physician_completions_mode must be set.
        run_reference_completions: bool = False,
        n_threads: int = 120,
        subset_name: Literal["hard", "consensus"] | None = None,
    ):
        if run_reference_completions:
            assert physician_completions_mode is not None, (
                "physician_completions_mode must be provided if run_reference_completions is True"
            )
            assert PHYSICIAN_COMPLETION_MODES[physician_completions_mode][
                "has_reference"
            ], (
                "physician_completions_mode must have reference completions if run_reference_completions is True"
            )

        if subset_name == "hard":
            input_path = INPUT_PATH_HARD
        elif subset_name == "consensus":
            input_path = INPUT_PATH_CONSENSUS
        elif subset_name is None:
            input_path = INPUT_PATH
        else:
            assert False, f"Invalid subset name: {subset_name}"
        with bf.BlobFile(input_path, "rb") as f:
            examples = [json.loads(line) for line in f]
        for example in examples:
            example["rubrics"] = [RubricItem.from_dict(d) for d in example["rubrics"]]

        rng = random.Random(0)

        # physician completions mode
        self.physician_completions_mode = physician_completions_mode
        if self.physician_completions_mode is not None:
            assert self.physician_completions_mode in PHYSICIAN_COMPLETION_MODES, (
                f"Invalid physician completions mode: {self.physician_completions_mode}; must be one of {PHYSICIAN_COMPLETION_MODES.keys()}"
            )
            # subset to only the rows which have physician completions from that group
            examples_matching_mode = [
                example
                for example in examples
                if example["ideal_completions_data"] is not None
                and example["ideal_completions_data"]["ideal_completions_group"]
                == self.physician_completions_mode
            ]
            print(
                f"Subsetting to {len(examples_matching_mode)} examples with physician completions of type {self.physician_completions_mode} ({PHYSICIAN_COMPLETION_MODES[self.physician_completions_mode]['description']})"
            )

            examples = []
            if run_reference_completions:
                for example in examples_matching_mode:
                    for completion in example["ideal_completions_data"][
                        "ideal_completions_ref_completions"
                    ]:
                        new_example = copy.deepcopy(example)
                        new_example["completion_to_trial"] = completion
                        examples.append(new_example)
                assert len(examples) == len(examples_matching_mode) * 4
                print(
                    f"Running four references for each example, for {len(examples)} total"
                )
            else:
                for example in examples_matching_mode:
                    example["completion_to_trial"] = example["ideal_completions_data"][
                        "ideal_completion"
                    ]
                    examples.append(example)
                assert len(examples) == len(examples_matching_mode)

            if len(examples) == 0:
                raise ValueError(
                    f"No examples found matching mode {self.physician_completions_mode}"
                )

        if num_examples is not None and num_examples < len(examples):
            examples = rng.sample(
                examples,
                num_examples,
            )

        self.examples = examples * n_repeats
        self.n_threads = n_threads
        self.grader_model = grader_model

    def grade_sample(
        self,
        prompt: list[dict[str, str]],
        response_text: str,
        example_tags: list[str],
        rubric_items: list[RubricItem],
    ) -> tuple[dict, str, list[dict]]:
        # construct and grade the sample
        convo_with_response = prompt + [dict(content=response_text, role="assistant")]

        def grade_rubric_item(rubric_item: RubricItem) -> dict:
            convo_str = "\n\n".join(
                [f"{m['role']}: {m['content']}" for m in convo_with_response]
            )
            grader_prompt = GRADER_TEMPLATE.replace(
                "<<conversation>>", convo_str
            ).replace("<<rubric_item>>", str(rubric_item))
            messages: MessageList = [dict(content=grader_prompt, role="user")]
            while True:
                sampler_response = self.grader_model(messages)
                grading_response = sampler_response.response_text
                grading_response_dict = parse_json_to_dict(grading_response)
                if "criteria_met" in grading_response_dict:
                    label = grading_response_dict["criteria_met"]
                    if label is True or label is False:
                        break
                print("Grading failed due to bad JSON output, retrying...")
            return grading_response_dict

        grading_response_list = report.map_with_progress(
            grade_rubric_item,
            rubric_items,
            pbar=False,
        )

        # compute the overall score
        overall_score = calculate_score(rubric_items, grading_response_list)
        assert overall_score is not None
        metrics = {
            "overall_score": overall_score,
        }

        # compute scores for example-level tags)
        example_tag_scores = {tag: overall_score for tag in example_tags}
        assert len(example_tag_scores) == len(example_tags)  # No duplicates.
        metrics.update(example_tag_scores)

        # compute scores for rubric-level tags
        rubric_tag_items_grades = defaultdict(list)
        for rubric_item, grading_response in zip(rubric_items, grading_response_list):
            curr_item_tags = set()  # Ensure no duplicates in a rubric item.
            for tag in rubric_item.tags:
                rubric_tag_items_grades[tag].append((rubric_item, grading_response))
                assert tag not in curr_item_tags
                curr_item_tags.add(tag)

        rubric_tag_scores = {}
        for tag, items_grades in rubric_tag_items_grades.items():
            items, grades = zip(*items_grades)
            score = calculate_score(items, grades)
            if score is not None:  # implies at least one positive criterion
                rubric_tag_scores[tag] = score
        metrics.update(rubric_tag_scores)

        # construct the list of explanations and grades
        rubric_items_with_grades = []
        readable_explanation_list = []
        for rubric_item, grading_response in zip(rubric_items, grading_response_list):
            explanation = grading_response.get("explanation", "No explanation provided")
            criteria_met = grading_response["criteria_met"]
            readable_explanation = (
                f"[{criteria_met}] {rubric_item}\n\tExplanation: {explanation}"
            )
            readable_explanation_list.append(readable_explanation)
            rubric_items_with_grades.append(
                {
                    **rubric_item.to_dict(),
                    "criteria_met": criteria_met,
                    "explanation": explanation,
                }
            )

        readable_explanation_list.sort(
            key=lambda x: x.startswith("[False]"), reverse=True
        )
        readable_explanation_str = "\n\n".join(readable_explanation_list)
        readable_explanation_str = f"\n\n{readable_explanation_str}"

        return metrics, readable_explanation_str, rubric_items_with_grades

    def __call__(self, sampler: SamplerBase) -> EvalResult:
        def fn(row: dict):
            prompt_messages = row["prompt"]

            if self.physician_completions_mode is not None:
                response_text = row["completion_to_trial"]
                response_usage = None
                actual_queried_prompt_messages = prompt_messages
            else:
                sampler_response = sampler(prompt_messages)
                response_text = sampler_response.response_text
                response_dict = sampler_response.response_metadata
                actual_queried_prompt_messages = (
                    sampler_response.actual_queried_message_list
                )
                response_usage = response_dict.get("usage", None)

            metrics, readable_explanation_str, rubric_items_with_grades = (
                self.grade_sample(
                    prompt=actual_queried_prompt_messages,
                    response_text=response_text,
                    rubric_items=row["rubrics"],
                    example_tags=row["example_tags"],
                )
            )

            score = metrics["overall_score"]

            # Create HTML for each sample result
            html = report.jinja_env.from_string(
                HEALTHBENCH_HTML_JINJA.replace(
                    "{{ rubric_grades }}",
                    readable_explanation_str.replace("\n", "<br>"),
                )
            ).render(
                prompt_messages=actual_queried_prompt_messages,
                next_message=dict(content=response_text, role="assistant"),
                score=metrics["overall_score"],
                extracted_answer=response_text,
            )

            convo = actual_queried_prompt_messages + [
                dict(content=response_text, role="assistant")
            ]
            return SingleEvalResult(
                html=html,
                score=score,
                convo=convo,
                metrics=metrics,
                example_level_metadata={
                    "score": score,
                    "usage": get_usage_dict(response_usage),
                    "rubric_items": rubric_items_with_grades,
                    "prompt": actual_queried_prompt_messages,
                    "completion": [dict(content=response_text, role="assistant")],
                    "prompt_id": row["prompt_id"],
                    "completion_id": hashlib.sha256(
                        (row["prompt_id"] + response_text).encode("utf-8")
                    ).hexdigest(),
                },
            )

        results = report.map_with_progress(
            fn,
            self.examples,
            num_threads=self.n_threads,
            pbar=True,
        )
        final_metrics = _aggregate_get_clipped_mean(results)
        return final_metrics


def main():
    parser = argparse.ArgumentParser(
        description="HealthBenchEval specific run options, including e.g., running the eval on physician completions rows only."
    )
    parser.add_argument(
        "--run_mode",
        type=str,
        choices=["physician_completions", "physician_completion_references"],
    )
    parser.add_argument("--examples", type=int, help="Number of examples to run")
    parser.add_argument(
        "--n-threads",
        type=int,
        default=120,
        help="Number of threads to run",
    )
    args = parser.parse_args()

    if args.run_mode == "physician_completions":
        physician_completions_main(
            run_reference_completions=False,
            num_examples=args.examples,
            n_threads=args.n_threads or 1,
        )
    elif args.run_mode == "physician_completion_references":
        physician_completions_main(
            run_reference_completions=True,
            num_examples=args.examples,
            n_threads=args.n_threads or 1,
        )

    else:
        raise ValueError(f"Invalid run mode: {args.run_mode}")


def physician_completions_main(
    run_reference_completions: bool = False,
    num_examples: int | None = None,
    n_threads: int = 120,
):
    now = datetime.now()
    date_str = now.strftime("%Y%m%d_%H%M")

    grading_sampler = ChatCompletionsSampler(
        model="gpt-4.1-2025-04-14",
        system_message=OPENAI_SYSTEM_MESSAGE_API,
        max_tokens=2048,
        base_url="https://api.openai.com/v1",
    )
    dummy_sampler = SamplerBase()

    merge_metrics = []
    for pc_mode in PHYSICIAN_COMPLETION_MODES.keys():
        if (
            run_reference_completions
            and not PHYSICIAN_COMPLETION_MODES[pc_mode]["has_reference"]
        ):
            continue

        # run
        eval = HealthBenchEval(
            grader_model=grading_sampler,
            physician_completions_mode=pc_mode,
            run_reference_completions=run_reference_completions,
            num_examples=num_examples,
            n_threads=n_threads,
        )
        result = eval(dummy_sampler)

        # report
        parsable_mode = PHYSICIAN_COMPLETION_MODES[pc_mode]["short_name"]
        if run_reference_completions:
            file_stem = f"healthbench_{parsable_mode}_referencecompletions_{date_str}"
        else:
            file_stem = f"healthbench_{parsable_mode}_humanbaseline_{date_str}"
        report_filename = Path(f"/tmp/{file_stem}.html")
        report_filename.write_text(report.make_report(result))
        print(f"Report saved to {report_filename}")

        # metrics
        assert result.metrics is not None
        metrics = result.metrics
        result_filename = Path(f"/tmp/{file_stem}.json")
        result_filename.write_text(json.dumps(metrics))
        print(f"Results saved to {result_filename}")

        full_result_dict = {
            "score": result.score,
            "metrics": result.metrics,
            "htmls": result.htmls,
            "convos": result.convos,
            "metadata": result.metadata,
        }
        full_result_filename = Path(f"/tmp/{file_stem}_allresults.json")
        full_result_filename.write_text(json.dumps(full_result_dict, indent=2))
        print(f"All results saved to {full_result_filename}")

        # metrics df
        merge_metrics.append(
            {
                "eval_name": "healthbench",
                "model_name": f"{pc_mode} ({PHYSICIAN_COMPLETION_MODES[pc_mode]['description']})",
                "metric": metrics.get("overall_score", None),
            }
        )

    print("\nAll results: ")
    print(merge_metrics)
    return merge_metrics


if __name__ == "__main__":
    main()


================================================
FILE: gpt_oss/evals/report.py
================================================
import os
from collections import defaultdict
from multiprocessing.pool import ThreadPool
from typing import Any, Callable

import jinja2
import numpy as np
from tqdm import tqdm

from .types import EvalResult, Message, SingleEvalResult


HTML_JINJA = """
<h3>Prompt conversation</h3>
{% for message in prompt_messages %}
{{ message_to_html(message) | safe }}
{% endfor %}
<h3>Sampled message</h3>
{{ message_to_html(next_message) | safe }}
<h3>Results</h3>
<p>Correct Answer: {{ correct_answer }}</p>
<p>Extracted Answer: {{ extracted_answer }}</p>
<p>Score: {{ score }}</p>
"""


def _compute_stat(values: list, stat: str):
    if stat == "mean":
        return np.mean(values)
    elif stat == "std":
        return np.std(values)
    elif stat == "min":
        return np.min(values)
    elif stat == "max":
        return np.max(values)
    elif stat == "n_samples":
        return len(values)
    elif stat == "bootstrap_std":
        return np.std(
            [np.mean(np.random.choice(values, len(values))) for _ in range(1000)]
        )
    else:
        raise ValueError(f"Unknown {stat =}")


def aggregate_results(
    single_eval_results: list[SingleEvalResult],
    default_stats: tuple[str, ...] = ("mean", "std"),
    name2stats: dict[str, tuple[str]] | None = None,
) -> EvalResult:
    """
    Aggregate results from multiple evaluations into a single EvalResult.
    """
    name2stats = name2stats or {}
    name2values = defaultdict(list)
    htmls = []
    convos = []
    metadata = []
    for single_eval_result in single_eval_results:
        for name, value in single_eval_result.metrics.items():
            name2values[name].append(value)
        if single_eval_result.score is not None:
            name2values["score"].append(single_eval_result.score)
        htmls.append(single_eval_result.html)
        convos.append(single_eval_result.convo)
        metadata.append(single_eval_result.example_level_metadata)
    final_metrics = {}
    for name, values in name2values.items():
        stats = name2stats.get(name, default_stats)
        for stat in stats:
            key = name if stat == "mean" else f"{name}:{stat}"
            final_metrics[key] = _compute_stat(values, stat)
    return EvalResult(
        score=final_metrics.pop("score", None),
        metrics=final_metrics,
        htmls=htmls,
        convos=convos,
        metadata={"example_level_metadata": metadata},
    )


def map_with_progress(
    f: Callable,
    xs: list[Any],
    num_threads: int = 128,
    pbar: bool = True,
):
    """
    Apply f to each element of xs, using a ThreadPool, and show progress.
    """
    pbar_fn = tqdm if pbar else lambda x, *args, **kwargs: x

    if os.getenv("debug"):
        return list(map(f, pbar_fn(xs, total=len(xs))))
    else:
        with ThreadPool(min(num_threads, len(xs))) as pool:
            return list(pbar_fn(pool.imap_unordered(f, xs), total=len(xs)))


jinja_env = jinja2.Environment(
    loader=jinja2.BaseLoader(),
    undefined=jinja2.StrictUndefined,
    autoescape=jinja2.select_autoescape(["html", "xml"]),
)
_message_template = """
<div class="message {{ role }}">
    <div class="role">
    {{ role }}
    {% if variant %}<span class="variant">({{ variant }})</span>{% endif %}
    </div>
    <div class="content">
    <pre>{{ content }}</pre>
    </div>
</div>
"""


def message_to_html(message: Message) -> str:
    """
    Generate HTML snippet (inside a <div>) for a message.
    """
    return jinja_env.from_string(_message_template).render(
        role=message["role"],
        content=message["content"],
        variant=message.get("variant", None),
    )


jinja_env.globals["message_to_html"] = message_to_html


_report_template = """<!DOCTYPE html>
<html>
    <head>
        <meta charset="utf-8">
        <style>
            .message {
                padding: 8px 16px;
                margin-bottom: 8px;
                border-radius: 4px;
            }
            .message.user {
                background-color: #B2DFDB;
                color: #00695C;
            }
            .message.assistant {
                background-color: #B39DDB;
                color: #4527A0;
            }
            .message.system {
                background-color: #EEEEEE;
                color: #212121;
            }
            .role {
                font-weight: bold;
                margin-bottom: 4px;
            }
            .variant {
                color: #795548;
            }
            table, th, td {
                border: 1px solid black;
            }
            pre {
                white-space: pre-wrap;
            }
        </style>
    </head>
    <body>
    {% if metrics %}
    <h1>Metrics</h1>
    <table>
    <tr>
        <th>Metric</th>
        <th>Value</th>
    </tr>
    <tr>
        <td><b>Score</b></td>
        <td>{{ score | float | round(3) }}</td>
    </tr>
    {% for name, value in metrics.items() %}
    <tr>
        <td>{{ name }}</td>
        <td>{{ value }}</td>
    </tr>
    {% endfor %}
    </table>
    {% endif %}
    <h1>Examples</h1>
    {% for html in htmls %}
    {{ html | safe }}
    <hr>
    {% endfor %}
    </body>
</html>
"""


def make_report(eval_result: EvalResult) -> str:
    """
    Create a standalone HTML report from an EvalResult.
    """
    return jinja_env.from_string(_report_template).render(
        score=eval_result.score,
        metrics=eval_result.metrics,
        htmls=eval_result.htmls,
    )


================================================
FILE: gpt_oss/evals/responses_sampler.py
================================================
import time
from typing import Any

import openai
from openai import OpenAI

from .types import MessageList, SamplerBase, SamplerResponse


class ResponsesSampler(SamplerBase):
    """
    Sample from OpenAI's responses API
    """

    def __init__(
        self,
        model: str,
        developer_message: str | None = None,
        temperature: float = 1.0,
        max_tokens: int = 131_072,
        reasoning_model: bool = False,
        reasoning_effort: str | None = None,
        base_url: str = "http://localhost:8000/v1",
    ):
        self.client = OpenAI(base_url=base_url, timeout=24*60*60)
        self.model = model
        self.developer_message = developer_message
        self.temperature = temperature
        self.max_tokens = max_tokens
        self.image_format = "url"
        self.reasoning_model = reasoning_model
        self.reasoning_effort = reasoning_effort

    def _pack_message(self, role: str, content: Any) -> dict[str, Any]:
        return {"role": role, "content": content}

    def __call__(self, message_list: MessageList) -> SamplerResponse:
        if self.developer_message:
            message_list = [
                self._pack_message("developer", self.developer_message)
            ] + message_list
        trial = 0
        while True:
            try:
                request_kwargs = {
                    "model": self.model,
                    "input": message_list,
                    "temperature": self.temperature,
                    "max_output_tokens": self.max_tokens,
                }
                if self.reasoning_model:
                    request_kwargs["reasoning"] = (
                        {"effort": self.reasoning_effort} if self.reasoning_effort else None
                    )
                response = self.client.responses.create(**request_kwargs)

                for output in response.output:
                    if hasattr(output, "text"):
                        message_list.append(self._pack_message(getattr(output, "role", "assistant"), output.text))
                    elif hasattr(output, "content"):
                        for c in output.content:
                            # c.text handled below
                            pass

                return SamplerResponse(
                    response_text=response.output_text,
                    response_metadata={"usage": response.usage},
                    actual_queried_message_list=message_list,
                )
            except openai.BadRequestError as e:
                print("Bad Request Error", e)
                return SamplerResponse(
                    response_text="",
                    response_metadata={"usage": None},
                    actual_queried_message_list=message_list,
                )
            except Exception as e:
                exception_backoff = 2**trial  # expontial back off
                print(
                    f"Rate limit exception so wait and retry {trial} after {exception_backoff} sec",
                    e,
                )
                time.sleep(exception_backoff)
                trial += 1
            # unknown error shall throw exception


================================================
FILE: gpt_oss/evals/types.py
================================================
from dataclasses import dataclass, field
from typing import Any, Literal, overload

Message = dict[str, Any]  # keys role, content
MessageList = list[Message]


@dataclass
class SamplerResponse:
    """
    Response from a sampler.
    """
    response_text: str
    actual_queried_message_list: MessageList
    response_metadata: dict[str, Any]

class SamplerBase:
    """
    Base class for defining a sampling model, which can be evaluated,
    or used as part of the grading process.
    """

    def __call__(
        self, 
        message_list: MessageList,
    ) -> SamplerResponse:
        raise NotImplementedError


@dataclass
class EvalResult:
    """
    Result of running an evaluation (usually consisting of many samples)
    """

    score: float | None  # top-line metric
    metrics: dict[str, float] | None  # other metrics
    htmls: list[str]  # strings of valid HTML
    convos: list[MessageList]  # sampled conversations
    metadata: dict[str, Any] | None  # Extra data such as rubric scores or sollen


@dataclass
class SingleEvalResult:
    """
    Result of evaluating a single sample
    """

    score: float | None
    metrics: dict[str, float] = field(default_factory=dict)
    html: str | None = None
    convo: MessageList | None = None  # sampled conversation
    example_level_metadata: dict[str, Any] | None = (
        None  # Extra data such as rubric scores or sollen
    )


class Eval:
    """
    Base class for defining an evaluation.
    """

    def __call__(self, sampler: SamplerBase) -> EvalResult:
        raise NotImplementedError


================================================
FILE: gpt_oss/generate.py
================================================
# Model parallel inference
# Note: This script is for demonstration purposes only. It is not designed for production use.
#       See gpt_oss.chat for a more complete example with the Harmony parser.
# torchrun --nproc-per-node=4 -m gpt_oss.generate -p "why did the chicken cross the road?" model/

import argparse

from gpt_oss.tokenizer import get_tokenizer


def main(args):
    match args.backend:
        case "torch":
            from gpt_oss.torch.utils import init_distributed
            from gpt_oss.torch.model import TokenGenerator as TorchGenerator
            device = init_distributed()
            generator = TorchGenerator(args.checkpoint, device=device)
        case "triton":
            from gpt_oss.torch.utils import init_distributed
            from gpt_oss.triton.model import TokenGenerator as TritonGenerator
            device = init_distributed()
            generator = TritonGenerator(args.checkpoint, context=args.context_length, device=device)
        case "vllm":
            from gpt_oss.vllm.token_generator import TokenGenerator as VLLMGenerator
            generator = VLLMGenerator(args.checkpoint, tensor_parallel_size=args.tensor_parallel_size)
        case _:
            raise ValueError(f"Invalid backend: {args.backend}")

    tokenizer = get_tokenizer()
    tokens = tokenizer.encode(args.prompt)
    max_tokens = None if args.limit == 0 else args.limit
    for token, logprob in generator.generate(tokens, stop_tokens=[tokenizer.eot_token], temperature=args.temperature, max_tokens=max_tokens, return_logprobs=True):
        tokens.append(token)
        token_text = tokenizer.decode([token])
        print(
            f"Generated token: {repr(token_text)}, logprob: {logprob}"
        )


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Text generation example")
    parser.add_argument(
        "checkpoint",
        metavar="FILE",
        type=str,
        help="Path to the SafeTensors checkpoint",
    )
    parser.add_argument(
        "-p",
        "--prompt",
        metavar="PROMPT",
        type=str,
        default="How are you?",
        help="LLM prompt",
    )
    parser.add_argument(
        "-t",
        "--temperature",
        metavar="TEMP",
        type=float,
        default=0.0,
        help="Sampling temperature",
    )
    parser.add_argument(
        "-l",
        "--limit",
        metavar="LIMIT",
        type=int,
        default=0,
        help="Limit on the number of tokens (0 to disable)",
    )
    parser.add_argument(
        "-b",
        "--backend",
        metavar="BACKEND",
        type=str,
        default="torch",
        choices=["triton", "torch", "vllm"],
        help="Inference backend",
    )
    parser.add_argument(
        "--tensor-parallel-size",
        type=int,
        default=2,
        help="Tensor parallel size for vLLM backend",
    )
    parser.add_argument(
        "--context-length",
        type=int,
        default=4096,
        help="Context length for Triton backend",
    )
    args = parser.parse_args()

    main(args)


================================================
FILE: gpt_oss/metal/CMakeLists.txt
================================================
cmake_minimum_required(VERSION 3.24)
project(GPTOSS
    VERSION 1.0
    DESCRIPTION "Local GPT-OSS inference"
    LANGUAGES C CXX OBJC)

set(CMAKE_C_STANDARD 11)
set(CMAKE_CXX_STANDARD 20)
set(CMAKE_OBJC_STANDARD 11)
set(CMAKE_OBJC_STANDARD_REQUIRED ON)

find_library(FOUNDATION_FRAMEWORK Foundation REQUIRED)
find_library(METAL_FRAMEWORK      Metal      REQUIRED)
find_library(IOKIT_FRAMEWORK      IOKit      REQUIRED)

set(METAL_SOURCES
    ${CMAKE_CURRENT_SOURCE_DIR}/source/accumulate.metal
    ${CMAKE_CURRENT_SOURCE_DIR}/source/convert.metal
    ${CMAKE_CURRENT_SOURCE_DIR}/source/embeddings.metal
    ${CMAKE_CURRENT_SOURCE_DIR}/source/expert_routing_metadata.metal
    ${CMAKE_CURRENT_SOURCE_DIR}/source/gather_and_accumulate.metal
    ${CMAKE_CURRENT_SOURCE_DIR}/source/matmul.metal
    ${CMAKE_CURRENT_SOURCE_DIR}/source/moematmul.metal
    ${CMAKE_CURRENT_SOURCE_DIR}/source/random.metal
    ${CMAKE_CURRENT_SOURCE_DIR}/source/rmsnorm.metal
    ${CMAKE_CURRENT_SOURCE_DIR}/source/rope.metal
    ${CMAKE_CURRENT_SOURCE_DIR}/source/sample.metal
    ${CMAKE_CURRENT_SOURCE_DIR}/source/scatter.metal
    ${CMAKE_CURRENT_SOURCE_DIR}/source/sdpa.metal
    ${CMAKE_CURRENT_SOURCE_DIR}/source/topk.metal
)
set(METAL_LIB default.metallib)

include_directories(BEFORE include source/include)

add_custom_command(
    OUTPUT  ${CMAKE_CURRENT_BINARY_DIR}/${METAL_LIB}
    COMMAND ${CMAKE_COMMAND} -E make_directory "${CMAKE_CURRENT_BINARY_DIR}/source/"
    COMMAND xcrun -sdk macosx metal -g "-I${CMAKE_CURRENT_SOURCE_DIR}/source/include" -c "${CMAKE_CURRENT_SOURCE_DIR}/source/accumulate.metal" -o "${CMAKE_CURRENT_BINARY_DIR}/source/accumulate.air"
    COMMAND xcrun -sdk macosx metal -g "-I${CMAKE_CURRENT_SOURCE_DIR}/source/include" -c "${CMAKE_CURRENT_SOURCE_DIR}/source/convert.metal" -o "${CMAKE_CURRENT_BINARY_DIR}/source/convert.air"
    COMMAND xcrun -sdk macosx metal -g "-I${CMAKE_CURRENT_SOURCE_DIR}/source/include" -c "${CMAKE_CURRENT_SOURCE_DIR}/source/embeddings.metal" -o "${CMAKE_CURRENT_BINARY_DIR}/source/embeddings.air"
    COMMAND xcrun -sdk macosx metal -g "-I${CMAKE_CURRENT_SOURCE_DIR}/source/include" -c "${CMAKE_CURRENT_SOURCE_DIR}/source/expert_routing_metadata.metal" -o "${CMAKE_CURRENT_BINARY_DIR}/source/expert_routing_metadata.air"
    COMMAND xcrun -sdk macosx metal -g "-I${CMAKE_CURRENT_SOURCE_DIR}/source/include" -c "${CMAKE_CURRENT_SOURCE_DIR}/source/matmul.metal" -o "${CMAKE_CURRENT_BINARY_DIR}/source/matmul.air"
    COMMAND xcrun -sdk macosx metal -g "-I${CMAKE_CURRENT_SOURCE_DIR}/source/include" -c "${CMAKE_CURRENT_SOURCE_DIR}/source/moematmul.metal" -o "${CMAKE_CURRENT_BINARY_DIR}/source/moematmul.air"
    COMMAND xcrun -sdk macosx metal -g "-I${CMAKE_CURRENT_SOURCE_DIR}/source/include" -c "${CMAKE_CURRENT_SOURCE_DIR}/source/gather_and_accumulate.metal" -o "${CMAKE_CURRENT_BINARY_DIR}/source/gather_and_accumulate.air"
    COMMAND xcrun -sdk macosx metal -g "-I${CMAKE_CURRENT_SOURCE_DIR}/source/include" -c "${CMAKE_CURRENT_SOURCE_DIR}/source/random.metal" -o "${CMAKE_CURRENT_BINARY_DIR}/source/random.air"
    COMMAND xcrun -sdk macosx metal -g "-I${CMAKE_CURRENT_SOURCE_DIR}/source/include" -c "${CMAKE_CURRENT_SOURCE_DIR}/source/rmsnorm.metal" -o "${CMAKE_CURRENT_BINARY_DIR}/source/rmsnorm.air"
    COMMAND xcrun -sdk macosx metal -g "-I${CMAKE_CURRENT_SOURCE_DIR}/source/include" -c "${CMAKE_CURRENT_SOURCE_DIR}/source/rope.metal" -o "${CMAKE_CURRENT_BINARY_DIR}/source/rope.air"
    COMMAND xcrun -sdk macosx metal -g "-I${CMAKE_CURRENT_SOURCE_DIR}/source/include" -c "${CMAKE_CURRENT_SOURCE_DIR}/source/sample.metal" -o "${CMAKE_CURRENT_BINARY_DIR}/source/sample.air"
    COMMAND xcrun -sdk macosx metal -g "-I${CMAKE_CURRENT_SOURCE_DIR}/source/include" -c "${CMAKE_CURRENT_SOURCE_DIR}/source/scatter.metal" -o "${CMAKE_CURRENT_BINARY_DIR}/source/scatter.air"
    COMMAND xcrun -sdk macosx metal -g "-I${CMAKE_CURRENT_SOURCE_DIR}/source/include" -c "${CMAKE_CURRENT_SOURCE_DIR}/source/sdpa.metal" -o "${CMAKE_CURRENT_BINARY_DIR}/source/sdpa.air"
    COMMAND xcrun -sdk macosx metal -g "-I${CMAKE_CURRENT_SOURCE_DIR}/source/include" -c "${CMAKE_CURRENT_SOURCE_DIR}/source/topk.metal" -o "${CMAKE_CURRENT_BINARY_DIR}/source/topk.air"
    COMMAND xcrun -sdk macosx metallib "${CMAKE_CURRENT_BINARY_DIR}/source/accumulate.air" "${CMAKE_CURRENT_BINARY_DIR}/source/convert.air" "${CMAKE_CURRENT_BINARY_DIR}/source/embeddings.air" "${CMAKE_CURRENT_BINARY_DIR}/source/expert_routing_metadata.air" "${CMAKE_CURRENT_BINARY_DIR}/source/gather_and_accumulate.air" "${CMAKE_CURRENT_BINARY_DIR}/source/matmul.air" "${CMAKE_CURRENT_BINARY_DIR}/source/moematmul.air" "${CMAKE_CURRENT_BINARY_DIR}/source/random.air" "${CMAKE_CURRENT_BINARY_DIR}/source/rmsnorm.air" "${CMAKE_CURRENT_BINARY_DIR}/source/rope.air" "${CMAKE_CURRENT_BINARY_DIR}/source/sample.air" "${CMAKE_CURRENT_BINARY_DIR}/source/scatter.air" "${CMAKE_CURRENT_BINARY_DIR}/source/sdpa.air" "${CMAKE_CURRENT_BINARY_DIR}/source/topk.air" -o "${METAL_LIB}"
    DEPENDS ${METAL_SOURCES}
    COMMENT "Compiling Metal compute library"
)

add_custom_target(build_metallib ALL
    DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${METAL_LIB})

add_library(log OBJECT source/log.c)

add_library(metal-kernels STATIC source/metal.m source/metal-kernels.c)
target_link_libraries(metal-kernels PRIVATE log)

add_dependencies(metal-kernels build_metallib)
add_custom_command(TARGET metal-kernels POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy
            ${CMAKE_CURRENT_BINARY_DIR}/${METAL_LIB}
            $<TARGET_FILE_DIR:metal-kernels>)

target_link_libraries(metal-kernels PRIVATE ${FOUNDATION_FRAMEWORK} ${METAL_FRAMEWORK} ${IOKIT_FRAMEWORK})

add_library(gptoss STATIC source/model.c source/tokenizer.c source/context.c)
target_link_libraries(gptoss PRIVATE log metal-kernels)

add_executable(generate source/generate.c)
target_link_libraries(generate gptoss)

# --- [ Tests
include(FetchContent)
FetchContent_Declare(
    googletest
    URL https://github.com/google/googletest/archive/refs/tags/v1.17.0.zip
    DOWNLOAD_EXTRACT_TIMESTAMP OFF
)
# For Windows: Prevent overriding the parent project's compiler/linker settings
set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
set(INSTALL_GTEST OFF CACHE BOOL "" FORCE)
FetchContent_MakeAvailable(googletest)

enable_testing()

add_executable(u32-random-test test/u32-random.cc)
target_link_libraries(u32-random-test PRIVATE GTest::gtest_main metal-kernels)
target_include_directories(u32-random-test PRIVATE source/include)
add_test(NAME u32-random-test COMMAND u32-random-test)

add_executable(f32-random-test test/f32-random.cc)
target_link_libraries(f32-random-test PRIVATE GTest::gtest_main metal-kernels)
target_include_directories(f32-random-test PRIVATE source/include)
add_test(NAME f32-random-test COMMAND f32-random-test)

add_executable(mf4-f32-convert-test test/mf4-f32-convert.cc)
target_link_libraries(mf4-f32-convert-test PRIVATE GTest::gtest_main metal-kernels)
target_include_directories(mf4-f32-convert-test PRIVATE source/include)
add_test(NAME mf4-f32-convert-test COMMAND mf4-f32-convert-test)

add_executable(bf16-f32-embeddings-test test/bf16-f32-embeddings.cc)
target_link_libraries(bf16-f32-embeddings-test PRIVATE GTest::gtest_main metal-kernels)
target_include_directories(bf16-f32-embeddings-test PRIVATE source/include)
add_test(NAME bf16-f32-embeddings-test COMMAND bf16-f32-embeddings-test)

add_executable(f32-bf16w-rmsnorm-test test/f32-bf16w-rmsnorm.cc)
target_link_libraries(f32-bf16w-rmsnorm-test PRIVATE GTest::gtest_main metal-kernels)
target_include_directories(f32-bf16w-rmsnorm-test PRIVATE source/include)
add_test(NAME f32-bf16w-rmsnorm-test COMMAND f32-bf16w-rmsnorm-test)

add_executable(f32-bf16w-matmul-test test/f32-bf16w-matmul.cc)
target_link_libraries(f32-bf16w-matmul-test PRIVATE GTest::gtest_main metal-kernels)
target_include_directories(f32-bf16w-matmul-test PRIVATE source/include)
add_test(NAME f32-bf16w-matmul-test COMMAND f32-bf16w-matmul-test)

add_executable(f32-rope-test test/f32-rope.cc)
target_link_libraries(f32-rope-test PRIVATE GTest::gtest_main metal-kernels)
target_include_directories(f32-rope-test PRIVATE source/include)
add_test(NAME f32-rope-test COMMAND f32-rope-test)

# --- [ Benchmarks
include(FetchContent)
set(BENCHMARK_ENABLE_TESTING OFF CACHE BOOL "Disable self-tests in Google Benchmark" FORCE)
set(BENCHMARK_ENABLE_INSTALL OFF CACHE BOOL "Disable installation of Google Benchmark" FORCE)
FetchContent_Declare(
    benchmark
    URL https://github.com/google/benchmark/archive/refs/tags/v1.9.4.zip
    DOWNLOAD_EXTRACT_TIMESTAMP OFF
)
FetchContent_MakeAvailable(benchmark)

add_executable(f32-random-bench benchmark/f32-random.cc)
target_link_libraries(f32-random-bench PRIVATE benchmark::benchmark metal-kernels)
target_include_directories(f32-random-bench PRIVATE source/include)

add_executable(u32-random-bench benchmark/u32-random.cc)
target_link_libraries(u32-random-bench PRIVATE benchmark::benchmark metal-kernels)
target_include_directories(u32-random-bench PRIVATE source/include)

add_executable(mf4-f32-convert-bench benchmark/mf4-f32-convert.cc)
target_link_libraries(mf4-f32-convert-bench PRIVATE benchmark::benchmark metal-kernels)
target_include_directories(mf4-f32-convert-bench PRIVATE source/include)

add_executable(f32-bf16w-rmsnorm-bench benchmark/f32-bf16w-rmsnorm.cc)
target_link_libraries(f32-bf16w-rmsnorm-bench PRIVATE benchmark::benchmark metal-kernels)
target_include_directories(f32-bf16w-rmsnorm-bench PRIVATE source/include)

add_executable(end-to-end-bench benchmark/end-to-end.cc)
target_link_libraries(end-to-end-bench PRIVATE benchmark::benchmark gptoss)
target_include_directories(end-to-end-bench PRIVATE source/include)

add_executable(end-to-end-threadgroup-bench benchmark/end-to-end-threadgroup.cc)
target_link_libraries(end-to-end-threadgroup-bench PRIVATE benchmark::benchmark gptoss)
target_include_directories(end-to-end-threadgroup-bench PRIVATE source/include)

# --- [ Python extension ] -----------------------------------------------
find_package(pybind11 CONFIG REQUIRED)          # provides pybind11_add_module

pybind11_add_module(_metal
    python/module.c
    python/context.c
    python/model.c
    python/tokenizer.c
)
set_target_properties(_metal PROPERTIES PREFIX "")

target_link_libraries(_metal PRIVATE gptoss)
add_dependencies(_metal build_metallib)
target_link_options(_metal PRIVATE
    LINKER:-sectcreate,__METAL,__shaders,${CMAKE_CURRENT_BINARY_DIR}/${METAL_LIB}
)
add_custom_command(TARGET _metal POST_BUILD
    COMMAND ${CMAKE_COMMAND} -E copy
            ${CMAKE_CURRENT_BINARY_DIR}/${METAL_LIB}
            $<TARGET_FILE_DIR:_metal>)

# 1️⃣  install the extension module into the Python package
install(TARGETS _metal LIBRARY DESTINATION gpt_oss/metal)

# 2️⃣  make sure the Metal shader archive travels with it
install(FILES ${CMAKE_CURRENT_BINARY_DIR}/${METAL_LIB}
        DESTINATION gpt_oss/metal)
# ------------------------------------------------------------------------

================================================
FILE: gpt_oss/metal/__init__.py
================================================
from importlib import import_module as _im

# Load the compiled extension (gpt_oss.metal._metal)
_ext = _im(f"{__name__}._metal")
globals().update({k: v for k, v in _ext.__dict__.items() if not k.startswith("_")})
del _im, _ext


================================================
FILE: gpt_oss/metal/benchmark/end-to-end-threadgroup.cc
================================================
#include <gpt-oss.h>
#include <internal/model.h>

#include <array>
#include <cstdint>
#include <cstddef>
#include <format>
#include <limits>
#include <memory>
#include <string>
#include <type_traits>

#include <benchmark/benchmark.h>


constexpr std::uint32_t kNumGeneratedTokens = 100;


static void attn_qkv_tgsize(benchmark::State& state, const char* env_var_name) {
    const char* model_path = getenv(env_var_name);
    if (model_path == NULL) {
        state.SkipWithError(std::format("environment variable {} is not set", env_var_name));
        return;
    }

    gptoss_model_t model_ptr = nullptr;
    gptoss_status status = gptoss_model_create_from_file(model_path, &model_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError(std::format("failed to load model from file {}", model_path));
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_model_t>, decltype(&gptoss_model_release)> model(model_ptr, gptoss_model_release);
    model->attn_qkv_threadgroup_size = static_cast<std::size_t>(state.range(0));

    gptoss_context_t context_ptr = nullptr;
    status = gptoss_context_create(model.get(), /*context_length=*/0, /*max_batch_tokens=*/0, &context_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to create Context object");
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_context_t>, decltype(&gptoss_context_release)> context(context_ptr, gptoss_context_release);

    const char* prompt = "why did the chicken cross the road?";
    std::size_t num_prompt_tokens = 0;
    status = gptoss_context_append_chars(context.get(), prompt, strlen(prompt), &num_prompt_tokens);
    if (status != gptoss_status_success) {
        state.SkipWithError(std::format("failed to tokenize prompt \"{}\"", prompt));
        return;
    }

    // Prefill
    status = gptoss_context_process(context.get());
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to prefill Context object");
        return;
    }
    const std::size_t num_kvcache_tokens = context->num_kv_tokens;

    std::uint64_t rng_seed = 0;
    for (auto _ : state) {
        const std::uint64_t current_rng_seed = rng_seed++;
        context->num_kv_tokens = num_prompt_tokens;
        context->num_tokens = num_prompt_tokens;

        std::array<std::uint32_t, kNumGeneratedTokens> tokens;
        std::size_t num_generated_tokens = 0;
        do {
            std::size_t num_current_generated_tokens = 0;
            status = gptoss_context_sample(context.get(), /*temperature=*/1.0f, /*rng_state=*/current_rng_seed,
                /*max_tokens=*/kNumGeneratedTokens - num_generated_tokens, tokens.data(), &num_current_generated_tokens);
            if (status != gptoss_status_success) {
                state.SkipWithError("failed to sample from the Context object");
                return;
            }
            num_generated_tokens += num_current_generated_tokens;
        } while (num_generated_tokens < kNumGeneratedTokens);
    }

    state.counters["generations"] =
        benchmark::Counter(state.iterations(), benchmark::Counter::kIsRate);
    state.counters["tokens"] =
        benchmark::Counter(state.iterations() * kNumGeneratedTokens, benchmark::Counter::kIsRate);
}

static void AttnQKVThreadgroupSizeArguments(benchmark::internal::Benchmark* b) {
    b->ArgNames({"tgsize"});
    for (auto attn_qkv_threadgroup_size = 32; attn_qkv_threadgroup_size <= 1024; attn_qkv_threadgroup_size += 32) {
        const auto num_simdgroups = attn_qkv_threadgroup_size / 32;
        if (5120 % num_simdgroups != 0) {
            // Skip incompatible threadgroup sizes
            continue;
        }
        b->Args({attn_qkv_threadgroup_size});
    }
}

BENCHMARK_CAPTURE(attn_qkv_tgsize, gpt_oss_20b, "GPT_OSS_20B_PATH")
    ->UseRealTime()->Unit(benchmark::kMillisecond)->Apply(AttnQKVThreadgroupSizeArguments);
BENCHMARK_CAPTURE(attn_qkv_tgsize, gpt_oss_120b, "GPT_OSS_120B_PATH")
    ->UseRealTime()->Unit(benchmark::kMillisecond)->Apply(AttnQKVThreadgroupSizeArguments);

static void attn_out_tgsize(benchmark::State& state, const char* env_var_name) {
    const char* model_path = getenv(env_var_name);
    if (model_path == NULL) {
        state.SkipWithError(std::format("environment variable {} is not set", env_var_name));
        return;
    }

    gptoss_model_t model_ptr = nullptr;
    gptoss_status status = gptoss_model_create_from_file(model_path, &model_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError(std::format("failed to load model from file {}", model_path));
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_model_t>, decltype(&gptoss_model_release)> model(model_ptr, gptoss_model_release);
    model->attn_out_threadgroup_size = static_cast<std::size_t>(state.range(0));

    gptoss_context_t context_ptr = nullptr;
    status = gptoss_context_create(model.get(), /*context_length=*/0, /*max_batch_tokens=*/0, &context_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to create Context object");
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_context_t>, decltype(&gptoss_context_release)> context(context_ptr, gptoss_context_release);

    const char* prompt = "why did the chicken cross the road?";
    std::size_t num_prompt_tokens = 0;
    status = gptoss_context_append_chars(context.get(), prompt, strlen(prompt), &num_prompt_tokens);
    if (status != gptoss_status_success) {
        state.SkipWithError(std::format("failed to tokenize prompt \"{}\"", prompt));
        return;
    }

    // Prefill
    status = gptoss_context_process(context.get());
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to prefill Context object");
        return;
    }
    const std::size_t num_kvcache_tokens = context->num_kv_tokens;

    std::uint64_t rng_seed = 0;
    for (auto _ : state) {
        const std::uint64_t current_rng_seed = rng_seed++;
        context->num_kv_tokens = num_prompt_tokens;
        context->num_tokens = num_prompt_tokens;

        std::array<std::uint32_t, kNumGeneratedTokens> tokens;
        std::size_t num_generated_tokens = 0;
        do {
            std::size_t num_current_generated_tokens = 0;
            status = gptoss_context_sample(context.get(), /*temperature=*/1.0f, /*rng_state=*/current_rng_seed,
                /*max_tokens=*/kNumGeneratedTokens - num_generated_tokens, tokens.data(), &num_current_generated_tokens);
            if (status != gptoss_status_success) {
                state.SkipWithError("failed to sample from the Context object");
                return;
            }
            num_generated_tokens += num_current_generated_tokens;
        } while (num_generated_tokens < kNumGeneratedTokens);
    }

    state.counters["generations"] =
        benchmark::Counter(state.iterations(), benchmark::Counter::kIsRate);
    state.counters["tokens"] =
        benchmark::Counter(state.iterations() * kNumGeneratedTokens, benchmark::Counter::kIsRate);
}

static void AttnOutThreadgroupSizeArguments(benchmark::internal::Benchmark* b) {
    b->ArgNames({"tgsize"});
    for (auto attn_out_threadgroup_size = 32; attn_out_threadgroup_size <= 1024; attn_out_threadgroup_size += 32) {
        const auto num_simdgroups = attn_out_threadgroup_size / 32;
        if (2880 % num_simdgroups != 0) {
            // Skip incompatible threadgroup sizes
            continue;
        }
        b->Args({attn_out_threadgroup_size});
    }
}

BENCHMARK_CAPTURE(attn_out_tgsize, gpt_oss_20b, "GPT_OSS_20B_PATH")
    ->UseRealTime()->Unit(benchmark::kMillisecond)->Apply(AttnOutThreadgroupSizeArguments);
BENCHMARK_CAPTURE(attn_out_tgsize, gpt_oss_120b, "GPT_OSS_120B_PATH")
    ->UseRealTime()->Unit(benchmark::kMillisecond)->Apply(AttnOutThreadgroupSizeArguments);

static void mlp_gate_tgsize(benchmark::State& state, const char* env_var_name) {
    const char* model_path = getenv(env_var_name);
    if (model_path == NULL) {
        state.SkipWithError(std::format("environment variable {} is not set", env_var_name));
        return;
    }

    gptoss_model_t model_ptr = nullptr;
    gptoss_status status = gptoss_model_create_from_file(model_path, &model_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError(std::format("failed to load model from file {}", model_path));
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_model_t>, decltype(&gptoss_model_release)> model(model_ptr, gptoss_model_release);
    model->mlp_gate_threadgroup_size = static_cast<std::size_t>(state.range(0));

    gptoss_context_t context_ptr = nullptr;
    status = gptoss_context_create(model.get(), /*context_length=*/0, /*max_batch_tokens=*/0, &context_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to create Context object");
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_context_t>, decltype(&gptoss_context_release)> context(context_ptr, gptoss_context_release);

    const char* prompt = "why did the chicken cross the road?";
    std::size_t num_prompt_tokens = 0;
    status = gptoss_context_append_chars(context.get(), prompt, strlen(prompt), &num_prompt_tokens);
    if (status != gptoss_status_success) {
        state.SkipWithError(std::format("failed to tokenize prompt \"{}\"", prompt));
        return;
    }

    // Prefill
    status = gptoss_context_process(context.get());
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to prefill Context object");
        return;
    }
    const std::size_t num_kvcache_tokens = context->num_kv_tokens;

    std::uint64_t rng_seed = 0;
    for (auto _ : state) {
        const std::uint64_t current_rng_seed = rng_seed++;
        context->num_kv_tokens = num_prompt_tokens;
        context->num_tokens = num_prompt_tokens;

        std::array<std::uint32_t, kNumGeneratedTokens> tokens;
        std::size_t num_generated_tokens = 0;
        do {
            std::size_t num_current_generated_tokens = 0;
            status = gptoss_context_sample(context.get(), /*temperature=*/1.0f, /*rng_state=*/current_rng_seed,
                /*max_tokens=*/kNumGeneratedTokens - num_generated_tokens, tokens.data(), &num_current_generated_tokens);
            if (status != gptoss_status_success) {
                state.SkipWithError("failed to sample from the Context object");
                return;
            }
            num_generated_tokens += num_current_generated_tokens;
        } while (num_generated_tokens < kNumGeneratedTokens);
    }

    state.counters["generations"] =
        benchmark::Counter(state.iterations(), benchmark::Counter::kIsRate);
    state.counters["tokens"] =
        benchmark::Counter(state.iterations() * kNumGeneratedTokens, benchmark::Counter::kIsRate);
}

static void MlpGateThreadgroupSizeArguments(benchmark::internal::Benchmark* b) {
    b->ArgNames({"tgsize"});
    for (auto mlp_gate_threadgroup_size = 32; mlp_gate_threadgroup_size <= 1024; mlp_gate_threadgroup_size += 32) {
        const auto num_simdgroups = mlp_gate_threadgroup_size / 32;
        if (128 % num_simdgroups != 0) {
            // Skip incompatible threadgroup sizes
            continue;
        }
        b->Args({mlp_gate_threadgroup_size});
    }
}

BENCHMARK_CAPTURE(mlp_gate_tgsize, gpt_oss_20b, "GPT_OSS_20B_PATH")
    ->UseRealTime()->Unit(benchmark::kMillisecond)->Apply(MlpGateThreadgroupSizeArguments);
BENCHMARK_CAPTURE(mlp_gate_tgsize, gpt_oss_120b, "GPT_OSS_120B_PATH")
    ->UseRealTime()->Unit(benchmark::kMillisecond)->Apply(MlpGateThreadgroupSizeArguments);

static void mlp_swiglu_tgsize(benchmark::State& state, const char* env_var_name) {
    const char* model_path = getenv(env_var_name);
    if (model_path == NULL) {
        state.SkipWithError(std::format("environment variable {} is not set", env_var_name));
        return;
    }

    gptoss_model_t model_ptr = nullptr;
    gptoss_status status = gptoss_model_create_from_file(model_path, &model_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError(std::format("failed to load model from file {}", model_path));
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_model_t>, decltype(&gptoss_model_release)> model(model_ptr, gptoss_model_release);
    model->mlp_swiglu_threadgroup_size = static_cast<std::size_t>(state.range(0));

    gptoss_context_t context_ptr = nullptr;
    status = gptoss_context_create(model.get(), /*context_length=*/0, /*max_batch_tokens=*/0, &context_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to create Context object");
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_context_t>, decltype(&gptoss_context_release)> context(context_ptr, gptoss_context_release);

    const char* prompt = "why did the chicken cross the road?";
    std::size_t num_prompt_tokens = 0;
    status = gptoss_context_append_chars(context.get(), prompt, strlen(prompt), &num_prompt_tokens);
    if (status != gptoss_status_success) {
        state.SkipWithError(std::format("failed to tokenize prompt \"{}\"", prompt));
        return;
    }

    // Prefill
    status = gptoss_context_process(context.get());
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to prefill Context object");
        return;
    }
    const std::size_t num_kvcache_tokens = context->num_kv_tokens;

    std::uint64_t rng_seed = 0;
    for (auto _ : state) {
        const std::uint64_t current_rng_seed = rng_seed++;
        context->num_kv_tokens = num_prompt_tokens;
        context->num_tokens = num_prompt_tokens;

        std::array<std::uint32_t, kNumGeneratedTokens> tokens;
        std::size_t num_generated_tokens = 0;
        do {
            std::size_t num_current_generated_tokens = 0;
            status = gptoss_context_sample(context.get(), /*temperature=*/1.0f, /*rng_state=*/current_rng_seed,
                /*max_tokens=*/kNumGeneratedTokens - num_generated_tokens, tokens.data(), &num_current_generated_tokens);
            if (status != gptoss_status_success) {
                state.SkipWithError("failed to sample from the Context object");
                return;
            }
            num_generated_tokens += num_current_generated_tokens;
        } while (num_generated_tokens < kNumGeneratedTokens);
    }

    state.counters["generations"] =
        benchmark::Counter(state.iterations(), benchmark::Counter::kIsRate);
    state.counters["tokens"] =
        benchmark::Counter(state.iterations() * kNumGeneratedTokens, benchmark::Counter::kIsRate);
}

static void MlpSwigluThreadgroupSizeArguments(benchmark::internal::Benchmark* b) {
    b->ArgNames({"tgsize"});
    for (auto threadgroup_size = 64; threadgroup_size <= 1024; threadgroup_size += 64) {
        const auto num_simdgroups = threadgroup_size / 32;
        if (5760 % num_simdgroups != 0) {
            // Skip incompatible threadgroup sizes
            continue;
        }
        b->Args({threadgroup_size});
    }
}

BENCHMARK_CAPTURE(mlp_swiglu_tgsize, gpt_oss_20b, "GPT_OSS_20B_PATH")
    ->UseRealTime()->Unit(benchmark::kMillisecond)->Apply(MlpSwigluThreadgroupSizeArguments);
BENCHMARK_CAPTURE(mlp_swiglu_tgsize, gpt_oss_120b, "GPT_OSS_120B_PATH")
    ->UseRealTime()->Unit(benchmark::kMillisecond)->Apply(MlpSwigluThreadgroupSizeArguments);

static void mlp_out_tgsize(benchmark::State& state, const char* env_var_name) {
    const char* model_path = getenv(env_var_name);
    if (model_path == NULL) {
        state.SkipWithError(std::format("environment variable {} is not set", env_var_name));
        return;
    }

    gptoss_model_t model_ptr = nullptr;
    gptoss_status status = gptoss_model_create_from_file(model_path, &model_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError(std::format("failed to load model from file {}", model_path));
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_model_t>, decltype(&gptoss_model_release)> model(model_ptr, gptoss_model_release);
    model->mlp_out_threadgroup_size = static_cast<std::size_t>(state.range(0));

    gptoss_context_t context_ptr = nullptr;
    status = gptoss_context_create(model.get(), /*context_length=*/0, /*max_batch_tokens=*/0, &context_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to create Context object");
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_context_t>, decltype(&gptoss_context_release)> context(context_ptr, gptoss_context_release);

    const char* prompt = "why did the chicken cross the road?";
    std::size_t num_prompt_tokens = 0;
    status = gptoss_context_append_chars(context.get(), prompt, strlen(prompt), &num_prompt_tokens);
    if (status != gptoss_status_success) {
        state.SkipWithError(std::format("failed to tokenize prompt \"{}\"", prompt));
        return;
    }

    // Prefill
    status = gptoss_context_process(context.get());
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to prefill Context object");
        return;
    }
    const std::size_t num_kvcache_tokens = context->num_kv_tokens;

    std::uint64_t rng_seed = 0;
    for (auto _ : state) {
        const std::uint64_t current_rng_seed = rng_seed++;
        context->num_kv_tokens = num_prompt_tokens;
        context->num_tokens = num_prompt_tokens;

        std::array<std::uint32_t, kNumGeneratedTokens> tokens;
        std::size_t num_generated_tokens = 0;
        do {
            std::size_t num_current_generated_tokens = 0;
            status = gptoss_context_sample(context.get(), /*temperature=*/1.0f, /*rng_state=*/current_rng_seed,
                /*max_tokens=*/kNumGeneratedTokens - num_generated_tokens, tokens.data(), &num_current_generated_tokens);
            if (status != gptoss_status_success) {
                state.SkipWithError("failed to sample from the Context object");
                return;
            }
            num_generated_tokens += num_current_generated_tokens;
        } while (num_generated_tokens < kNumGeneratedTokens);
    }

    state.counters["generations"] =
        benchmark::Counter(state.iterations(), benchmark::Counter::kIsRate);
    state.counters["tokens"] =
        benchmark::Counter(state.iterations() * kNumGeneratedTokens, benchmark::Counter::kIsRate);
}

static void MlpOutThreadgroupSizeArguments(benchmark::internal::Benchmark* b) {
    b->ArgNames({"tgsize"});
    for (auto threadgroup_size = 64; threadgroup_size <= 1024; threadgroup_size += 64) {
        const auto num_simdgroups = threadgroup_size / 32;
        if (5760 % num_simdgroups != 0) {
            // Skip incompatible threadgroup sizes
            continue;
        }
        b->Args({threadgroup_size});
    }
}

BENCHMARK_CAPTURE(mlp_out_tgsize, gpt_oss_20b, "GPT_OSS_20B_PATH")
    ->UseRealTime()->Unit(benchmark::kMillisecond)->Apply(MlpOutThreadgroupSizeArguments);
BENCHMARK_CAPTURE(mlp_out_tgsize, gpt_oss_120b, "GPT_OSS_120B_PATH")
    ->UseRealTime()->Unit(benchmark::kMillisecond)->Apply(MlpOutThreadgroupSizeArguments);

static void mlp_acc_tgsize(benchmark::State& state, const char* env_var_name) {
    const char* model_path = getenv(env_var_name);
    if (model_path == NULL) {
        state.SkipWithError(std::format("environment variable {} is not set", env_var_name));
        return;
    }

    gptoss_model_t model_ptr = nullptr;
    gptoss_status status = gptoss_model_create_from_file(model_path, &model_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError(std::format("failed to load model from file {}", model_path));
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_model_t>, decltype(&gptoss_model_release)> model(model_ptr, gptoss_model_release);
    model->mlp_acc_threadgroup_size = static_cast<std::size_t>(state.range(0));

    gptoss_context_t context_ptr = nullptr;
    status = gptoss_context_create(model.get(), /*context_length=*/0, /*max_batch_tokens=*/0, &context_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to create Context object");
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_context_t>, decltype(&gptoss_context_release)> context(context_ptr, gptoss_context_release);

    const char* prompt = "why did the chicken cross the road?";
    std::size_t num_prompt_tokens = 0;
    status = gptoss_context_append_chars(context.get(), prompt, strlen(prompt), &num_prompt_tokens);
    if (status != gptoss_status_success) {
        state.SkipWithError(std::format("failed to tokenize prompt \"{}\"", prompt));
        return;
    }

    // Prefill
    status = gptoss_context_process(context.get());
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to prefill Context object");
        return;
    }
    const std::size_t num_kvcache_tokens = context->num_kv_tokens;

    std::uint64_t rng_seed = 0;
    for (auto _ : state) {
        const std::uint64_t current_rng_seed = rng_seed++;
        context->num_kv_tokens = num_prompt_tokens;
        context->num_tokens = num_prompt_tokens;

        std::array<std::uint32_t, kNumGeneratedTokens> tokens;
        std::size_t num_generated_tokens = 0;
        do {
            std::size_t num_current_generated_tokens = 0;
            status = gptoss_context_sample(context.get(), /*temperature=*/1.0f, /*rng_state=*/current_rng_seed,
                /*max_tokens=*/kNumGeneratedTokens - num_generated_tokens, tokens.data(), &num_current_generated_tokens);
            if (status != gptoss_status_success) {
                state.SkipWithError("failed to sample from the Context object");
                return;
            }
            num_generated_tokens += num_current_generated_tokens;
        } while (num_generated_tokens < kNumGeneratedTokens);
    }

    state.counters["generations"] =
        benchmark::Counter(state.iterations(), benchmark::Counter::kIsRate);
    state.counters["tokens"] =
        benchmark::Counter(state.iterations() * kNumGeneratedTokens, benchmark::Counter::kIsRate);
}

static void MlpAccThreadgroupSizeArguments(benchmark::internal::Benchmark* b) {
    b->ArgNames({"tgsize"});
    for (auto threadgroup_size = 32; threadgroup_size <= 1024; threadgroup_size += 32) {
        b->Args({threadgroup_size});
    }
}

BENCHMARK_CAPTURE(mlp_acc_tgsize, gpt_oss_20b, "GPT_OSS_20B_PATH")
    ->UseRealTime()->Unit(benchmark::kMillisecond)->Apply(MlpAccThreadgroupSizeArguments);
BENCHMARK_CAPTURE(mlp_acc_tgsize, gpt_oss_120b, "GPT_OSS_120B_PATH")
    ->UseRealTime()->Unit(benchmark::kMillisecond)->Apply(MlpAccThreadgroupSizeArguments);

static void unembedding_tgsize(benchmark::State& state, const char* env_var_name) {
    const char* model_path = getenv(env_var_name);
    if (model_path == NULL) {
        state.SkipWithError(std::format("environment variable {} is not set", env_var_name));
        return;
    }

    gptoss_model_t model_ptr = nullptr;
    gptoss_status status = gptoss_model_create_from_file(model_path, &model_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError(std::format("failed to load model from file {}", model_path));
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_model_t>, decltype(&gptoss_model_release)> model(model_ptr, gptoss_model_release);
    model->unembedding_threadgroup_size = static_cast<std::size_t>(state.range(0));

    gptoss_context_t context_ptr = nullptr;
    status = gptoss_context_create(model.get(), /*context_length=*/0, /*max_batch_tokens=*/0, &context_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to create Context object");
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_context_t>, decltype(&gptoss_context_release)> context(context_ptr, gptoss_context_release);

    const char* prompt = "why did the chicken cross the road?";
    std::size_t num_prompt_tokens = 0;
    status = gptoss_context_append_chars(context.get(), prompt, strlen(prompt), &num_prompt_tokens);
    if (status != gptoss_status_success) {
        state.SkipWithError(std::format("failed to tokenize prompt \"{}\"", prompt));
        return;
    }

    // Prefill
    status = gptoss_context_process(context.get());
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to prefill Context object");
        return;
    }
    const std::size_t num_kvcache_tokens = context->num_kv_tokens;

    std::uint64_t rng_seed = 0;
    for (auto _ : state) {
        const std::uint64_t current_rng_seed = rng_seed++;
        context->num_kv_tokens = num_prompt_tokens;
        context->num_tokens = num_prompt_tokens;

        std::array<std::uint32_t, kNumGeneratedTokens> tokens;
        std::size_t num_generated_tokens = 0;
        do {
            std::size_t num_current_generated_tokens = 0;
            status = gptoss_context_sample(context.get(), /*temperature=*/1.0f, /*rng_state=*/current_rng_seed,
                /*max_tokens=*/kNumGeneratedTokens - num_generated_tokens, tokens.data(), &num_current_generated_tokens);
            if (status != gptoss_status_success) {
                state.SkipWithError("failed to sample from the Context object");
                return;
            }
            num_generated_tokens += num_current_generated_tokens;
        } while (num_generated_tokens < kNumGeneratedTokens);
    }

    state.counters["generations"] =
        benchmark::Counter(state.iterations(), benchmark::Counter::kIsRate);
    state.counters["tokens"] =
        benchmark::Counter(state.iterations() * kNumGeneratedTokens, benchmark::Counter::kIsRate);
}

static void UnembeddingThreadgroupSizeArguments(benchmark::internal::Benchmark* b) {
    b->ArgNames({"tgsize"});
    for (auto threadgroup_size = 32; threadgroup_size <= 1024; threadgroup_size += 32) {
        b->Args({threadgroup_size});
    }
}

BENCHMARK_CAPTURE(unembedding_tgsize, gpt_oss_20b, "GPT_OSS_20B_PATH")
    ->UseRealTime()->Unit(benchmark::kMillisecond)->Apply(UnembeddingThreadgroupSizeArguments);
BENCHMARK_CAPTURE(unembedding_tgsize, gpt_oss_120b, "GPT_OSS_120B_PATH")
    ->UseRealTime()->Unit(benchmark::kMillisecond)->Apply(UnembeddingThreadgroupSizeArguments);

BENCHMARK_MAIN();


================================================
FILE: gpt_oss/metal/benchmark/end-to-end.cc
================================================
#include <gpt-oss.h>
#include <internal/model.h>

#include <array>
#include <cstddef>
#include <cstdint>
#include <format>
#include <fstream>
#include <limits>
#include <memory>
#include <string>
#include <type_traits>

#include <benchmark/benchmark.h>

constexpr std::uint32_t kNumGeneratedTokens = 100;

static void end2end_decode(benchmark::State& state, const char* env_var_name) {
    const char* model_path = getenv(env_var_name);
    if (model_path == NULL) {
        state.SkipWithError(std::format("environment variable {} is not set", env_var_name));
        return;
    }

    gptoss_model_t model_ptr = nullptr;
    gptoss_status status = gptoss_model_create_from_file(model_path, &model_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError(std::format("failed to load model from file {}", model_path));
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_model_t>, decltype(&gptoss_model_release)> model(model_ptr, gptoss_model_release);

    gptoss_context_t context_ptr = nullptr;
    status = gptoss_context_create(model.get(), /*context_length=*/0, /*max_batch_tokens=*/0, &context_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to create Context object");
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_context_t>, decltype(&gptoss_context_release)> context(context_ptr, gptoss_context_release);

    const char* prompt = "why did the chicken cross the road?";
    std::size_t num_prompt_tokens = 0;
    status = gptoss_context_append_chars(context.get(), prompt, strlen(prompt), &num_prompt_tokens);
    if (status != gptoss_status_success) {
        state.SkipWithError(std::format("failed to tokenize prompt \"{}\"", prompt));
        return;
    }

    // Prefill
    status = gptoss_context_process(context.get());
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to prefill Context object");
        return;
    }
    std::uint64_t rng_seed = 0;

    for (auto _ : state) {
        const std::uint64_t current_rng_seed = rng_seed++;
        context->num_kv_tokens = num_prompt_tokens;
        context->num_tokens = num_prompt_tokens;

        std::array<std::uint32_t, kNumGeneratedTokens> tokens;
        std::size_t num_generated_tokens = 0;
        do {
            std::size_t num_current_generated_tokens = 0;
            status = gptoss_context_sample(context.get(), /*temperature=*/1.0f, /*rng_state=*/current_rng_seed,
                                           /*max_tokens=*/kNumGeneratedTokens - num_generated_tokens, tokens.data(), &num_current_generated_tokens);
            if (status != gptoss_status_success) {
                state.SkipWithError("failed to sample from the Context object");
                return;
            }
            num_generated_tokens += num_current_generated_tokens;
        } while (num_generated_tokens < kNumGeneratedTokens);
    }

    state.counters["generations"] =
        benchmark::Counter(state.iterations(), benchmark::Counter::kIsRate);
    state.counters["tokens"] =
        benchmark::Counter(state.iterations() * kNumGeneratedTokens, benchmark::Counter::kIsRate);
}

static void end2end_prefill(benchmark::State& state,
                            const char* model_path_env_var_name,
                            const char* prompt_env_var_name,
                            size_t context_length = 0) {
    const char* model_path = getenv(model_path_env_var_name);
    if (model_path == NULL) {
        state.SkipWithError(std::format("environment variable {} is not set",
                                        model_path_env_var_name));
        return;
    }

    const char* prompt_file_path = getenv(prompt_env_var_name);
    if (prompt_file_path == NULL) {
        state.SkipWithError(std::format("environment variable {} is not set",
                                        prompt_env_var_name));
        return;
    }

    // Read prompt contents from file into a std::string
    std::ifstream prompt_file(prompt_file_path,
                              std::ios::in | std::ios::binary);
    if (!prompt_file) {
        state.SkipWithError(
            std::format("failed to open prompt file {}", prompt_file_path));
        return;
    }
    std::string prompt_str;
    prompt_file.seekg(0, std::ios::end);
    std::streampos file_size = prompt_file.tellg();
    if (file_size < 0) {
        state.SkipWithError(std::format("failed to read prompt file size {}",
                                        prompt_file_path));
        return;
    }
    prompt_str.resize(static_cast<std::size_t>(file_size));
    prompt_file.seekg(0, std::ios::beg);
    if (file_size > 0) {
        prompt_file.read(prompt_str.data(), file_size);
    }
    if (!prompt_file) {
        state.SkipWithError(
            std::format("failed to read prompt file {}", prompt_file_path));
        return;
    }

    gptoss_model_t model_ptr = nullptr;
    gptoss_status status = gptoss_model_create_from_file(model_path, &model_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError(
            std::format("failed to load model from file {}", model_path));
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_model_t>,
                    decltype(&gptoss_model_release)>
        model(model_ptr, gptoss_model_release);

    gptoss_tokenizer_t tokenizer_ptr = nullptr;
    status = gptoss_model_get_tokenizer(model.get(), &tokenizer_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to retrieve Tokenizer");
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_tokenizer_t>,
                    decltype(&gptoss_tokenizer_release)>
        tokenizer(tokenizer_ptr, gptoss_tokenizer_release);

    gptoss_context_t context_ptr = nullptr;
    status = gptoss_context_create(model.get(),
                                   /*context_lenght=*/0,
                                   /*max_batch_tokens=*/1024,
                                   &context_ptr);
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to create Context object");
        return;
    }
    std::unique_ptr<std::remove_pointer_t<gptoss_context_t>,
                    decltype(&gptoss_context_release)>
        context(context_ptr, gptoss_context_release);

    const char* prompt = prompt_str.c_str();
    status = gptoss_context_append_chars(context.get(), prompt,
                                         prompt_str.size(), nullptr);
    if (status != gptoss_status_success) {
        state.SkipWithError(std::format(
            "failed to tokenize prompt from file {}", prompt_file_path));
        return;
    }

    size_t num_tokens;
    status = gptoss_context_get_num_tokens(context.get(), &num_tokens);
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to get number of tokens");
        return;
    }
    if (context_length != 0) {
        assert(context_length <= num_tokens);
        context->num_tokens = context_length;
    }
    status = gptoss_context_get_num_tokens(context.get(), &num_tokens);
    if (status != gptoss_status_success) {
        state.SkipWithError("failed to get number of tokens");
        return;
    }
    // Prefill
    for (auto _ : state) {
        status = gptoss_context_process(context.get());
        if (status != gptoss_status_success) {
            state.SkipWithError("failed to prefill Context object");
            return;
        }
        context->num_kv_tokens = 0;
    }

    state.counters["tokens"] = num_tokens;
    state.counters["tokens/s"] = benchmark::Counter(
        state.iterations() * num_tokens, benchmark::Counter::kIsRate);
}

// Decode end-to-end benchmark
BENCHMARK_CAPTURE(end2end_decode, gpt_oss_20b_decode, "GPT_OSS_20B_PATH")
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(end2end_decode, gpt_oss_120b_decode, "GPT_OSS_120B_PATH")
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);

// Prefill end-to-end benchmark
BENCHMARK_CAPTURE(end2end_prefill, gpt_oss_120b_prefill_1024,
                  "GPT_OSS_120B_PATH", "GPT_OSS_PROMPT_FILE_PATH", 1024)
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(end2end_prefill, gpt_oss_20b_prefill_1024, "GPT_OSS_20B_PATH",
                  "GPT_OSS_PROMPT_FILE_PATH", 1024)
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);

BENCHMARK_CAPTURE(end2end_prefill, gpt_oss_120b_prefill_3072,
                  "GPT_OSS_120B_PATH", "GPT_OSS_PROMPT_FILE_PATH", 3072)
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);
BENCHMARK_CAPTURE(end2end_prefill, gpt_oss_20b_prefill_3072, "GPT_OSS_20B_PATH",
                  "GPT_OSS_PROMPT_FILE_PATH", 3072)
    ->UseRealTime()
    ->Unit(benchmark::kMillisecond);

BENCHMARK_MAIN();


================================================
FILE: gpt_oss/metal/benchmark/f32-bf16w-rmsnorm.cc
================================================
#include <gpt-oss.h>
#include <internal/datatype.h>
#include <internal/metal.hpp>
#include <internal/metal-kernels.h>

#include <cstring>

#include <benchmark/benchmark.h>

using gptoss::Check;
using namespace gptoss::metal;

constexpr float kEpsilon = 1.0e-5f;
constexpr uint64_t kSeed = UINT64_C(1019827666124465388);

static void f32_bf16w_rnsnorm(benchmark::State& state) {
    const size_t num_tokens = 1;
    const size_t num_channels = state.range(0);

    Device device;
    CommandQueue command_queue{device};
    Library library{device};
    Function f32_fill_random_fn{library, "gptoss_f32_fill_random"};
    Function bf16_fill_random_fn{library, "gptoss_bf16_fill_random"};
    Function f32_bf16w_rmsnorm_fn{library, "gptoss_f32_bf16w_rmsnorm"};
    Buffer input_buffer{device, num_tokens * num_channels * sizeof(float)};
    Buffer weight_buffer{device, num_channels * sizeof(gptoss_bfloat16)};
    Buffer output_buffer{device, num_tokens * num_channels * sizeof(float)};
    Buffer control_buffer{device, sizeof(gptoss_control)};
    std::memset(control_buffer.ptr(), 0, sizeof(gptoss_control));

    {
        CommandBuffer command_buffer{command_queue};

        size_t offset = 0;
        Check(gptoss_metal_command_buffer_encode_launch_f32_fill_random(
                command_buffer.handle(),
                f32_fill_random_fn.handle(),
                /*threadgroup_size=*/0,
                /*max_threadgroups=*/10,
                /*output_buffer=*/input_buffer.handle(),
                /*output_offset=*/0,
                num_channels, kSeed, offset, /*min=*/-1.0f, /*max=*/1.0),
            "gptoss_metal_command_buffer_encode_launch_f32_fill_random");
        offset += num_channels;

        Check(gptoss_metal_command_buffer_encode_launch_bf16_fill_random(
                command_buffer.handle(),
                bf16_fill_random_fn.handle(),
                /*threadgroup_size=*/0,
                /*max_threadgroups=*/10,
                /*output_buffer=*/weight_buffer.handle(),
                /*output_offset=*/0,
                num_channels, kSeed, offset, /*min=*/-1.0f, /*max=*/1.0),
            "gptoss_metal_command_buffer_encode_launch_bf16_fill_random");
        offset += num_channels;

        command_buffer.commit();
        command_buffer.wait_completion();
    }

    for (auto _ : state) {
        CommandBuffer command_buffer{command_queue};

        Check(gptoss_metal_command_buffer_encode_launch_f32_bf16w_rmsnorm(
                command_buffer.handle(),
                f32_bf16w_rmsnorm_fn.handle(),
                input_buffer.handle(),
                /*input_offset=*/0,
                weight_buffer.handle(),
                /*weight_offset=*/0,
                output_buffer.handle(),
                /*output_offset=*/0,
                control_buffer.handle(),
                /*control_offset=*/0,
                num_tokens,
                num_channels,
                kEpsilon),
            "gptoss_metal_command_buffer_encode_launch_f32_bf16w_rmsnorm");

        command_buffer.commit();
        const double elapsed_seconds = command_buffer.wait_completion();
        state.SetIterationTime(elapsed_seconds);
    }

    const size_t num_elements = num_tokens * num_channels;
    state.counters["elements"] =
        benchmark::Counter(state.iterations() * num_elements,
                           benchmark::Counter::kIsRate);

    const int64_t bytes_per_iteration = input_buffer.size() + weight_buffer.size() + output_buffer.size();
    state.counters["bytes"] =
        benchmark::Counter(state.iterations() * bytes_per_iteration,
                           benchmark::Counter::kIsRate);
}

BENCHMARK(f32_bf16w_rnsnorm)->Arg(2880)->UseManualTime()->Unit(benchmark::kMicrosecond);

BENCHMARK_MAIN();


================================================
FILE: gpt_oss/metal/benchmark/f32-random.cc
================================================
#include <gpt-oss.h>
#include <internal/metal.hpp>
#include <internal/metal-kernels.h>

#include <benchmark/benchmark.h>

using gptoss::Check;
using namespace gptoss::metal;

static void f32_fill_random(benchmark::State& state) {
    const size_t numel = state.range(0);

    Device device;
    CommandQueue command_queue{device};
    Library library{device};
    Function f32_fill_random_fn{library, "gptoss_f32_fill_random"};
    Buffer buffer{device, numel * sizeof(float)};

    constexpr uint64_t seed = UINT64_C(1019827666124465388);
    constexpr uint64_t offset = UINT64_C(12345678901234567890);
    const float min = -1.0f;
    const float max = 7.0f;
    for (auto _ : state) {
        CommandBuffer command_buffer{command_queue};

        Check(gptoss_metal_command_buffer_encode_launch_f32_fill_random(
                command_buffer.handle(),
                f32_fill_random_fn.handle(),
                /*threadgroup_size=*/0,
                /*max_threadgroups=*/120,
                /*output_buffer=*/buffer.handle(),
                /*output_offset=*/0,
                numel, seed, offset, min, max),
            "gptoss_metal_command_buffer_encode_launch_f32_fill_random");

        command_buffer.commit();
        const double elapsed_seconds = command_buffer.wait_completion();
        state.SetIterationTime(elapsed_seconds);
    }
    
    const int64_t elements_per_iteration = numel;
    state.counters["elements"] =
        benchmark::Counter(state.iterations() * elements_per_iteration,
                           benchmark::Counter::kIsRate);

    const int64_t bytes_per_iteration = numel * sizeof(float);
    state.counters["bytes"] =
        benchmark::Counter(state.iterations() * bytes_per_iteration,
                           benchmark::Counter::kIsRate);
}

constexpr int64_t giga = INT64_C(1073741824);
BENCHMARK(f32_fill_random)->Arg(2 * giga)->UseManualTime()->Unit(benchmark::kMicrosecond);

BENCHMARK_MAIN();


================================================
FILE: gpt_oss/metal/benchmark/mf4-f32-convert.cc
================================================
#include <gpt-oss.h>
#include <internal/datatype.h>
#include <internal/metal.hpp>
#include <internal/metal-kernels.h>

#include <cstring>

#include <benchmark/benchmark.h>

using gptoss::Check;
using namespace gptoss::metal;

static void mf4_f32_convert(benchmark::State& state) {
    const size_t num_blocks = state.range(0);
    const size_t num_elements = num_blocks * 32;
    const size_t num_bytes = num_elements / 2;

    Device device;
    CommandQueue command_queue{device};
    Library library{device};
    Function mf4_f32_convert_fn{library, "gptoss_mf4_f32_convert"};
    Buffer block_buffer{device, num_bytes};
    Buffer scale_buffer{device, num_blocks * sizeof(gptoss_float8ue8m0)};
    Buffer output_buffer{device, num_elements * sizeof(float)};

    std::memset(block_buffer.ptr(), 0x91, num_bytes);  // force subnormals
    std::memset(scale_buffer.ptr(), 128, num_blocks * sizeof(uint8_t));  // scale = 2.0

    for (auto _ : state) {
        CommandBuffer command_buffer{command_queue};

        Check(gptoss_metal_command_buffer_encode_launch_mf4_f32_convert(
                command_buffer.handle(),
                mf4_f32_convert_fn.handle(),
                /*threadgroup_size=*/0,
                /*max_threadgroups=*/120,
                block_buffer.handle(),
                scale_buffer.handle(),
                output_buffer.handle(),
                num_elements),
            "gptoss_metal_command_buffer_encode_launch_mf4_f32_convert");

        command_buffer.commit();
        const double elapsed_seconds = command_buffer.wait_completion();
        state.SetIterationTime(elapsed_seconds);
    }

    state.counters["blocks"] =
        benchmark::Counter(state.iterations() * num_blocks,
                           benchmark::Counter::kIsRate);

    state.counters["elements"] =
        benchmark::Counter(state.iterations() * num_elements,
                           benchmark::Counter::kIsRate);

    const int64_t bytes_per_iteration = num_bytes + num_blocks + num_elements * sizeof(float);
    state.counters["bytes"] =
        benchmark::Counter(state.iterations() * bytes_per_iteration,
                           benchmark::Counter::kIsRate);
}

constexpr int64_t mega = INT64_C(1048576);
BENCHMARK(mf4_f32_convert)->Arg(256 * mega)->UseManualTime()->Unit(benchmark::kMicrosecond);

BENCHMARK_MAIN();


================================================
FILE: gpt_oss/metal/benchmark/u32-random.cc
================================================
#include <gpt-oss.h>
#include <internal/metal.hpp>
#include <internal/metal-kernels.h>

#include <benchmark/benchmark.h>

using gptoss::Check;
using namespace gptoss::metal;

static void u32_fill_random(benchmark::State& state) {
    const size_t numel = state.range(0);

    Device device;
    CommandQueue command_queue{device};
    Library library{device};
    Function u32_fill_random_fn{library, "gptoss_u32_fill_random"};
    Buffer buffer{device, numel * sizeof(float)};

    constexpr uint64_t seed = UINT64_C(1019827666124465388);
    constexpr uint64_t offset = UINT64_C(12345678901234567890);
    for (auto _ : state) {
        CommandBuffer command_buffer{command_queue};

        Check(gptoss_metal_command_buffer_encode_launch_u32_fill_random(
                command_buffer.handle(),
                u32_fill_random_fn.handle(),
                /*threadgroup_size=*/0,
                /*max_threadgroups=*/120,
                /*output_buffer=*/buffer.handle(),
                /*output_offset=*/0,
                numel, seed, offset),
            "gptoss_metal_command_buffer_encode_launch_u32_fill_random");

        command_buffer.commit();
        const double elapsed_seconds = command_buffer.wait_completion();
        state.SetIterationTime(elapsed_seconds);
    }
    
    const int64_t elements_per_iteration = numel;
    state.counters["elements"] =
        benchmark::Counter(state.iterations() * elements_per_iteration,
                           benchmark::Counter::kIsRate);

    const int64_t bytes_per_iteration = numel * sizeof(float);
    state.counters["bytes"] =
        benchmark::Counter(state.iterations() * bytes_per_iteration,
                           benchmark::Counter::kIsRate);
}

constexpr int64_t giga = INT64_C(1073741824);
BENCHMARK(u32_fill_random)->Arg(2 * giga)->UseManualTime()->Unit(benchmark::kMicrosecond);

BENCHMARK_MAIN();


================================================
FILE: gpt_oss/metal/examples/chat.py
================================================
#!/usr/bin/env python

import argparse
import sys

from datetime import date
from gpt_oss.metal import Context, Model


DEFAULT_PROMPT = f"""You are ChatGPT, a large language model trained by OpenAI.
Knowledge cutoff: 2024-06
Current date: {date.today().isoformat()}

reasoning effort high

# Valid channels: analysis, final. Channel must be included for every message."""


parser = argparse.ArgumentParser(description="Chat with gpt-oss", formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument("model", metavar="PATH", type=str, help="Path to gpt-oss model in Metal inference format")
parser.add_argument("--prompt", type=str, default=DEFAULT_PROMPT, help="System prompt")
parser.add_argument(
    "--context-length", type=int, default=0, help="The maximum context length"
)
parser.add_argument(
    "--temperature", type=float, default=1.0, help="Sampling temperature"
)
parser.add_argument(
    "--seed", type=int, default=0, help="Sampling seed"
)


GREY = "\33[90m"
BOLD = "\33[1m"
RESET = "\33[0m"


def main(args):
    options = parser.parse_args(args)
    model = Model(options.model)
    tokenizer = model.tokenizer
    start_token = tokenizer.encode_special_token("<|start|>")
    message_token = tokenizer.encode_special_token("<|message|>")
    end_token = tokenizer.encode_special_token("<|end|>")
    return_token = tokenizer.encode_special_token("<|return|>")
    channel_token = tokenizer.encode_special_token("<|channel|>")

    context = Context(model, context_length=options.context_length)
    context.append(start_token)
    context.append("system")
    context.append(message_token)
    context.append(options.prompt)
    context.append(end_token)

    while True:
        context.append(start_token)
        context.append("user")
        context.append(message_token)
        message = input(f"{BOLD}User:{RESET} ").rstrip()
        context.append(message)
        context.append(end_token)
        print(f"{BOLD}Assistant:{RESET} {GREY}", end="", flush=True)
        context.append(start_token)
        context.append("assistant")
        context.append(channel_token)

        inside_start_block = True
        inside_channel_block = True
        role = "assistant"
        channel = ""
        while True:
            token = context.sample(
                temperature=options.temperature,
                seed=options.seed,
            )
            context.append(token)
            if token == return_token:
                print(flush=True)
                break
            elif token == start_token:
                inside_start_block = True
                role = ""
                channel = ""
            elif token == message_token:
                inside_start_block = False
                inside_channel_block = False
                if channel == "analysis":
                    print(f"{GREY}", end="", flush=True)
            elif token == end_token:
                print(f"{RESET}", flush=True)
            elif token == channel_token:
                inside_channel_block = True
            elif token < tokenizer.num_text_tokens:
                if inside_channel_block:
                    channel += str(tokenizer.decode(token), encoding="utf-8")
                elif inside_start_block:
                    role += str(tokenizer.decode(token), encoding="utf-8")
                else:
                    sys.stdout.buffer.write(tokenizer.decode(token))
                    sys.stdout.buffer.flush()


if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: gpt_oss/metal/examples/generate.py
================================================
#!/usr/bin/env python

import argparse
import sys

from gpt_oss.metal import Context, Model


parser = argparse.ArgumentParser(description='Chat with gpt-oss', formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('model', metavar='PATH', type=str, help='Path to gpt-oss checkpoint')
parser.add_argument('-p', '--prompt', type=str, required=True, help='Prompt')
parser.add_argument('-l', '--limit', type=int, default=100, help='Number of tokens to generate')
parser.add_argument('--context-length', type=int, default=0, help='The maximum context length')


def main(args):
    options = parser.parse_args(args)
    model = Model(options.model)

    context = Context(model, context_length=options.context_length)
    context.append(options.prompt)
    print(context.tokens)
    prompt_tokens = context.num_tokens

    tokenizer = model.tokenizer

    while context.num_tokens - prompt_tokens < options.limit:
        token = context.sample()
        context.append(token)
        print(str(tokenizer.decode(token), encoding="utf-8"), end='', flush=True)


if __name__ == '__main__':
    main(sys.argv[1:])


================================================
FILE: gpt_oss/metal/include/gpt-oss/functions.h
================================================
#pragma once

#include <stddef.h>
#include <stdint.h>

#include <gpt-oss/macros.h>
#include <gpt-oss/types.h>

#ifdef __cplusplus
extern "C" {
#endif

/*
 * Creates a Model object from a file in the filesystem.
 *
 * @param path Path to the file containing the model in GPT-OSS format.
 * @param model_out Pointer to the Model object that will be created. Must be released with gptoss_release_model.
 *
 * On success, returns gptoss_status_success and saves a pointer to the created Model in the model_out argument.
 * On failure, returns an error code and stores null pointer in the model_out argument.
 */
enum gptoss_status GPTOSS_ABI gptoss_model_create_from_file(
    const char* path,
    gptoss_model_t* model_out);

/*
 * Query the Tokenizer object associated with the Model.
 *
 * @param model Pointer to the Model object created by gptoss_model_create_from_file.
 * @param tokenizer_out Pointer to the variable where the Tokenizer reference will be stored.
 *
 * On success, returns gptoss_status_success and stores reference to the Tokenizer object in the tokenizer_out argument.
 * On failure, returns an error code and stores NULL in the tokenizer_out argument.
 */
enum gptoss_status GPTOSS_ABI gptoss_model_get_tokenizer(
    gptoss_model_t model,
    gptoss_tokenizer_t* tokenizer_out);

/*
 * Query the maximum context length supported by the Model.
 *
 * @param model Pointer to the Model object created by gptoss_model_create_from_file.
 * @param max_context_length_out Pointer to the variable where the maximum context length will be stored.
 *
 * On success, returns gptoss_status_success and stores maximum context length in the max_context_length_out argument.
 * On failure, returns an error code and leaves the value specified by max_context_length_out unchanged.
 */
enum gptoss_status GPTOSS_ABI gptoss_model_get_max_context_length(
    gptoss_model_t model,
    size_t* max_context_length_out);

/*
 * Increments a Model object's reference count.
 *
 * @param model Pointer to the Model object created by gptoss_model_create_from_file.
 *
 * On success, returns gptoss_status_success, otherwise returns an error code.
 */
enum gptoss_status GPTOSS_ABI gptoss_model_retain(
    gptoss_model_t model);

/*
 * Decrements a Model object's reference count and possibly release associated resources.
 *
 * @param model Pointer to the Model object created by gptoss_model_create_from_file.
 *
 * On success, returns gptoss_status_success, otherwise returns an error code.
 */
enum gptoss_status GPTOSS_ABI gptoss_model_release(
    gptoss_model_t model);

/*
 * Query the token ID for a special token in the Tokenizer vocabulary.
 *
 * @param tokenizer Pointer to the Tokenizer object created by gptoss_model_get_tokenizer.
 * @param token_type Type of the special token to query an ID for.
 * @param token_id_out Pointer to the variable where the token ID will be stored.
 *
 * On success, returns gptoss_status_success and stores the token ID in the token_id_out argument.
 * On failure, returns an error code and leaves the value specified by token_id_out unchanged.
 */
enum gptoss_status GPTOSS_ABI gptoss_tokenizer_get_special_token_id(
    gptoss_tokenizer_t tokenizer,
    enum gptoss_special_token token_type,
    uint32_t* token_id_out);

/*
 * Query the number of text tokens in the Tokenizer vocabulary.
 *
 * @param tokenizer Pointer to the Tokenizer object created by gptoss_model_get_tokenizer.
 * @param num_text_tokens_out Pointer to the variable where the number of text tokens will be stored.
 *
 * On success, returns gptoss_status_success and stores the number of text tokens in the num_text_tokens_out argument.
 * On failure, returns an error code and leaves the value specified by num_text_tokens_out unchanged.
 */
enum gptoss_status GPTOSS_ABI gptoss_tokenizer_get_num_text_tokens(
    gptoss_tokenizer_t tokenizer,
    uint32_t* num_text_tokens_out);

/*
 * Query the number of special tokens in the Tokenizer vocabulary.
 *
 * @param tokenizer Pointer to the Tokenizer object created by gptoss_model_get_tokenizer.
 * @param num_special_tokens_out Pointer to the variable where the number of special tokens will be stored.
 *
 * On success, returns gptoss_status_success and stores the number of text tokens in the num_special_tokens_out argument.
 * On failure, returns an error code and leaves the value specified by num_special_tokens_out unchanged.
 */
enum gptoss_status GPTOSS_ABI gptoss_tokenizer_get_num_special_tokens(
    gptoss_tokenizer_t tokenizer,
    uint32_t* num_special_tokens_out);

/*
 * Query the total number of tokens in the Tokenizer vocabulary.
 *
 * @param tokenizer Pointer to the Tokenizer object created by gptoss_model_get_tokenizer.
 * @param num_tokens_out Pointer to the variable where the total number of tokens will be stored.
 *
 * On success, returns gptoss_status_success and stores the total number of tokens in the num_special_tokens_out argument.
 * On failure, returns an error code and leaves the value specified by num_special_tokens_out unchanged.
 */
enum gptoss_status GPTOSS_ABI gptoss_tokenizer_get_num_tokens(
    gptoss_tokenizer_t tokenizer,
    uint32_t* num_tokens_out);

/*
 * Convert a text token ID to byte representation.
 *
 * @param tokenizer Pointer to the Tokenizer object returned by gptoss_model_get_tokenizer. The lifetime of the returned
 *                  byte representation would match the lifetime of this Tokenizer object.
 * @param token_ptr_out Pointer to the variable where the pointer to the byte representation of the token will be
 *                      stored.
 * @param token_size_out Pointer to the variable where the size of the byte representation of the token will be stored.
 *
 * On success, returns gptoss_status_success and stores pointer and size of the byte representation of the token in the
 *                     token_ptr_out and token_size_out arguments.
 * On failure, returns an error code and leaves the values specified in token_ptr_out and token_size_out unchanged.
 */
enum gptoss_status GPTOSS_ABI gptoss_tokenizer_decode(
    gptoss_tokenizer_t tokenizer,
    uint32_t token_id,
    const void** token_ptr_out,
    size_t* token_size_out);

/*
 * Increments a Tokenizer object's reference count.
 *
 * @param tokenizer Pointer to the Tokenizer object returned by gptoss_model_get_tokenizer.
 *
 * On success, returns gptoss_status_success, otherwise returns an error code.
 */
enum gptoss_status GPTOSS_ABI gptoss_tokenizer_retain(
    gptoss_tokenizer_t tokenizer);

/*
 * Decrements a Tokenizer object's reference count and possibly release associated resources.
 *
 * @param tokenizer Pointer to the Tokenizer object returned by gptoss_model_get_tokenizer.
 *
 * On success, returns gptoss_status_success, otherwise returns an error code.
 */
enum gptoss_status GPTOSS_ABI gptoss_tokenizer_release(
    gptoss_tokenizer_t tokenizer);

/*
 * Creates a Context object for use with the particular Model object.
 *
 * @param model Model object to create a context for.
 * @param context_length Maximum number of tokens in the context.
 *                       Specify 0 to use the maximum context length supported by the model.
 * @param max_batch_size Maximum number of tokens that can be processed in a single batch.
 *                       Larger values may improve prefill performance, but require more memory.
 *                       Specify 0 to use the default value.
 * @param context_out Pointer to the Context object that will be created.
 *                    Must be released with gptoss_release_context.
 *
 * On success, returns gptoss_status_success and saves a pointer to the created Context in the context_out argument.
 * On failure, returns an error code and stores null pointer in the context_out argument.
 */
enum gptoss_status GPTOSS_ABI gptoss_context_create(
    gptoss_model_t model,
    size_t context_length,
    size_t max_batch_tokens,
    gptoss_context_t* context_out);

/*
 * Query the current number of tokens cached in the Context.
 *
 * @param context Pointer to the Context object created by gptoss_context_create.
 * @param num_tokens_out Pointer to the variable where the current number of cached tokens will be stored.
 *
 * On success, returns gptoss_status_success and stores current number of cached tokens in the num_tokens_out argument.
 * On failure, returns an error code and leaves the value specified by num_tokens_out unchanged.
 */
enum gptoss_status GPTOSS_ABI gptoss_context_get_num_tokens(
    gptoss_context_t context,
    size_t* num_tokens_out);

/*
 * Query the maximum number of tokens cached in the Context.
 *
 * @param context Pointer to the Context object created by gptoss_context_create.
 * @param max_tokens_out Pointer to the variable where the maximum number of cached tokens will be stored.
 *
 * On success, returns gptoss_status_success and stores maximum number of cached tokens in the max_tokens_out argument.
 * On failure, returns an error code and leaves the value specified by max_tokens_out unchanged.
 */
enum gptoss_status GPTOSS_ABI gptoss_context_get_max_tokens(
    gptoss_context_t context,
    size_t* max_tokens_out);

/*
 * Query the list of token IDs cached in the Context.
 *
 * @param context Pointer to the Context object created by gptoss_context_create.
 * @param tokens_out Pointer to the array where up to max_tokens_out of cached tokens will be stored.
 * @param max_tokens Maximum capacity of the buffer specified by tokens_out.
 * @param num_tokens_out Pointer to the variable where the actual number of cached tokens will be stored.
 *                       This value can exceed max_tokens if the buffer capacity is insufficient.
 *
 * On success, returns gptoss_status_success and stores cached token IDs in the tokens_out argument and the number of
 * cached tokens in the num_tokens_out argument.
 * On failure, returns an error code and leaves the values specified by tokens_out and num_tokens_out unchanged.
 */
enum gptoss_status GPTOSS_ABI gptoss_context_get_tokens(
    gptoss_context_t context,
    uint32_t* tokens_out,
    size_t max_tokens,
    size_t* num_tokens_out);

/*
 * Tokenize and appends a character string to the Context object.
 *
 * @param context Context object created by gptoss_context_create.
 * @param text Pointer to the character string to tokenizer and append.
 * @param text_length Length of the string, in chars.
 * @param num_tokens_out Optional pointer to the variable where the number of appended tokens will be stored. Ignored if a null pointer is provided.
 *
 * On success, returns gptoss_status_success, otherwise returns an error code.
 */
enum gptoss_status GPTOSS_ABI gptoss_context_append_chars(
    gptoss_context_t context,
    const char* text,
    size_t text_length,
    size_t* num_tokens_out);

/*
 * Appends a list of tokens to the context.
 *
 * @param context Context object created by gptoss_context_create.
 * @param num_tokens Number of tokens to be appended.
 * @param tokens Pointer to the array of tokens to be appended.
 *
 * On success, returns gptoss_status_success, otherwise returns an error code.
 */
enum gptoss_status GPTOSS_ABI gptoss_context_append_tokens(
    gptoss_context_t context,
    size_t num_tokens,
    const uint32_t* tokens);

/*
 * Resets the context, clearing its state.
 *
 * @param context Context object created by gptoss_context_create.
 *
 * On success, returns gptoss_status_success, otherwise returns an error code.
 */
enum gptoss_status GPTOSS_ABI gptoss_context_reset(
    gptoss_context_t context);

/*
 * Pre-process the tokens in the Context and generate probability distribution over the next token.
 *
 * @param context Context object created by gptoss_context_create.
 *
 * On success, returns gptoss_status_success, otherwise returns an error code.
 */
enum gptoss_status GPTOSS_ABI gptoss_context_process(
    gptoss_context_t context);

/*
 * Generate a token probability distribution over the next token conditioned on the Context.
 *
 * @param context Context object created by gptoss_context_create.
 * @param temperature Sampling temperature. Must be non-negative.
 * @param seed Random number generator seed to use for sampling.
 * @param token_out Pointer to the variable where the token ID will be stored.
 *
 * On success, returns gptoss_status_success, otherwise returns an error code.
 */
enum gptoss_status GPTOSS_ABI gptoss_context_sample(
    gptoss_context_t context,
    float temperature,
    uint64_t seed,
    size_t max_tokens,
    uint32_t* tokens_out,
    size_t* num_tokens_out);

/*
 * Increments a Context object's reference count.
 *
 * @param context Pointer to the Context object created by gptoss_create_context.
 *
 * On success, returns gptoss_status_success, otherwise returns an error code.
 */
enum gptoss_status GPTOSS_ABI gptoss_context_retain(
    gptoss_context_t context);

/*
 * Decrements a Context object's reference count and possibly release associated resources.
 *
 * @param context Pointer to the Context object created by gptoss_create_context.
 *
 * On success, returns gptoss_status_success, otherwise returns an error code.
 */
enum gptoss_status GPTOSS_ABI gptoss_context_release(
    gptoss_context_t context);

/*
 * Creates a Sampler object.
 *
 * @param sampler_out Pointer to the Sampler object that will be created.
 *                    Must be released with gptoss_sampler_release.
 *
 * On success, returns gptoss_status_success and saves a pointer to the created Sampler in the sampler_out argument.
 * On failure, returns an error code and stores a null pointer in the sampler_out argument.
 */
enum gptoss_status GPTOSS_ABI gptoss_sampler_create(
    gptoss_sampler_t* sampler_out);

/*
 * Sets the sampling temperature for the Sampler.
 *
 * @param sampler Sampler object created by gptoss_sampler_create.
 * @param temperature Temperature value to be set. Must be in the [0.0, 1.0] range.
 *
 * On success, returns gptoss_status_success, otherwise returns an error code.
 */
enum gptoss_status GPTOSS_ABI gptoss_sampler_set_temperature(
    gptoss_sampler_t sampler,
    float temperature);

/*
 * Sets the Top-P nucleus sampling parameter for the Sampler.
 *
 * @param sampler Sampler object created by gptoss_sampler_create.
 * @param top_p Top-P value to be set. Must be in the (0.0, 1.0] range.
 *
 * On success, returns gptoss_status_success, otherwise returns an error code.
 */
enum gptoss_status GPTOSS_ABI gptoss_sampler_set_top_p(
    gptoss_sampler_t sampler,
    float top_p);

/*
 * Sets the presence penalty for the Sampler.
 *
 * @param sampler Sampler object created by gptoss_sampler_create.
 * @param presence_penalty Presence penalty value to be set. Must be in the [-2.0, 2.0] range.
 *
 * On success, returns gptoss_status_success, otherwise returns an error code.
 */
enum gptoss_status GPTOSS_ABI gptoss_sampler_set_presence_penalty(
    gptoss_sampler_t sampler,
    float presence_penalty);

/*
 * Sets the frequency penalty for the Sampler.
 *
 * @param sampler Sampler object created by gptoss_sampler_create.
 * @param frequency_penalty Frequency penalty value to be set. Must be in the [-2.0, 2.0] range.
 *
 * On success, returns gptoss_status_success, otherwise returns an error code.
 */
enum gptoss_status GPTOSS_ABI gptoss_sampler_set_frequency_penalty(
    gptoss_sampler_t sampler,
    float frequency_penalty);

/*
 * Increments a Sampler object's reference count.
 *
 * @param sampler Pointer to the Sampler object created by gptoss_sampler_create.
 *
 * On success, returns gptoss_status_success, otherwise returns an error code.
 */
enum gptoss_status GPTOSS_ABI gptoss_sampler_retain(
    gptoss_sampler_t sampler);

/*
 * Decrements a Sampler object's reference count and possibly releases associated resources.
 *
 * @param sampler Pointer to the Sampler object created by gptoss_sampler_create.
 *
 * On success, returns gptoss_status_success, otherwise returns an error code.
 */
enum gptoss_status GPTOSS_ABI gptoss_sampler_release(
    gptoss_sampler_t sampler);

#ifdef __cplusplus
}  // extern "C"
#endif


================================================
FILE: gpt_oss/metal/include/gpt-oss/macros.h
================================================
#pragma once

#ifndef GPTOSS_ABI
    #define GPTOSS_ABI
#endif  // GPTOSS_ABI


================================================
FILE: gpt_oss/metal/include/gpt-oss/types.h
================================================
#pragma once

/*
 * Status codes returned by GPT-OSS API functions.
 */
enum gptoss_status {
    gptoss_status_success = 0,
    gptoss_status_invalid_argument = 1,
    gptoss_status_unsupported_argument = 2,
    gptoss_status_invalid_state = 3,
    gptoss_status_io_error = 4,
    gptoss_status_insufficient_memory = 5,
    gptoss_status_insufficient_resources = 6,
    gptoss_status_unsupported_system = 7,
    gptoss_status_context_overflow = 8,
};

enum gptoss_special_token {
    gptoss_special_token_invalid = 0,
    gptoss_special_token_return = 1,
    gptoss_special_token_start = 2,
    gptoss_special_token_message = 3,
    gptoss_special_token_end = 4,
    gptoss_special_token_refusal = 5,
    gptoss_special_token_constrain = 6,
    gptoss_special_token_channel = 7,
    gptoss_special_token_call = 8,
    gptoss_special_token_untrusted = 9,
    gptoss_special_token_end_untrusted = 10,
    gptoss_special_token_max,
};

/*
 * Model object is an opaque container comprised of:
 * - Weights
 * - Temporary buffers required to run the model
 * - Any other resources requires to run the model
 */
typedef struct gptoss_model* gptoss_model_t;

typedef struct gptoss_tokenizer* gptoss_tokenizer_t;

/*
 * Context is an opaque container comprised of:
 * - Input tokens
 * - Distribution over the output tokens
 * - KV cache
 * 
 * Multiple contexts can be created and used with the same model.
 */
typedef struct gptoss_context* gptoss_context_t;

/*
 * Sampler is an opaque container for sampling parameters:
 * - Temperature
 * - Top-p (nucleus sampling)
 * - Frequency penalty
 * - Presence penalty
 *
 * Multiple samplers can be created and used with the same context.
 */
typedef struct gptoss_sampler* gptoss_sampler_t;


================================================
FILE: gpt_oss/metal/include/gpt-oss.h
================================================
#pragma once

#include <gpt-oss/macros.h>
#include <gpt-oss/types.h>
#include <gpt-oss/functions.h>


================================================
FILE: gpt_oss/metal/python/context.c
================================================
#include <Python.h>

#include <gpt-oss.h>

#include "module.h"


static int PyGPTOSSContext_init(PyGPTOSSContext* self, PyObject* args, PyObject* kwargs) {
    static char *kwlist[] = {"model", "context_length", "max_batch_tokens", NULL};
    PyObject* model = NULL;
    Py_ssize_t context_length = 0; // Default to 0 if None
    Py_ssize_t max_batch_tokens = 0; // Default to 0 if None

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!|$ii", kwlist,
                                     &PyGPTOSSModel_Type, &model,
                                     &context_length, &max_batch_tokens))
    {
        return -1;
    }
    if (context_length < 0) {
        PyErr_SetString(PyExc_ValueError, "context_length must be a positive integer");
        return -1;
    }
    if (max_batch_tokens < 0) {
        PyErr_SetString(PyExc_ValueError, "max_batch_tokens must be a positive integer");
        return -1;
    }

    enum gptoss_status status = gptoss_context_create(
        ((const PyGPTOSSModel*) model)->handle,
        (size_t) context_length,
        (size_t) max_batch_tokens,
        &self->handle);
    if (status != gptoss_status_success) {
        // TODO: set exception
        goto error;
    }

    return 0;

error:
    gptoss_context_release(self->handle);
    self->handle = NULL;
    return -1;
}

static void PyGPTOSSContext_dealloc(PyGPTOSSContext* self) {
    (void) gptoss_context_release(self->handle);
    self->handle = NULL;
    PyObject_Del((PyObject*) self);
}

static PyObject* PyGPTOSSContext_copy(PyGPTOSSContext *self) {
    PyGPTOSSContext* copy = (PyGPTOSSContext*) PyObject_New(PyGPTOSSContext, Py_TYPE(self));
    if (copy == NULL) {
        return NULL;
    }

    (void) gptoss_context_retain(self->handle);
    copy->handle = self->handle;
    return (PyObject*) copy;
}

static PyObject* PyGPTOSSContext_append(PyGPTOSSContext* self, PyObject* arg) {
    if (PyBytes_Check(arg)) {
        char* string_ptr = NULL;
        Py_ssize_t string_size = 0;
        if (PyBytes_AsStringAndSize(arg, &string_ptr, &string_size) < 0) {
            return NULL;
        }

        const enum gptoss_status status = gptoss_context_append_chars(
            self->handle, string_ptr, string_size, /*num_tokens_out=*/NULL);
        if (status != gptoss_status_success) {
            // TODO: set exception
            return NULL;
        }

        Py_RETURN_NONE;
    } else if (PyUnicode_Check(arg)) {
        Py_ssize_t string_size = 0;
        const char* string_ptr = PyUnicode_AsUTF8AndSize(arg, &string_size);
        if (string_ptr == NULL) {
            return NULL;
        }

        const enum gptoss_status status = gptoss_context_append_chars(
            self->handle, string_ptr, string_size, /*num_tokens_out=*/NULL);
        if (status != gptoss_status_success) {
            // TODO: set exception
            return NULL;
        }

        Py_RETURN_NONE;
    } else if (PyLong_Check(arg)) {
        const unsigned long token_as_ulong = PyLong_AsUnsignedLong(arg);
        if (token_as_ulong == (unsigned long) -1 && PyErr_Occurred()) {
            return NULL;
        }

        const uint32_t token = (uint32_t) token_as_ulong;
        const enum gptoss_status status = gptoss_context_append_tokens(
            self->handle, /*num_tokens=*/1, &token);
        if (status != gptoss_status_success) {
            // TODO: set exception
            return NULL;
        }

        Py_RETURN_NONE;
    } else {
        PyErr_SetString(PyExc_TypeError, "expected a bytes or integer argument");
        return NULL;
    }
}

static PyObject* PyGPTOSSContext_process(PyGPTOSSContext* self) {
    const enum gptoss_status status = gptoss_context_process(self->handle);
    if (status != gptoss_status_success) {
        // TODO: set exception
        return NULL;
    }

    Py_RETURN_NONE;
}

static PyObject* PyGPTOSSContext_sample(PyGPTOSSContext* self, PyObject* args, PyObject* kwargs) {
    static char *kwlist[] = {"max_output_tokens", "temperature", "seed", NULL};
    PyObject* token_list_obj = NULL;
    uint32_t* token_ptr = NULL;

    unsigned int max_output_tokens = 0;
    unsigned long long seed = 0;
    float temperature = 1.0f;
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "I|$fK", kwlist,
            &max_output_tokens, &temperature, &seed))
    {
        return NULL;
    }

    token_ptr = (uint32_t*) PyMem_Malloc(max_output_tokens * sizeof(uint32_t));
    if (token_ptr == NULL) {
        goto error;
    }

    size_t num_tokens = 0;
    const enum gptoss_status status = gptoss_context_sample(
        self->handle, temperature, (uint64_t) seed,
        (size_t) max_output_tokens, token_ptr, &num_tokens);
    if (status != gptoss_status_success) {
        // TODO: set exception
        goto error;
    }

    token_list_obj = PyList_New((Py_ssize_t) num_tokens);
    if (token_list_obj == NULL) {
        goto error;
    }

    for (size_t t = 0; t < num_tokens; t++) {
        PyObject* token_obj = PyLong_FromUnsignedLong((unsigned long) token_ptr[t]);
        if (token_obj == NULL) {
            goto error;
        }

        PyList_SET_ITEM(token_list_obj, (Py_ssize_t) t, token_obj);
    }
    
    PyMem_Free(token_ptr);
    return token_list_obj;
    
error:
    PyMem_Free(token_ptr);
    Py_XDECREF(token_list_obj);
    return NULL;
}

static PyObject* PyGPTOSSContext_reset(PyGPTOSSContext* self) {
    const enum gptoss_status status = gptoss_context_reset(self->handle);
    if (status != gptoss_status_success) {
        // TODO: set exception
        return NULL;
    }

    Py_RETURN_NONE;
}

static PyMethodDef PyGPTOSSContext_methods[] = {
    {"__copy__", (PyCFunction) PyGPTOSSContext_copy, METH_NOARGS, "Create a copy of the Context"},
    {"append", (PyCFunction) PyGPTOSSContext_append, METH_O, "Append bytes to the Context"},
    {"process", (PyCFunction) PyGPTOSSContext_process, METH_NOARGS, "Process tokens in the Context"},
    {"sample", (PyCFunction) PyGPTOSSContext_sample, METH_VARARGS | METH_KEYWORDS, "Sample token predictions from the Context"},
    {"reset", (PyCFunction) PyGPTOSSContext_reset, METH_NOARGS, "Discard the content of the Context"},
    {NULL},
};

static PyObject* PyGPTOSSContext_get_num_tokens(PyGPTOSSContext* self, void* closure) {
    size_t num_tokens = 0;
    const enum gptoss_status status = gptoss_context_get_num_tokens(self->handle, &num_tokens);
    if (status != gptoss_status_success) {
        // TODO: set exception
        return NULL;
    }

    return PyLong_FromSize_t(num_tokens);
}

static PyObject* PyGPTOSSContext_get_max_tokens(PyGPTOSSContext* self, void* closure) {
    size_t max_tokens = 0;
    const enum gptoss_status status = gptoss_context_get_max_tokens(self->handle, &max_tokens);
    if (status != gptoss_status_success) {
        // TODO: set exception
        return NULL;
    }

    return PyLong_FromSize_t(max_tokens);
}

static PyObject* PyGPTOSSContext_get_tokens(PyGPTOSSContext* self, void* closure) {
    PyObject* token_list_obj = NULL;
    uint32_t* token_ptr = NULL;

    size_t num_tokens = 0;
    gptoss_context_get_tokens(self->handle, /*tokens_out=*/NULL, /*max_tokens=*/0, &num_tokens);

    if (num_tokens != 0) {
        token_ptr = (uint32_t*) PyMem_Malloc(num_tokens * sizeof(uint32_t));
        if (token_ptr == NULL) {
            // TODO: set exception
            goto error;
        }

        enum gptoss_status status = gptoss_context_get_tokens(self->handle, token_ptr, /*max_tokens=*/num_tokens, &num_tokens);
        if (status != gptoss_status_success) {
            // TODO: set exception
            goto error;
        }
    }

    token_list_obj = PyList_New((Py_ssize_t) num_tokens);
    if (token_list_obj == NULL) {
        goto error;
    }

    for (size_t t = 0; t < num_tokens; t++) {
        PyObject* token_obj = PyLong_FromUnsignedLong((unsigned long) token_ptr[t]);
        if (token_obj == NULL) {
            goto error;
        }

        PyList_SET_ITEM(token_list_obj, (Py_ssize_t) t, token_obj);
    }

    PyMem_Free(token_ptr);
    return token_list_obj;

error:
    PyMem_Free(token_ptr);
    Py_XDECREF(token_list_obj);
    return NULL;
}

static PyGetSetDef PyGPTOSSContext_getseters[] = {
    (PyGetSetDef) {
        .name = "num_tokens",
        .get = (getter) PyGPTOSSContext_get_num_tokens,
        .doc = "Current number of tokens in the context",
    },
    (PyGetSetDef) {
        .name = "max_tokens",
        .get = (getter) PyGPTOSSContext_get_max_tokens,
        .doc = "Maximum number of tokens in the context",
    },
    (PyGetSetDef) {
        .name = "tokens",
        .get = (getter) PyGPTOSSContext_get_tokens,
        .doc = "List of token IDs in the context",
    },
    {NULL}  /* Sentinel */
};

PyTypeObject PyGPTOSSContext_Type = {
    PyVarObject_HEAD_INIT(NULL, 0)
    .tp_name = "gptoss.Context",
    .tp_basicsize = sizeof(PyGPTOSSContext),
    .tp_flags = 0
        | Py_TPFLAGS_DEFAULT
        | Py_TPFLAGS_BASETYPE,
    .tp_doc = "Context object",
    .tp_methods = PyGPTOSSContext_methods,
    .tp_getset = PyGPTOSSContext_getseters,
    .tp_new = PyType_GenericNew,
    .tp_init = (initproc) PyGPTOSSContext_init,
    .tp_dealloc = (destructor) PyGPTOSSContext_dealloc,
};


================================================
FILE: gpt_oss/metal/python/model.c
================================================
#include <Python.h>

#include <gpt-oss.h>

#include "module.h"


static int PyGPTOSSModel_init(PyGPTOSSModel* self, PyObject* args, PyObject* kwargs) {
    enum gptoss_status status;
    const char* filepath;

    if (!PyArg_ParseTuple(args, "s", &filepath)) {
        return -1;
    }
    status = gptoss_model_create_from_file(filepath, &self->handle);
    if (status != gptoss_status_success) {
        // TODO: set exception
        return -1;
    }
    return 0;
}

static void PyGPTOSSModel_dealloc(PyGPTOSSModel* self) {
    (void) gptoss_model_release(self->handle);
    self->handle = NULL;
    PyObject_Del((PyObject*) self);
}

static PyObject* PyGPTOSSModel_copy(PyGPTOSSModel* self) {
    PyGPTOSSModel* copy = (PyGPTOSSModel*) PyObject_New(PyGPTOSSModel, Py_TYPE(self));
    if (copy == NULL) {
        return NULL;
    }

    (void) gptoss_model_retain(self->handle);
    copy->handle = self->handle;
    return (PyObject*) copy;
}

static PyMethodDef PyGPTOSSModel_methods[] = {
    {"__copy__", (PyCFunction) PyGPTOSSModel_copy, METH_NOARGS, "Create a copy of the Model"},
    {NULL},
};

static PyObject *PyGPTOSSModel_get_max_context_length(PyGPTOSSModel* self, void* closure) {
    size_t max_context_length = 0;
    const enum gptoss_status status = gptoss_model_get_max_context_length(self->handle, &max_context_length);
    if (status != gptoss_status_success) {
        // TODO: set exception
        return NULL;
    }

    return PyLong_FromSize_t(max_context_length);
}

static PyObject *PyGPTOSSModel_get_tokenizer(PyGPTOSSModel* self, void* closure) {
    PyObject* args = PyTuple_Pack(1, self);
    if (args == NULL) {
        return NULL;
    }

    PyObject* tokenizer = PyObject_CallObject((PyObject*) &PyGPTOSSTokenizer_Type, args);
    Py_DECREF(args);
    return tokenizer;
}

static PyGetSetDef PyGPTOSSModel_getseters[] = {
    (PyGetSetDef) {
        .name = "max_context_length",
        .get = (getter) PyGPTOSSModel_get_max_context_length,
        .doc = "Maximum context length supported by the model",
    },
    (PyGetSetDef) {
        .name = "tokenizer",
        .get = (getter) PyGPTOSSModel_get_tokenizer,
        .doc = "Tokenizer object associated with the model",
    },
    {NULL}  // Sentinel
};

PyTypeObject PyGPTOSSModel_Type = {
    PyVarObject_HEAD_INIT(NULL, 0)
    .tp_name = "gptoss.Model",
    .tp_basicsize = sizeof(PyGPTOSSModel),
    .tp_flags = 0
        | Py_TPFLAGS_DEFAULT
        | Py_TPFLAGS_BASETYPE,
    .tp_doc = "Model object",
    .tp_methods = PyGPTOSSModel_methods,
    .tp_getset = PyGPTOSSModel_getseters,
    .tp_new = PyType_GenericNew,
    .tp_init = (initproc) PyGPTOSSModel_init,
    .tp_dealloc = (destructor) PyGPTOSSModel_dealloc,
};


================================================
FILE: gpt_oss/metal/python/module.c
================================================
#include <Python.h>

#include "module.h"


static PyMethodDef module_methods[] = {
    {NULL, NULL, 0, NULL}
};

static PyModuleDef metal_module = {
    PyModuleDef_HEAD_INIT,
    "_metal",
    "Local GPT-OSS inference",
    -1,
    module_methods
};

PyMODINIT_FUNC PyInit__metal(void) {
    PyObject* module = NULL;
    PyObject* model_type = NULL;
    PyObject* tokenizer_type = NULL;
    PyObject* context_type = NULL;

    if (PyType_Ready(&PyGPTOSSModel_Type) < 0) {
        goto error;
    }
    model_type = (PyObject*) &PyGPTOSSModel_Type;
    Py_INCREF(model_type);

    if (PyType_Ready(&PyGPTOSSTokenizer_Type) < 0) {
        goto error;
    }
    tokenizer_type = (PyObject*) &PyGPTOSSTokenizer_Type;
    Py_INCREF(tokenizer_type);

    if (PyType_Ready(&PyGPTOSSContext_Type) < 0) {
        goto error;
    }
    context_type = (PyObject*) &PyGPTOSSContext_Type;
    Py_INCREF(context_type);

    module = PyModule_Create(&metal_module);
    if (module == NULL) {
        goto error;
    }

    if (PyModule_AddObject(module, "Model", model_type) < 0) {
        goto error;
    }

    if (PyModule_AddObject(module, "Tokenizer", tokenizer_type) < 0) {
        goto error;
    }

    if (PyModule_AddObject(module, "Context", context_type) < 0) {
        goto error;
    }

    return module;

error:
    Py_XDECREF(context_type);
    Py_XDECREF(tokenizer_type);
    Py_XDECREF(model_type);
    Py_XDECREF(module);
    return NULL;
}


================================================
FILE: gpt_oss/metal/python/module.h
================================================
#include <Python.h>

#include <gpt-oss.h>

typedef struct {
    PyObject_HEAD
    gptoss_model_t handle;
} PyGPTOSSModel;

typedef struct {
    PyObject_HEAD
    gptoss_tokenizer_t handle;
} PyGPTOSSTokenizer;

typedef struct {
    PyObject_HEAD
    gptoss_context_t handle;
} PyGPTOSSContext;

extern PyTypeObject PyGPTOSSModel_Type;
extern PyTypeObject PyGPTOSSTokenizer_Type;
extern PyTypeObject PyGPTOSSContext_Type;


================================================
FILE: gpt_oss/metal/python/tokenizer.c
================================================
#include <Python.h>

#include <gpt-oss.h>

#include "module.h"

static PyObject* PyGPTOSSTokenizer_new(PyTypeObject* subtype, PyObject* args, PyObject* kwargs) {
    static char *kwlist[] = {"model", NULL};
    PyObject* model = NULL;
    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "O!", kwlist, &PyGPTOSSModel_Type, &model)) {
        return NULL;
    }

    PyGPTOSSTokenizer* self = (PyGPTOSSTokenizer*) subtype->tp_alloc(subtype, 0);
    if (self == NULL) {
        return NULL;
    }

    const enum gptoss_status status = gptoss_model_get_tokenizer(
        ((const PyGPTOSSModel*) model)->handle,
        &self->handle);
    if (status != gptoss_status_success) {
        // TODO: set exception
        return NULL;
    }

    return (PyObject*) self;
}

static void PyGPTOSSTokenizer_dealloc(PyGPTOSSTokenizer* self) {
    (void) gptoss_tokenizer_release(self->handle);
    self->handle = NULL;
    PyObject_Del((PyObject*) self);
}

static PyObject* PyGPTOSSTokenizer_copy(PyGPTOSSTokenizer* self) {
    PyGPTOSSTokenizer* copy = (PyGPTOSSTokenizer*) PyObject_New(PyGPTOSSTokenizer, Py_TYPE(self));
    if (copy == NULL) {
        return NULL;
    }

    (void) gptoss_tokenizer_retain(self->handle);
    copy->handle = self->handle;
    return (PyObject*) copy;
}

static PyObject* PyGPTOSSTokenizer_encode_special_token(PyGPTOSSTokenizer* self, PyObject* arg) {
    if (PyUnicode_Check(arg)) {
        const char* string_ptr = PyUnicode_AsUTF8(arg);
        if (string_ptr == NULL) {
            return NULL;
        }

        enum gptoss_special_token token_type = gptoss_special_token_invalid;
        if (strcmp(string_ptr, "<|return|>") == 0) {
            token_type = gptoss_special_token_return;
        } else if (strcmp(string_ptr, "<|start|>") == 0) {
            token_type = gptoss_special_token_start;
        } else if (strcmp(string_ptr, "<|message|>") == 0) {
            token_type = gptoss_special_token_message;
        } else if (strcmp(string_ptr, "<|end|>") == 0) {
            token_type = gptoss_special_token_end;
        } else if (strcmp(string_ptr, "<|refusal|>") == 0) {
            token_type = gptoss_special_token_refusal;
        } else if (strcmp(string_ptr, "<|constrain|>") == 0) {
            token_type = gptoss_special_token_constrain;
        } else if (strcmp(string_ptr, "<|channel|>") == 0) {
            token_type = gptoss_special_token_channel;
        } else if (strcmp(string_ptr, "<|call|>") == 0) {
            token_type = gptoss_special_token_call;
        } else if (strcmp(string_ptr, "<|untrusted|>") == 0) {
            token_type = gptoss_special_token_untrusted;
        } else if (strcmp(string_ptr, "<|end_untrusted|>") == 0) {
            token_type = gptoss_special_token_end_untrusted;
        } else {
            PyErr_Format(PyExc_ValueError, "unrecognized special token: %s", string_ptr);
            return NULL;
        }

        uint32_t token_id = UINT32_MAX;
        const enum gptoss_status status = gptoss_tokenizer_get_special_token_id(
            self->handle, token_type, &token_id);
        if (status != gptoss_status_success || token_id == UINT32_MAX) {
            PyErr_Format(PyExc_ValueError, "tokenizer does not support the %s token", string_ptr);
            return NULL;
        }

        return PyLong_FromUnsignedLong((unsigned long) token_id);
    } else {
        PyErr_SetString(PyExc_TypeError, "string argument expected");
        return NULL;
    }
}

static PyObject* PyGPTOSSTokenizer_decode(PyGPTOSSTokenizer* self, PyObject* args, PyObject* kwargs) {
    static char *kwlist[] = {"token", NULL};
    unsigned int token = 0; // Default to 0 if None

    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "I", kwlist, &token)) {
        return NULL;
    }

    const void* token_ptr = NULL;
    size_t token_size = 0;
    const enum gptoss_status status = gptoss_tokenizer_decode(self->handle, (uint32_t) token, &token_ptr, &token_size);
    if (status != gptoss_status_success) {
        // TODO: set exception
        return NULL;
    }

    return PyBytes_FromStringAndSize((const char*) token_ptr, (Py_ssize_t) token_size);
}

static PyMethodDef PyGPTOSSTokenizer_methods[] = {
    {"__copy__", (PyCFunction) PyGPTOSSTokenizer_copy, METH_NOARGS, "Create a copy of the Tokenizer"},
    {"encode_special_token", (PyCFunction) PyGPTOSSTokenizer_encode_special_token, METH_O, "Query ID of a special token"},
    {"decode", (PyCFunction) PyGPTOSSTokenizer_decode, METH_VARARGS | METH_KEYWORDS, "Convert text token ID to bytes"},
    {NULL},
};

static PyObject* PyGPTOSSTokenizer_get_num_text_tokens(PyGPTOSSTokenizer* self, void* closure) {
    uint32_t num_text_tokens = 0;
    const enum gptoss_status status = gptoss_tokenizer_get_num_text_tokens(self->handle, &num_text_tokens);
    if (status != gptoss_status_success) {
        // TODO: set exception
        return NULL;
    }

    return PyLong_FromUnsignedLong((unsigned long) num_text_tokens);
}

static PyObject* PyGPTOSSTokenizer_get_num_special_tokens(PyGPTOSSTokenizer* self, void* closure) {
    uint32_t num_special_tokens = 0;
    const enum gptoss_status status = gptoss_tokenizer_get_num_special_tokens(self->handle, &num_special_tokens);
    if (status != gptoss_status_success) {
        // TODO: set exception
        return NULL;
    }

    return PyLong_FromUnsignedLong((unsigned long) num_special_tokens);
}

static PyObject* PyGPTOSSTokenizer_get_num_tokens(PyGPTOSSTokenizer* self, void* closure) {
    uint32_t num_tokens = 0;
    const enum gptoss_status status = gptoss_tokenizer_get_num_tokens(self->handle, &num_tokens);
    if (status != gptoss_status_success) {
        // TODO: set exception
        return NULL;
    }

    return PyLong_FromUnsignedLong((unsigned long) num_tokens);
}

static PyGetSetDef PyGPTOSSTokenizer_getseters[] = {
    (PyGetSetDef) {
        .name = "num_tokens",
        .get = (getter) PyGPTOSSTokenizer_get_num_tokens,
        .doc = "Total number of tokens in the tokenizer dictionary",
    },
    (PyGetSetDef) {
        .name = "num_text_tokens",
        .get = (getter) PyGPTOSSTokenizer_get_num_text_tokens,
        .doc = "Number of text tokens in the tokenizer dictionary",
    },
    (PyGetSetDef) {
        .name = "num_special_tokens",
        .get = (getter) PyGPTOSSTokenizer_get_num_special_tokens,
        .doc = "Number of special tokens in the tokenizer dictionary",
    },
    {NULL}  /* Sentinel */
};

PyTypeObject PyGPTOSSTokenizer_Type = {
    PyVarObject_HEAD_INIT(NULL, 0)
    .tp_name = "gptoss.Tokenizer",
    .tp_basicsize = sizeof(PyGPTOSSTokenizer),
    .tp_flags = 0
        | Py_TPFLAGS_DEFAULT
        | Py_TPFLAGS_BASETYPE,
    .tp_doc = "Tokenizer object",
    .tp_methods = PyGPTOSSTokenizer_methods,
    .tp_getset = PyGPTOSSTokenizer_getseters,
    .tp_new = PyGPTOSSTokenizer_new,
    .tp_dealloc = (destructor) PyGPTOSSTokenizer_dealloc,
};


================================================
FILE: gpt_oss/metal/scripts/create-local-model.py
================================================
import argparse
import os
import math
import sys
import json
import itertools
import struct
from uuid import UUID

import tiktoken

import torch
from safetensors import safe_open
from tqdm import tqdm
from openai_harmony import load_harmony_encoding, HarmonyEncodingName

parser = argparse.ArgumentParser(prog='create-local-model.py', description='Convert a checkpoint directory to a local model file')
parser.add_argument('-s', '--src', metavar='DIR', type=str, required=True, help='Path to the input checkpoint directory')
parser.add_argument('-d', '--dst', metavar='FILE', type=str, required=True, help='Path to the output model file')


o200k_base = tiktoken.get_encoding("o200k_base")
harmony_encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)

o200k_gptoss = tiktoken.Encoding(
    name="o200k_gptoss",
    pat_str=o200k_base._pat_str,
    mergeable_ranks=o200k_base._mergeable_ranks,
    special_tokens={
        "<|reversed199998|>": 199998,  # unused
        "<|endoftext|>": 199999,
        "<|untrusted|>": 200000,
        "<|endofuntrusted|>": 200001,
        "<|return|>": 200002,
        "<|constrain|>": 200003,
        "<|reversed200004|>": 200004,  # unused
        "<|channel|>": 200005,
        "<|start|>": 200006,
        "<|end|>": 200007,
        "<|message|>": 200008,
        "<|reversed200008|>": 200008,  # unused
        "<|reversed200009|>": 200009,  # unused
        "<|reversed200010|>": 200010,  # unused
        "<|reversed200011|>": 200011,  # unused
        "<|call|>": 200012,
        "<|refusal|>": 200013,
    }
)

FILE_MAGIC = struct.pack('ccccccccccccI', b'G', b'P', b'T', b'-', b'O', b'S', b'S', b' ', b'v', b'1', b'.', b'0', 0)
SPECIAL_TOKEN_UUID = {
    '<|start|>': UUID('55a77c2f-8a01-4c54-8ac2-313bfc7e208d').bytes,
    '<|message|>': UUID('16e40431-f47f-4b22-b59b-8b278fc30a54').bytes,
    '<|end|>': UUID('fcac2f6d-4705-4f6b-b228-642accac7238').bytes,
    '<|return|>': UUID('f799ff69-1992-43c4-a3d8-d831f475dc75').bytes,
    '<|refusal|>': UUID('e15ba702-28c4-4292-ab8f-ffa434709128').bytes,
    '<|constrain|>': UUID('c0bb14c7-6022-49da-ad08-792d67e8b470').bytes,
    '<|channel|>': UUID('fd3dda11-c8ab-4033-876e-d93deb172c93').bytes,
    '<|call|>': UUID('1220f796-e388-4de5-b487-fe2eb5fe03c0').bytes,
    '<|untrusted|>': UUID('07d7da55-b346-4cff-8b37-7cefacf8a3e8').bytes,
    '<|end_untrusted|>': UUID('f265bd9c-c717-469e-a447-920687d65d90').bytes,
}

INCLUDE_SPECIAL_TOKENS = [
    "<|start|>",
    "<|message|>",
    "<|end|>",
    "<|return|>",
    "<|refusal|>",
    "<|constrain|>",
    "<|channel|>",
    "<|call|>",
    "<|untrusted|>",
    "<|end_untrusted|>",
]

GPTOSS_MODEL_UUID = UUID('df52dc86-1789-4ed0-a295-66f10508145b').bytes
APPLE_GPU_LAYOUT_UUID = UUID('229177a8-5775-4268-bfd8-d588b351c56d').bytes
TIKTOKEN_TOKENIZER_UUID = UUID('7401aded-2a95-40cb-b782-9ccebaafe72b').bytes

UE8_OFFSET = 14  # bias to MXFP4 block scales

def write_file_header(f):
    f.write(FILE_MAGIC)

def write_tokenizer_header(f,
                           num_special_tokens: int,
                           num_text_tokens: int,
                           regex_size: int,
                           tokens_size: int):
    f.write(TIKTOKEN_TOKENIZER_UUID)
    f.write(struct.pack('<I', num_special_tokens))
    f.write(struct.pack('<I', num_text_tokens))
    f.write(struct.pack('<I', regex_size))
    f.write(struct.pack('<I', tokens_size))

def write_model_header(f,
                       context_length : int,
                       num_blocks : int,
                       num_experts : int,
                       num_active_experts : int,
                       embedding_dim : int,
                       mlp_dim : int,
                       swiglu_limit : float,
                       head_dim: int,
                       num_heads : int,
                       num_kv_heads : int,
                       attention_window : int,
                       rope_theta : float,
                       interpolation_scale : float,
                       yarn_offset : float,
                       yarn_scale : float,
                       yarn_multiplier : float,
                       rmsnorm_epsilon : float):
    f.write(GPTOSS_MODEL_UUID)
    f.write(struct.pack('<I', context_length))
    f.write(struct.pack('<I', num_blocks))
    f.write(struct.pack('<I', num_experts))
    f.write(struct.pack('<I', num_active_experts))
    f.write(struct.pack('<I', embedding_dim))
    f.write(struct.pack('<I', mlp_dim))
    f.write(struct.pack('<f', swiglu_limit))
    f.write(struct.pack('<I', head_dim))
    f.write(struct.pack('<I', num_heads))
    f.write(struct.pack('<I', num_kv_heads))
    f.write(struct.pack('<I', attention_window))
    f.write(struct.pack('<f', rope_theta))
    f.write(struct.pack('<f', interpolation_scale))
    f.write(struct.pack('<f', yarn_offset))
    f.write(struct.pack('<f', yarn_scale))
    f.write(struct.pack('<f', yarn_multiplier))
    f.write(struct.pack('<f', rmsnorm_epsilon))
    f.write(APPLE_GPU_LAYOUT_UUID)


def write_padding(out_file, alignment_multiple=16384):
    offset = out_file.tell()
    alignment_size = -offset % alignment_multiple
    if alignment_size != 0:
        alignment = bytes(alignment_size)
        out_file.write(alignment)


def write_embedding_weight(out_file, weight):
    write_padding(out_file, alignment_multiple=16)

    assert weight.dtype == torch.float8_e4m3fn or weight.dtype == torch.bfloat16
    out_file.write(weight.view(torch.uint8).numpy().tobytes())


def write_rmsnorm_gain(out_file, gain):
    write_padding(out_file, alignment_multiple=16)

    assert gain.dtype == torch.bfloat16
    out_file.write(gain.view(torch.uint8).numpy().tobytes())


def write_attn_sink(out_file, sink):
    write_padding(out_file, alignment_multiple=16)

    assert sink.dtype == torch.bfloat16
    out_file.write(sink.view(torch.uint8).numpy().tobytes())


def write_linear_weight(out_file, *args):
    write_padding(out_file, alignment_multiple=16)

    for t in args:
        out_file.write(t.view(torch.uint8).numpy().tobytes())


def main(args):
    options = parser.parse_args(args)

    with open(os.path.join(options.src, "config.json"), "r") as f:
        config = json.load(f)

    num_blocks = config["num_hidden_layers"]
    num_experts = config["num_experts"]
    num_active_experts = 4
    num_q_heads = config["num_attention_heads"]
    num_kv_heads = config["num_key_value_heads"]
    head_dim = config["head_dim"]
    embedding_dim = config["hidden_size"]
    mlp_dim = config["intermediate_size"]
    swiglu_limit = config.get("swiglu_limit", 7.0)
    rope_theta = config["rope_theta"]
    attention_window = config["sliding_window"]
    initial_context_length = config["initial_context_length"]
    rope_scaling_factor = config["rope_scaling_factor"]
    rope_ntk_alpha = config["rope_ntk_alpha"]
    rope_ntk_beta = config["rope_ntk_beta"]

    tokens_size = 0
    num_text_tokens = 0
    # First add all text tokens
    for t in range(o200k_gptoss.n_vocab):
        if not harmony_encoding.is_special_token(t):
            token_bytes = o200k_gptoss.decode_single_token_bytes(t)
            assert len(token_bytes) > 0
            tokens_size += len(token_bytes) + 2  # uint16_t string length + string data
            num_text_tokens += 1
    # Then add all special tokens
    num_included_tokens = 200013 + 1
    print(f"Tokenizer: {num_included_tokens} tokens")

    # Read from all files ending with .safetensors in the checkpoint directory
    safetensor_files = [
        os.path.join(options.src, fname)
        for fname in os.listdir(options.src)
        if fname.endswith(".safetensors")
    ]
    # Build a mapping from tensor name to filepath
    tensor_name_to_file = {}
    for safetensor_file in safetensor_files:
        with safe_open(safetensor_file, framework="pt", device="cpu") as src:
            for key in src.keys():
                tensor_name_to_file[key] = safetensor_file

    def get_tensor(name):
        with safe_open(tensor_name_to_file[name], framework="pt", device="cpu") as src:
            return src.get_tensor(name)

    with open(options.dst, "wb") as dst:
        write_file_header(dst)

        yarn_low = (
            head_dim / 2
            * math.log(initial_context_length / (rope_ntk_beta * 2 * math.pi))
            / math.log(rope_theta)
        )
        yarn_high = (
            head_dim / 2
            * math.log(initial_context_length / (rope_ntk_alpha * 2 * math.pi))
            / math.log(rope_theta)
        )

        write_model_header(dst,
                            context_length=int(initial_context_length * rope_scaling_factor),
                            num_blocks=num_blocks,
                            num_experts=num_experts,
                            num_active_experts=num_active_experts,
                            embedding_dim=embedding_dim,
                            mlp_dim=mlp_dim,
                            swiglu_limit=swiglu_limit,
                            head_dim=head_dim,
                            num_heads=num_q_heads,
                            num_kv_heads=num_kv_heads,
                            attention_window=attention_window,
                            rope_theta=rope_theta,
                            interpolation_scale=1.0 / rope_scaling_factor,
                            yarn_offset=-yarn_low / (yarn_high - yarn_low),
                            yarn_scale=1.0 / (yarn_high - yarn_low),
                            yarn_multiplier=0.1 * math.log(rope_scaling_factor) + 1.0,
                            rmsnorm_epsilon=1.0e-5)

        write_tokenizer_header(dst,
                                num_special_tokens=num_included_tokens - num_text_tokens,
                                num_text_tokens=num_text_tokens,
                                regex_size=len(o200k_gptoss._pat_str.encode("ascii")) + 1,
                                tokens_size=tokens_size)

        ### Tokenizer
        # Special tokens
        for token_idx in range(num_text_tokens, num_included_tokens):
            token = o200k_gptoss.decode_single_token_bytes(token_idx).decode('ascii')
            if token in INCLUDE_SPECIAL_TOKENS:
                dst.write(SPECIAL_TOKEN_UUID[token])
            else:
                dst.write(bytes(16))
        # Regex
        dst.write(o200k_gptoss._pat_str.encode("ascii"))
        dst.write(struct.pack('B', 0))
        # Text tokens
        tokenizer_bytes_written = 0
        for t in range(num_text_tokens):
            token_bytes = o200k_gptoss.decode_single_token_bytes(t)
            assert len(token_bytes) > 0
            dst.write(struct.pack('<H', len(token_bytes)))
            dst.write(token_bytes)
            tokenizer_bytes_written += len(token_bytes) + 2
        assert(tokenizer_bytes_written == tokens_size), (tokenizer_bytes_written, tokens_size)
        write_padding(dst)

        embedding_weight = get_tensor("embedding.weight")
        # Filter out unused tokens
        embedding_weight = embedding_weight[:num_included_tokens, :]
        write_embedding_weight(dst, embedding_weight)

        for n in tqdm(range(num_blocks)):
            write_rmsnorm_gain(dst, get_tensor(f"block.{n}.attn.norm.scale"))

            attn_qkv_weight = get_tensor(f"block.{n}.attn.qkv.weight")
            attn_qkv_bias = get_tensor(f"block.{n}.attn.qkv.bias")
            for qkv in (attn_qkv_weight, attn_qkv_bias):
                qk = qkv[:head_dim * (num_q_heads + num_kv_heads), ...].contiguous()
                v = qkv[head_dim * (num_q_heads + num_kv_heads):, ...].contiguous()
                qk = qk.view(num_q_heads + num_kv_heads, 2, head_dim // 2, -1).transpose(1, 2).reshape(num_q_heads + num_kv_heads, head_dim, -1)
                q = qk[:num_q_heads, ...]
                k = qk[num_q_heads:, ...]
                # Factor multiplication by 1/sqrt(64) = 0.125 = 0.5 * 0.25 in SDPA into Q and K projections
                assert head_dim == 64
                q *= 0.5
                k *= 0.25
                v = v.view(num_kv_heads, head_dim, -1)
                qkv.copy_(torch.cat((q, k, v), dim=0).reshape(*qkv.shape))

            write_linear_weight(dst, attn_qkv_weight, attn_qkv_bias)

            write_attn_sink(dst, get_tensor(f"block.{n}.attn.sinks"))

            write_linear_weight(dst, get_tensor(f"block.{n}.attn.out.weight"), get_tensor(f"block.{n}.attn.out.bias"))

            write_rmsnorm_gain(dst, get_tensor(f"block.{n}.mlp.norm.scale"))

            write_linear_weight(dst, get_tensor(f"block.{n}.mlp.gate.weight"), get_tensor(f"block.{n}.mlp.gate.bias"))

        write_rmsnorm_gain(dst, get_tensor("norm.scale"))

        unembedding_weight = get_tensor("unembedding.weight")
        unembedding_weight = unembedding_weight[:num_included_tokens, :]
        write_linear_weight(dst, unembedding_weight)

        for n in tqdm(range(num_blocks)):
            mlp1_blocks = get_tensor(f"block.{n}.mlp.mlp1_weight.blocks")
            mlp1_scales = get_tensor(f"block.{n}.mlp.mlp1_weight.scales")
            assert mlp1_scales.min().item() < 254 - UE8_OFFSET
            mlp1_bias = get_tensor(f"block.{n}.mlp.mlp1_bias")

            mlp2_blocks = get_tensor(f"block.{n}.mlp.mlp2_weight.blocks")
            mlp2_scales = get_tensor(f"block.{n}.mlp.mlp2_weight.scales")
            assert mlp2_scales.min().item() < 254 - UE8_OFFSET
            mlp2_bias = get_tensor(f"block.{n}.mlp.mlp2_bias")

            # Write MoE weights grouped by expert
            write_padding(dst)

            for e in range(num_experts):
                write_padding(dst, alignment_multiple=16)                    
                dst.write(mlp1_blocks[e, ...].view(torch.uint8).numpy().tobytes())

                write_padding(dst, alignment_multiple=16)
                dst.write((mlp1_scales + UE8_OFFSET)[e, ...].view(torch.uint8).numpy().tobytes())

                write_padding(dst, alignment_multiple=16)
                dst.write(mlp1_bias[e, ...].view(torch.uint8).numpy().tobytes())

                write_padding(dst, alignment_multiple=16)                    
                dst.write(mlp2_blocks[e, ...].view(torch.uint8).numpy().tobytes())

                write_padding(dst, alignment_multiple=16)
                dst.write((mlp2_scales + UE8_OFFSET)[e, ...].view(torch.uint8).numpy().tobytes())

                write_padding(dst, alignment_multiple=16)
                dst.write(mlp2_bias[e, ...].view(torch.uint8).numpy().tobytes())

if __name__ == "__main__":
    main(sys.argv[1:])


================================================
FILE: gpt_oss/metal/source/accumulate.metal
================================================
#include <metal_integer>
#include <metal_math>

#include <internal/kernel-args.h>

#pragma METAL fp math_mode(safe)
#pragma METAL fp contract(off)


kernel void gptoss_f32_accumulate_e4(
    constant gptoss_accumulate_args& args [[ buffer(0) ]],
    const device float4* input [[ buffer(1) ]],
    const device gptoss_expert_prediction* expert [[ buffer(2) ]],
    device float4* output [[ buffer(3) ]],
    const device gptoss_control* control [[ buffer(4) ]],
    uint2 gid [[threadgroup_position_in_grid]],
    uint tid [[thread_index_in_threadgroup]],
    uint2 threadgroup_size [[ threads_per_threadgroup ]])
{
    const uint num_active_experts = 4;
    if (control->abort != 0) {
        return;
    }

    const uint num_vecs_per_threadgroup = args.num_vecs_per_threadgroup;
    const uint threadgroup_start = gid.x * num_vecs_per_threadgroup;
    const uint num_vecs = args.num_vecs;
    const uint threadgroup_end = metal::min(threadgroup_start + num_vecs_per_threadgroup, num_vecs);
    const uint thread_start = threadgroup_start + tid;
    uint num_iter = static_cast<uint>((threadgroup_end - thread_start + (threadgroup_size.x - 1)) / threadgroup_size.x);

    const uint num_vecs_per_expert = args.num_vecs_per_expert;
    const float scale0 = expert[gid.y * num_active_experts + 0].score;
    const device float4* input0 = input + gid.y * num_vecs + thread_start;
    const float scale1 = expert[gid.y * num_active_experts + 1].score;
    const device float4* input1 = input0 + num_vecs_per_expert;
    const float scale2 = expert[gid.y * num_active_experts + 2].score;
    const device float4* input2 = input1 + num_vecs_per_expert;
    const float scale3 = expert[gid.y * num_active_experts + 3].score;
    const device float4* input3 = input2 + num_vecs_per_expert;
    output += gid.y * num_vecs + thread_start;
    for (; num_iter != 0; num_iter--) {
        float4 acc = *output;
        const float4 val0 = *input0;
        const float4 val1 = *input1;
        const float4 val2 = *input2;
        const float4 val3 = *input3;
        input0 += threadgroup_size.x;
        acc = metal::fma(val0, scale0, acc);
        input1 += threadgroup_size.x;
        acc = metal::fma(val1, scale1, acc);
        input2 += threadgroup_size.x;
        acc = metal::fma(val2, scale2, acc);
        input3 += threadgroup_size.x;
        acc = metal::fma(val3, scale3, acc);
        *output = acc;
        output += threadgroup_size.x;
    }
}


================================================
FILE: gpt_oss/metal/source/context.c
================================================
#include <assert.h>
#include <float.h>
#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <gpt-oss.h>

#include "internal/datatype.h"
#include "internal/model.h"
#include "internal/metal.h"
#include "internal/metal-kernels.h"
#include "internal/log.h"
#include "internal/rng.h"


enum gptoss_status GPTOSS_ABI gptoss_context_create(
    gptoss_model_t model,
    size_t context_length,
    size_t max_batch_tokens,
    gptoss_context_t* context_out)
{
    *context_out = NULL;

    enum gptoss_status status = gptoss_status_success;
    struct gptoss_context* context = NULL;

    // Validate context_length
    if (context_length == 0) {
        context_length = model->context_length;
    } else if (context_length > model->context_length) {
        GPTOSS_LOG_ERROR("requested context length %zu exceeds model context length %" PRIu32,
            context_length, model->context_length);
        status = gptoss_status_invalid_argument;
        goto cleanup;
    }
    assert(context_length != 0);
    assert(context_length <= model->context_length);

    // Validate max_batch_tokens
    if (max_batch_tokens == 0) {
        max_batch_tokens = GPTOSS_DEFAULT_BATCH_SIZE;
    } else if (max_batch_tokens > context_length) {
        GPTOSS_LOG_ERROR("requested max batch tokens %zu exceeds context length %zu",
            max_batch_tokens, context_length);
        status = gptoss_status_invalid_argument;
        goto cleanup;
    }
    assert(max_batch_tokens != 0);
    assert(max_batch_tokens <= context_length);

    context = malloc(sizeof(struct gptoss_context));
    if (context == NULL) {
        GPTOSS_LOG_ERROR("failed to allocate %zu bytes for Context object",
            sizeof(struct gptoss_context));
        status = gptoss_status_insufficient_memory;
        goto cleanup;
    }
    memset(context, 0, sizeof(struct gptoss_context));

    atomic_store_explicit(&context->ref_count, 1, memory_order_relaxed);
    context->max_tokens = context_length;
    context->max_batch_tokens = max_batch_tokens;

    // Activation buffers
    status = gptoss_metal_buffer_create(&model->device, max_batch_tokens * model->embedding_dim * sizeof(float), NULL, &context->residual_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, max_batch_tokens * model->embedding_dim * sizeof(float), NULL, &context->rmsnorm_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, max_batch_tokens * model->head_dim * (model->num_heads + 2 * model->num_kv_heads) * sizeof(float), NULL, &context->qkv_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, max_batch_tokens * model->head_dim * model->num_heads * sizeof(float), NULL, &context->sdpa_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, max_batch_tokens * model->num_experts * sizeof(float), NULL, &context->gate_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, max_batch_tokens * model->num_experts * sizeof(struct gptoss_expert_prediction), NULL, &context->expert_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    // The last entry will hold the total number of tokens.
    status = gptoss_metal_buffer_create(&model->device, (1 + model->num_experts) * sizeof(uint32_t), NULL, &context->expert_offset_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, max_batch_tokens * model->num_active_experts * sizeof(uint32_t), NULL, &context->token_to_expert_routing_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, max_batch_tokens * model->num_active_experts * model->embedding_dim * sizeof(float), NULL, &context->swiglu_input_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, max_batch_tokens * model->num_active_experts * model->mlp_dim * sizeof(float), NULL, &context->swiglu_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, max_batch_tokens * model->num_active_experts * model->embedding_dim * sizeof(float), NULL, &context->moe_activation_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }

    // Input/output buffers
    status = gptoss_metal_buffer_create(&model->device, sizeof(struct gptoss_control), NULL, &context->control_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, context_length * sizeof(uint32_t), NULL, &context->token_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, max_batch_tokens * model->vocabulary_size * sizeof(float), NULL, &context->score_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, max_batch_tokens * model->vocabulary_size * sizeof(float), NULL, &context->prob_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, max_batch_tokens * model->max_threadgroups * sizeof(float), NULL, &context->sum_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, max_batch_tokens * sizeof(uint64_t), NULL, &context->argmax_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_buffer_create(&model->device, model->num_blocks * context_length * 2 * model->num_kv_heads * model->head_dim * sizeof(float), NULL, &context->kvcache_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }

    context->kvcache_size = context->kvcache_buffer.size;
    context->allocation_size = 
        context->residual_activation_buffer.size + context->rmsnorm_activation_buffer.size +
        context->qkv_activation_buffer.size + context->sdpa_activation_buffer.size +
        context->gate_activation_buffer.size + context->expert_activation_buffer.size +
        context->expert_offset_buffer.size + context->token_to_expert_routing_buffer.size + context->swiglu_input_buffer.size +
        context->swiglu_activation_buffer.size + context->moe_activation_buffer.size +
        context->token_buffer.size + context->kvcache_buffer.size + context->score_buffer.size + context->argmax_buffer.size;

    context->model = model;
    gptoss_model_retain(model);
    *context_out = context;
    context = NULL;

cleanup:
    gptoss_context_release(context);
    return status;
}

enum gptoss_status GPTOSS_ABI gptoss_context_get_num_tokens(
    gptoss_context_t context,
    size_t* num_tokens_out)
{
    *num_tokens_out = context->num_tokens;
    return gptoss_status_success;
}

enum gptoss_status GPTOSS_ABI gptoss_context_get_max_tokens(
    gptoss_context_t context,
    size_t* max_tokens_out)
{
    *max_tokens_out = context->max_tokens;
    return gptoss_status_success;
}

enum gptoss_status GPTOSS_ABI gptoss_context_get_tokens(
    gptoss_context_t context,
    uint32_t* tokens_out,
    size_t max_tokens,
    size_t* num_tokens_out)
{
    *num_tokens_out = context->num_tokens;
    if (max_tokens < context->num_tokens) {
        return gptoss_status_insufficient_memory;
    }

    if (context->num_tokens != 0) {
        memcpy(tokens_out, context->token_buffer.ptr, context->num_tokens * sizeof(uint32_t));
    }
    return gptoss_status_success;
}

// Prefill: input_tokens_offset = number of tokens in KV cache, num_input_tokens > 0, num_output_tokens = 0.
// Sampling: input_tokens_offset = number of tokens in the context - 1, num_input_tokens = 1, num_output_tokens = 1.
// Perplexity: input_tokens_offset = 0, num_input_tokens > 1, num_output_tokens = num_input_tokens.
static enum gptoss_status process_tokens(
    gptoss_context_t context,
    struct gptoss_metal_command_buffer* command_buffer,
    size_t input_tokens_offset,
    size_t num_input_tokens,
    size_t num_output_tokens)
{
    assert(num_input_tokens != 0);
    assert(num_input_tokens <= context->max_batch_tokens);
    assert(num_output_tokens <= context->max_batch_tokens);
    assert(num_input_tokens >= num_output_tokens);
    const size_t min_tokens_for_dense_matmul_kernels = 64;
    const size_t min_tokens_for_dense_moe_kernels = 64;

    enum gptoss_status status = gptoss_status_success;
    const struct gptoss_model* model = context->model;

    const size_t attn_qkv_dim = model->head_dim * (model->num_heads + 2 * model->num_kv_heads);

    const size_t input_tokens_end = input_tokens_offset + num_input_tokens;
    for (size_t input_batch_start = input_tokens_offset;
        input_batch_start < input_tokens_end;
        input_batch_start += context->max_batch_tokens)
    {
        const size_t input_batch_size = math_min(context->max_batch_tokens, input_tokens_end - input_batch_start);
        const size_t input_batch_end = input_batch_start + input_batch_size;
        const size_t output_batch_size = math_sub_sat(num_output_tokens, input_tokens_end - input_batch_end);

        status = gptoss_metal_command_buffer_encode_launch_bf16_f32_embeddings(
            command_buffer,
            &model->bf16_f32_embeddings_fn,
            model->embeddings_threadgroup_size,
            &context->token_buffer,
            input_batch_start * sizeof(uint32_t),
            &model->shared_weight_buffer,
            /*weight_offset=*/0,
            &context->residual_activation_buffer,
            /*output_offset=*/0,
            &context->control_buffer,
            /*control_offset=*/0,
            /*num_tokens=*/input_batch_size,
            /*num_channels=*/model->embedding_dim);
        if (status != gptoss_status_success) {
            GPTOSS_LOG_ERROR("failed to encode bf16_f32_embeddings kernel launch");
            return status;
        }
        for (uint32_t n = 0; n < model->num_blocks; n++) {
            const bool last_block = n + 1 == model->num_blocks;
            const size_t num_block_output_tokens = last_block ? output_batch_size : input_batch_size;

            status = gptoss_metal_command_buffer_encode_launch_f32_bf16w_rmsnorm(
                command_buffer,
                &model->f32_bf16w_rmsnorm_fn,
                &context->residual_activation_buffer,
                /*input_offset=*/0,
                &model->shared_weight_buffer,
                /*weight_offset=*/model->attn_rmsnorm_gain_offset + model->per_block_shared_weights_size * n,
                &context->rmsnorm_activation_buffer,
                /*output_offset=*/0,
                &context->control_buffer,
                /*control_offset=*/0,
                /*num_tokens=*/input_batch_size,
                /*num_channels=*/model->embedding_dim,
                model->rmsnorm_epsilon);
            if (status != gptoss_status_success) {
                GPTOSS_LOG_ERROR("failed to encode f32_bf16w_rmsnorm kernel launch");
                return status;
            }

            if (input_batch_size >= min_tokens_for_dense_matmul_kernels) {
                status = gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_qkv(
                    command_buffer,
                    &model->f32_bf16w_dense_matmul_qkv_fn,
                    &context->rmsnorm_activation_buffer,
                    /*input_offset=*/0,
                    &model->shared_weight_buffer,
                    /*weight_offset=*/model->attn_qkv_weight_offset + model->per_block_shared_weights_size * n,
                    &model->shared_weight_buffer,
                    /*bias_offset=*/model->attn_qkv_bias_offset + model->per_block_shared_weights_size * n,
                    &context->qkv_activation_buffer,
                    /*output_offset=*/0,
                    &context->kvcache_buffer,
                    /*kv_offset=*/n * model->num_kv_heads * context->max_tokens * 2 * model->head_dim * sizeof(float),
                    &context->control_buffer,
                    /*control_offset=*/0,
                    /*num_tokens=*/input_batch_size,
                    /*num_cols=*/model->embedding_dim,
                    /*num_rows=*/attn_qkv_dim,
                    /*max_tokens=*/context->max_tokens,
                    /*token_offset=*/input_batch_start);
                if (status != gptoss_status_success) {
                    GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul_qkv kernel launch");
                    return status;
                }

                status = gptoss_metal_command_buffer_encode_launch_f32_rope(
                    command_buffer,
                    &model->f32_rope_fn,
                    /*threadgroup_size=*/32,
                    &context->qkv_activation_buffer,
                    /*input_offset=*/0,

                    &context->kvcache_buffer,
                    /*kv_offset=*/n * model->num_kv_heads * context->max_tokens * 2 * model->head_dim * sizeof(float),
                    &context->control_buffer,
                    /*control_offset=*/0,
                    model->rope_theta,
                    model->interpolation_scale,
                    model->yarn_offset,
                    model->yarn_scale,
                    model->yarn_multiplier,
                    input_batch_size,
                    model->num_heads,
                    model->num_kv_heads,
                    model->head_dim,
                    /*max_tokens=*/context->max_tokens,
                    /*token_offset=*/input_batch_start);
                if (status != gptoss_status_success) {
                    GPTOSS_LOG_ERROR("failed to encode f32_rope kernel launch");
                    return status;
                }

            } else {
                status = gptoss_metal_command_buffer_encode_launch_f32_bf16w_matmul_qkv(
                    command_buffer,
                    &model->f32_bf16w_matmul_qkv_fn,
                    model->attn_qkv_threadgroup_size,
                    &context->rmsnorm_activation_buffer,
                    /*input_offset=*/0,
                    &model->shared_weight_buffer,
                    /*weight_offset=*/model->attn_qkv_weight_offset + model->per_block_shared_weights_size * n,
                    &model->shared_weight_buffer,
                    /*bias_offset=*/model->attn_qkv_bias_offset + model->per_block_shared_weights_size * n,
                    &context->qkv_activation_buffer,
                    /*output_offset=*/0,
                    &context->kvcache_buffer,
                    /*kv_offset=*/n * model->num_kv_heads * context->max_tokens * 2 * model->head_dim * sizeof(float),
                    &context->control_buffer,
                    /*control_offset=*/0,
                    /*num_tokens=*/input_batch_size,
                    /*num_cols=*/model->embedding_dim,
                    /*num_q_heads=*/model->num_heads,
                    /*num_kv_heads=*/model->num_kv_heads,
                    /*attn_head_dim=*/model->head_dim,
                    /*token_offset=*/input_batch_start,
                    /*max_tokens=*/context->max_tokens,
                    /*rope_base=*/model->rope_theta,
                    /*interpolation_scale=*/model->interpolation_scale,
                    /*yarn_offset=*/model->yarn_offset,
                    /*yarn_scale=*/model->yarn_scale,
                    /*yarn_multiplier=*/model->yarn_multiplier);
                if (status != gptoss_status_success) {
                    GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul_qkv kernel launch");
                    return status;
                }
            }

            if (num_block_output_tokens != 0) {
                status = gptoss_metal_command_buffer_encode_launch_f32_sdpa(
                    command_buffer,
                    &model->f32_sdpa_q8_d64_fn,
                    &context->qkv_activation_buffer,
                    /*q_offset=*/attn_qkv_dim * (input_batch_size - num_block_output_tokens) * sizeof(float),
                    &context->kvcache_buffer,
                    /*kv_offset=*/n * model->num_kv_heads * context->max_tokens * 2 * model->head_dim * sizeof(float),
                    &model->shared_weight_buffer,
                    /*s_offset=*/model->attn_sdpa_sink_offset + model->per_block_shared_weights_size * n,
                    &context->sdpa_activation_buffer,
                    /*output_offset=*/0,
                    &context->control_buffer,
                    /*control_offset=*/0,
                    /*window=*/n % 2 == 0 ? model->attention_window : UINT32_MAX,
                    /*kv_stride=*/2 * context->max_tokens * model->head_dim,
                    num_block_output_tokens,
                    input_batch_start + input_batch_size - num_block_output_tokens,
                    model->num_heads, model->num_kv_heads, model->head_dim);
                if (status != gptoss_status_success) {
                    GPTOSS_LOG_ERROR("failed to encode f32_sdpa kernel launch");
                    return status;
                }

                if (input_batch_size >= min_tokens_for_dense_matmul_kernels) {
                    status = gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_attn_output(
                        command_buffer,
                        &model->f32_bf16w_dense_matmul_attn_output_fn,
                        &context->sdpa_activation_buffer,
                        /*input_offset=*/0,
                        &model->shared_weight_buffer,
                        /*weight_offset=*/model->attn_out_weight_offset + model->per_block_shared_weights_size * n,
                        &model->shared_weight_buffer,
                        /*bias_offset=*/model->attn_out_bias_offset + model->per_block_shared_weights_size * n,
                        &context->residual_activation_buffer,
                        /*output_offset=*/model->embedding_dim * (input_batch_size - num_block_output_tokens) * sizeof(float),
                        &context->control_buffer,
                        /*control_offset=*/0,
                        /*num_tokens=*/num_block_output_tokens,
                        /*num_cols=*/model->num_heads * model->head_dim,
                        /*num_rows=*/model->embedding_dim);
                    if (status != gptoss_status_success) {
                        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul_attn_output kernel launch");
                        return status;
                    }
                } else {
                    status = gptoss_metal_command_buffer_encode_launch_f32_bf16w_matmul_add(
                        command_buffer,
                        &model->f32_bf16w_matmul_fn,
                        model->attn_out_threadgroup_size,
                        &context->sdpa_activation_buffer,
                        /*input_offset=*/0,
                        &model->shared_weight_buffer,
                        /*weight_offset=*/model->attn_out_weight_offset + model->per_block_shared_weights_size * n,
                        &model->shared_weight_buffer,
                        /*bias_offset=*/model->attn_out_bias_offset + model->per_block_shared_weights_size * n,
                        &context->residual_activation_buffer,
                        /*output_offset=*/model->embedding_dim * (input_batch_size - num_block_output_tokens) * sizeof(float),
                        &context->control_buffer,
                        /*control_offset=*/0,
                        /*num_tokens=*/num_block_output_tokens,
                        /*num_cols=*/model->num_heads * model->head_dim,
                        /*num_rows=*/model->embedding_dim);
                    if (status != gptoss_status_success) {
                        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul_add kernel launch");
                        return status;
                    }
                }
                status = gptoss_metal_command_buffer_encode_launch_f32_bf16w_rmsnorm(
                    command_buffer,
                    &model->f32_bf16w_rmsnorm_fn,
                    &context->residual_activation_buffer,
                    /*input_offset=*/model->embedding_dim * (input_batch_size - num_block_output_tokens) * sizeof(float),
                    &model->shared_weight_buffer,
                    /*weight_offset=*/model->mlp_rmsnorm_gain_offset + model->per_block_shared_weights_size * n,
                    &context->rmsnorm_activation_buffer,
                    /*output_offset=*/0,
                    &context->control_buffer,
                    /*control_offset=*/0,
                    num_block_output_tokens,
                    model->embedding_dim,
                    model->rmsnorm_epsilon);
                if (status != gptoss_status_success) {
                    GPTOSS_LOG_ERROR("failed to encode f32_bf16w_rmsnorm kernel launch");
                    return status;
                }
                if (input_batch_size >= min_tokens_for_dense_matmul_kernels) {
                    status = gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_mlp_gate(
                        command_buffer,
                        &model->f32_bf16w_dense_matmul_mlp_gate_fn,
                        &context->rmsnorm_activation_buffer,
                        /*input_offset=*/0,
                        &model->shared_weight_buffer,
                        /*weight_offset=*/model->mlp_gate_weight_offset + model->per_block_shared_weights_size * n,
                        &model->shared_weight_buffer,
                        /*bias_offset=*/model->mlp_gate_bias_offset + model->per_block_shared_weights_size * n,
                        &context->gate_activation_buffer,
                        /*output_offset=*/0,
                        &context->control_buffer,
                        /*control_offset=*/0,
                        num_block_output_tokens,
                        model->embedding_dim,
                        model->num_experts);
                    if (status != gptoss_status_success) {
                        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul_mlp_gate kernel launch");
                        return status;
                    }
                } else {
                    status = gptoss_metal_command_buffer_encode_launch_f32_bf16w_matmul(
                        command_buffer,
                        &model->f32_bf16w_matmul_fn,
                        model->mlp_gate_threadgroup_size,
                        &context->rmsnorm_activation_buffer,
                        /*input_offset=*/0,
                        &model->shared_weight_buffer,
                        /*weight_offset=*/model->mlp_gate_weight_offset + model->per_block_shared_weights_size * n,
                        &model->shared_weight_buffer,
                        /*bias_offset=*/model->mlp_gate_bias_offset + model->per_block_shared_weights_size * n,
                        &context->gate_activation_buffer,
                        /*output_offset=*/0,
                        &context->control_buffer,
                        /*control_offset=*/0,
                        /*num_tokens=*/num_block_output_tokens,
                        /*num_cols=*/model->embedding_dim,
                        /*num_rows=*/model->num_experts);
                    if (status != gptoss_status_success) {
                        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul kernel launch");
                        return status;
                    }
                }

                const char* kernel_name = NULL;
                switch (model->num_experts) {
                    case 32:
                        kernel_name = "f32_topk_softmax_e32_k4_fn";
                        status = gptoss_metal_command_buffer_encode_launch_f32_topk(
                            command_buffer,
                            &model->f32_topk_softmax_e32_k4_fn,
                            &context->gate_activation_buffer, /*input_offset=*/0,
                            &context->expert_activation_buffer, /*output_offset=*/0,
                            &context->control_buffer, /*control_offset=*/0,
                            num_block_output_tokens,
                            model->num_experts,
                            model->num_active_experts);
                        break;
                    case 128:
                        kernel_name = "f32_topk_softmax_e128_k4_fn";
                        status = gptoss_metal_command_buffer_encode_launch_f32_topk(
                            command_buffer,
                            &model->f32_topk_softmax_e128_k4_fn,
                            &context->gate_activation_buffer, /*input_offset=*/0,
                            &context->expert_activation_buffer, /*output_offset=*/0,
                            &context->control_buffer, /*control_offset=*/0,
                            num_block_output_tokens,
                            model->num_experts,
                            model->num_active_experts);
                        break;
                    default:
                        status = gptoss_status_unsupported_argument;
                        GPTOSS_LOG_ERROR("missing Top-K kernel for %" PRIu32 " experts", model->num_experts);
                        return status;
                }
                if (status != gptoss_status_success) {
                    GPTOSS_LOG_ERROR("failed to encode %s kernel launch", kernel_name);
                    return status;
                }

                // If we have enough tokens in prefill, we will pick the prefill-optimized kernels.
                if (num_block_output_tokens >= min_tokens_for_dense_moe_kernels) {
                    status = gptoss_metal_command_buffer_encode_launch_expert_routing_metadata(
                        command_buffer,
                        &model->f32_expert_routing_metadata_fn,
                        &context->expert_activation_buffer,
                        /*expert_predictions_offset=*/0,
                        &context->expert_offset_buffer,
                        /*expert_offsets_offset=*/0,
                        &context->token_to_expert_routing_buffer,
                        /*intra_expert_offsets_offset=*/0,
                        num_block_output_tokens * model->num_active_experts,
                        model->num_experts);
                    if (status != gptoss_status_success) {
                        GPTOSS_LOG_ERROR("failed to encode f32_expert_routing_metadata kernel launch");
                        return status;
                    }
                    status = gptoss_metal_command_buffer_encode_launch_f32_scatter(
                        command_buffer,
                        &model->f32_scatter_e4_fn,
                        &context->rmsnorm_activation_buffer,
                        /*input_offset=*/0,
                        &context->expert_activation_buffer,
                        /*expert_predictions_offset=*/0,
                        &context->expert_offset_buffer,
                        /*expert_offsets_offset=*/0,
                        &context->token_to_expert_routing_buffer,
                        /*intra_expert_offsets_offset=*/0,
                        &context->swiglu_input_buffer,
                        /*output_offset=*/0,
                        model->embedding_dim,
                        num_block_output_tokens,
                        model->num_active_experts);
                    if (status != gptoss_status_success) {
                        GPTOSS_LOG_ERROR("failed to encode f32_scatter kernel launch");
                        return status;
                    } 
                    // Dense MoE SwiGLU matmul.
                    status = gptoss_metal_command_buffer_encode_launch_f32_mf4w_moe_dense_matmul_swiglu(
                        command_buffer,
                        &model->f32_mf4w_moe_dense_matmul_swiglu_fn,
                        &context->expert_offset_buffer,
                        /*expert_offsets_offset=*/0,
                        &context->swiglu_input_buffer,
                        /*input_offset=*/0,
                        &model->block_weight_buffers[n],
                        /*weight_block_offset=*/0,
                        &model->block_weight_buffers[n],
                        /*weight_scale_offset=*/model->mlp_swiglu_scale_offset,
                        &model->block_weight_buffers[n],
                        /*bias_offset=*/model->mlp_swiglu_bias_offset,
                        &context->swiglu_activation_buffer,
                        /*output_offset=*/0,
                        model->swiglu_limit,
                        /*expert_stride_bytes=*/model->per_expert_block_weight_size,
                        num_block_output_tokens,
                        model->num_experts,
                        model->embedding_dim,
                        2 * model->mlp_dim);
                    if (status != gptoss_status_success) {
                        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_dense_matmul_swiglu kernel launch");
                        return status;
                    }

                    // Dense MoE proj matmul.
                    status = gptoss_metal_command_buffer_encode_launch_f32_mf4w_moe_dense_matmul(
                        command_buffer,
                        &model->f32_mf4w_moe_dense_matmul_fn,
                        &context->expert_offset_buffer,
                        /*expert_offsets_offset=*/0,
                        &context->swiglu_activation_buffer,
                        /*input_offset=*/0,
                        &model->block_weight_buffers[n],
                        /*weight_block_offset=*/model->mlp_out_block_offset,
                        &model->block_weight_buffers[n],
                        /*weight_scale_offset=*/model->mlp_out_scale_offset,
                        &model->block_weight_buffers[n],
                        /*bias_offset=*/model->mlp_out_bias_offset,
                        &context->moe_activation_buffer,
                        /*output_offset=*/0,
                        /*expert_stride_bytes=*/model->per_expert_block_weight_size,
                        num_block_output_tokens,
                        model->num_experts,
                        model->mlp_dim,
                        model->embedding_dim);
                    if (status != gptoss_status_success) {
                        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_dense_matmul_swiglu kernel launch");
                        return status;
                    }
                    // Gather and accumulate.
                    status = gptoss_metal_command_buffer_encode_launch_f32_gather_and_accumulate_e4(
                        command_buffer,
                        &model->f32_gather_and_accumulate_e4_fn,
                        &context->moe_activation_buffer,
                        /*input_offset=*/0,
                        &context->expert_activation_buffer,
                        /*expert_predictions_offset=*/0,
                        &context->expert_offset_buffer,
                        /*expert_offsets_offset=*/0,
                        &context->token_to_expert_routing_buffer,
                        /*intra_expert_offsets_offset=*/0,
                        &context->residual_activation_buffer, 
                        /*output_offset=*/model->embedding_dim * (input_batch_size - num_block_output_tokens) * sizeof(float),
                        model->embedding_dim,
                        num_block_output_tokens,
                        model->num_active_experts);
                    if (status != gptoss_status_success) {
                        GPTOSS_LOG_ERROR("failed to encode f32_gather_and_accumulate_e4 kernel launch");
                        return status;
                    }

                } else {
                    status = gptoss_metal_command_buffer_encode_launch_f32_mf4w_moe_matmul_swiglu(
                        command_buffer,
                        &model->f32_mf4w_moe_matmul_swiglu_fn,
                        model->mlp_swiglu_threadgroup_size,
                        &context->rmsnorm_activation_buffer,
                        /*input_offset=*/0,
                        &context->expert_activation_buffer,
                        /*expert_offset=*/0,
                        &model->block_weight_buffers[n],
                        /*weight_block_offset=*/0,
                        &model->block_weight_buffers[n],
                        /*weight_scale_offset=*/model->mlp_swiglu_scale_offset,
                        &model->block_weight_buffers[n],
                        /*bias_offset=*/model->mlp_swiglu_bias_offset,
                        &context->swiglu_activation_buffer,
                        /*output_offset=*/0,
                        &context->control_buffer,
                        /*control_offset=*/0,
                        model->swiglu_limit,
                        model->per_expert_block_weight_size,
                        num_block_output_tokens,
                        model->num_active_experts,
                        model->embedding_dim,
                        model->mlp_dim);
                    if (status != gptoss_status_success) {
                        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_matmul_swiglu kernel launch");
                        return status;
                    }

                    status = gptoss_metal_command_buffer_encode_launch_f32_mf4w_moe_matmul(
                        command_buffer,
                        &model->f32_mf4w_moe_matmul_fn,
                        model->mlp_out_threadgroup_size,
                        &context->swiglu_activation_buffer,
                        /*input_offset=*/0,
                        &context->expert_activation_buffer,
                        /*expert_offset=*/0,
                        &model->block_weight_buffers[n],
                        /*weight_block_offset=*/model->mlp_out_block_offset,
                        &model->block_weight_buffers[n],
                        /*weight_scale_offset=*/model->mlp_out_scale_offset,
                        &model->block_weight_buffers[n],
                        /*bias_offset=*/model->mlp_out_bias_offset,
                        &context->moe_activation_buffer,
                        /*output_offset=*/0,
                        &context->control_buffer,
                        /*control_offset=*/0,
                        model->per_expert_block_weight_size,
                        num_block_output_tokens,
                        model->num_active_experts,
                        model->mlp_dim,
                        model->embedding_dim);
                    if (status != gptoss_status_success) {
                        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_matmul kernel launch");
                        return status;
                    }

                    status = gptoss_metal_command_buffer_encode_launch_f32_accumulate(
                        command_buffer,
                        &model->f32_accumulate_e4_fn,
                        model->mlp_acc_threadgroup_size,
                        model->max_threadgroups,
                        &context->moe_activation_buffer,
                        /*input_offset=*/0,
                        &context->expert_activation_buffer,
                        /*expert_offset=*/0,
                        &context->residual_activation_buffer,
                        /*output_offset=*/model->embedding_dim * (input_batch_size - num_block_output_tokens) * sizeof(float),
                        &context->control_buffer,
                        /*control_offset=*/0,
                        model->embedding_dim,
                        num_block_output_tokens,
                        model->num_active_experts);
                    if (status != gptoss_status_success) {
                        GPTOSS_LOG_ERROR("failed to encode f32_accumulate kernel launch");
                        return status;
                    }
                }
            }
        }

        if (output_batch_size != 0) {
            status = gptoss_metal_command_buffer_encode_launch_f32_bf16w_rmsnorm(
                command_buffer,
                &model->f32_bf16w_rmsnorm_fn,
                &context->residual_activation_buffer,
                /*input_offset=*/model->embedding_dim * (input_batch_size - output_batch_size) * sizeof(float),
                &model->shared_weight_buffer,
                /*weight_offset=*/model->rmsnorm_weight_offset,
                &context->rmsnorm_activation_buffer,
                /*output_offset=*/0,
                &context->control_buffer,
                /*control_offset=*/0,
                /*num_tokens=*/output_batch_size,
                /*num_channels=*/model->embedding_dim,
                model->rmsnorm_epsilon);
            if (status != gptoss_status_success) {
                GPTOSS_LOG_ERROR("failed to encode f32_bf16w_rmsnorm kernel launch");
                return status;
            }

            status = gptoss_metal_command_buffer_encode_fill_buffer(
                command_buffer,
                &context->argmax_buffer,
                /*offset=*/0,
                /*size=*/sizeof(uint64_t) * output_batch_size,
                /*fill_value=*/0xFF);
            if (status != gptoss_status_success) {
                GPTOSS_LOG_ERROR("failed to encode fill buffer command");
                return status;
            }

            status = gptoss_metal_command_buffer_encode_launch_f32_bf16w_unembedding(
                command_buffer,
                &model->f32_bf16w_unembedding_fn,
                model->unembedding_threadgroup_size,
                model->max_threadgroups,
                &context->rmsnorm_activation_buffer,
                /*input_offset=*/0,
                &model->shared_weight_buffer,
                /*weight_offset=*/model->unembedding_weight_offset,
                &context->score_buffer,
                /*output_offset=*/0,
                &context->argmax_buffer,
                /*argmax_offset=*/0,
                &context->control_buffer,
                /*control_offset=*/0,
                /*num_tokens=*/output_batch_size,
                /*num_cols=*/model->embedding_dim,
                /*num_rows=*/model->vocabulary_size);
            if (status != gptoss_status_success) {
                GPTOSS_LOG_ERROR("failed to encode f32_bf16w_unembedding kernel launch");
                return status;
            }
        }
    }
    return gptoss_status_success;
}

enum gptoss_status GPTOSS_ABI gptoss_context_append_chars(
    gptoss_context_t context,
    const char* text,
    size_t text_length,
    size_t* num_tokens_out)
{
    enum gptoss_status status = gptoss_status_success;
    const struct gptoss_model* model = context->model;
    const struct gptoss_tokenizer* tokenizer = model->tokenizer;
    size_t num_appended_tokens = 0;
    while (text_length != 0) {
        if (context->num_tokens == context->max_tokens) {
            status = gptoss_status_context_overflow;
            break;
        }
        const char* tokens = tokenizer->tokens_ptr;
        uint32_t best_token = UINT32_MAX;
        uint32_t best_token_length = 0;
        for (size_t t = 0; t < tokenizer->num_text_tokens; t++) {
            uint16_t token_length;
            memcpy(&token_length, tokens, sizeof(uint16_t));
            tokens += sizeof(uint16_t);
            if (token_length <= text_length && token_length > best_token_length) {
                if (memcmp(text, tokens, token_length) == 0) {
                    if (token_length > best_token_length) {
                        best_token = (uint32_t) t;
                        best_token_length = token_length;
                    }
                }
            }
            tokens += token_length;
        }

        if (best_token == UINT32_MAX) {
            GPTOSS_LOG_ERROR("failed to tokenize text \"%.*s\"", (int) text_length, text);
            return gptoss_status_invalid_argument;
        }

        uint32_t* input_tokens = (uint32_t*) context->token_buffer.ptr;
        if (context->num_kv_tokens > context->num_tokens) {
            if (input_tokens[context->num_tokens] != best_token) {
                input_tokens[context->num_tokens] = best_token;

                // Invalidate the KV cache starting with the newly added token.
                context->num_kv_tokens = context->num_tokens;
            }
            context->num_tokens++;
        } else {
            input_tokens[context->num_tokens++] = best_token;
        }
        num_appended_tokens++;
        text += best_token_length;
        text_length -= best_token_length;
    }
    if (num_tokens_out != NULL) {
        *num_tokens_out = num_appended_tokens;
    }
    return status;
}

enum gptoss_status GPTOSS_ABI gptoss_context_append_tokens(
    gptoss_context_t context,
    size_t num_tokens,
    const uint32_t* tokens)
{
    const struct gptoss_model* model = context->model;

    // Validate all tokens
    for (size_t t = 0; t < num_tokens; t++) {
        const uint32_t token = tokens[t];
        if (token >= model->vocabulary_size) {
            GPTOSS_LOG_ERROR("token %" PRIu32 " at index %zu is out of bounds for vocabulary size %" PRIu32,
                token, t, context->model->vocabulary_size);
            return gptoss_status_invalid_argument;
        }
    }

    enum gptoss_status status = gptoss_status_success;
    uint32_t* input_tokens = (uint32_t*) context->token_buffer.ptr;
    while (num_tokens != 0) {
        if (context->num_tokens == context->max_tokens) {
            status = gptoss_status_context_overflow;
            break;
        }

        if (context->num_kv_tokens > context->num_tokens) {
            const size_t num_tokens_to_verify = math_min(context->num_kv_tokens - context->num_tokens, num_tokens);
            size_t num_verified_tokens = 0;
            for (; num_verified_tokens < num_tokens_to_verify; num_verified_tokens++) {
                if (input_tokens[context->num_tokens + num_verified_tokens] != tokens[num_verified_tokens]) {
                    // Invalidate the KV cache starting with the newly added tokens.
                    context->num_kv_tokens = context->num_tokens + num_verified_tokens;
                    break;
                }
            }

            context->num_tokens += num_verified_tokens;
            tokens += num_verified_tokens;
            num_tokens -= num_verified_tokens;
        } else {
            const size_t num_tokens_to_copy = math_min(context->max_tokens - context->num_tokens, num_tokens);
            memcpy(input_tokens + context->num_tokens, tokens, num_tokens_to_copy * sizeof(uint32_t));
            context->num_tokens += num_tokens_to_copy;
            tokens += num_tokens_to_copy;
            num_tokens -= num_tokens_to_copy;
        }
    }

    return status;
}

enum gptoss_status GPTOSS_ABI gptoss_context_process(
    gptoss_context_t context)
{
    if (context->num_tokens > context->num_kv_tokens) {
        struct gptoss_metal_command_buffer command_buffer = {0};

        enum gptoss_status status = gptoss_metal_command_buffer_create(&context->model->command_queue, &command_buffer);
        if (status != gptoss_status_success) {
            goto cleanup;
        }

        struct gptoss_control* control = (struct gptoss_control*) context->control_buffer.ptr;
        control->abort = 0;

        status = process_tokens(
            context,
            &command_buffer,
            /*input_tokens_offset=*/context->num_kv_tokens,
            /*num_input_tokens=*/context->num_tokens - context->num_kv_tokens,
            /*num_output_tokens=*/0);
        if (status != gptoss_status_success) {
            goto cleanup;
        }

        status = gptoss_metal_command_buffer_commit(&command_buffer);
        if (status != gptoss_status_success) {
            goto cleanup;
        }

        status = gptoss_metal_command_buffer_wait_completion(&command_buffer, NULL);
        if (status != gptoss_status_success) {
            goto cleanup;
        }

        context->num_kv_tokens = context->num_tokens;

cleanup:
        gptoss_metal_command_buffer_release(&command_buffer);
        return status;
    }
    
    return gptoss_status_success;
}

enum gptoss_status GPTOSS_ABI gptoss_context_sample(
    gptoss_context_t context,
    float temperature,
    uint64_t seed,
    size_t max_tokens,
    uint32_t* tokens_out,
    size_t* num_tokens_out)
{
    enum gptoss_status status = gptoss_status_success;
    const struct gptoss_model* model = context->model;
    struct gptoss_metal_command_buffer command_buffer = {0};

    *num_tokens_out = 0;

    const uint32_t num_original_tokens = context->num_tokens;

    status = gptoss_metal_command_buffer_create(&context->model->command_queue, &command_buffer);
    if (status != gptoss_status_success) {
        goto cleanup;
    }

    struct gptoss_control* control = (struct gptoss_control*) context->control_buffer.ptr;
    control->abort = 0;

    for (size_t t = 0; t < max_tokens; t++) {
        if (context->num_kv_tokens < context->num_tokens) {
            status = process_tokens(
                context,
                &command_buffer,
                /*input_tokens_offset=*/context->num_kv_tokens,
                /*num_input_tokens=*/context->num_tokens - context->num_kv_tokens,
                /*num_output_tokens=*/1);
            context->num_kv_tokens = context->num_tokens;
        } else {
            status = process_tokens(
                context,
                &command_buffer,
                /*input_tokens_offset=*/context->num_tokens - 1,
                /*num_input_tokens=*/1,
                /*num_output_tokens=*/1);
        }
        if (status != gptoss_status_success) {
            goto cleanup;
        }

        if (temperature != 0.0f) {
            assert(context->num_processed_tokens != 0);
            uint32_t num_threadgroups = 0;
            uint32_t num_dims_per_threadgroup = 0;
            status = gptoss_metal_command_buffer_encode_launch_f32_softmax(
                &command_buffer,
                &model->f32_softmax_fn,
                /*threadgroup_size=*/512,
                model->max_threadgroups,
                &context->score_buffer,
                /*score_offset=*/0,
                &context->argmax_buffer,
                /*argmax_offset=*/0,
                &context->prob_buffer,
                /*prob_offset=*/0,
                &context->sum_buffer,
                /*sum_offset=*/0,
                &context->control_buffer,
                /*control_offset=*/0,
                model->vocabulary_size,
                /*num_tokens=*/1,
                temperature,
                &num_threadgroups,
                &num_dims_per_threadgroup);
            if (status != gptoss_status_success) {
                GPTOSS_LOG_ERROR("failed to encode f32_softmax kernel launch");
                goto cleanup;
            }

            status = gptoss_metal_command_buffer_encode_launch_f32_sample(
                &command_buffer,
                &model->f32_sample_fn,
                /*min_threadgroup_size=*/512,
                &context->prob_buffer,
                /*prob_offset=*/0,
                &context->sum_buffer,
                /*sum_offset=*/0,
                &context->token_buffer,
                /*token_offset=*/context->num_tokens * sizeof(uint32_t),
                &context->control_buffer,
                /*control_offset=*/0,
                /*rng_seed=*/seed + UINT64_C(0x123456789ABCDEF),
                /*rng_offset=*/context->num_tokens,
                /*num_blocks=*/num_threadgroups,
                /*num_channels=*/model->vocabulary_size,
                /*num_channels_per_block=*/num_dims_per_threadgroup);
            if (status != gptoss_status_success) {
                GPTOSS_LOG_ERROR("failed to encode f32_sample kernel launch");
                goto cleanup;
            }
        } else {
            status = gptoss_metal_command_buffer_encode_copy_buffer(
                &command_buffer,
                &context->argmax_buffer,
                /*input_offset=*/0,
                &context->token_buffer,
                /*output_offset=*/context->num_tokens * sizeof(uint32_t),
                /*size=*/sizeof(uint32_t));
            if (status != gptoss_status_success) {
                GPTOSS_LOG_ERROR("failed to encode copy buffer");
                goto cleanup;
            }
        }
        context->num_tokens += 1;
        context->num_kv_tokens = context->num_tokens;
    }

    gptoss_metal_command_buffer_commit(&command_buffer);
    gptoss_metal_command_buffer_wait_completion(&command_buffer, NULL);

    const uint32_t* token_ptr = (const uint32_t*) context->token_buffer.ptr;
    const uint32_t num_generated_tokens = context->num_tokens - num_original_tokens;
    memcpy(tokens_out, token_ptr + num_original_tokens, num_generated_tokens * sizeof(uint32_t));
    *num_tokens_out = num_generated_tokens;

cleanup:
    gptoss_metal_command_buffer_release(&command_buffer);
    return status;
}

enum gptoss_status GPTOSS_ABI gptoss_context_reset(
    gptoss_context_t context)
{
    context->num_tokens = 0;

    // Note: context->num_kv_tokens is not reset and context->input_tokens_buffer is not cleared.
    // If the subsequently added tokens match the tokens already in the KV cache, we reuse the KV cache.

    return gptoss_status_success;
}

enum gptoss_status GPTOSS_ABI gptoss_context_retain(
    gptoss_context_t context)
{
    atomic_fetch_add_explicit(&context->ref_count, 1, memory_order_relaxed);
    return gptoss_status_success;
}

enum gptoss_status GPTOSS_ABI gptoss_context_release(
    gptoss_context_t context)
{
    if (context != NULL) {
        if (atomic_fetch_sub_explicit(&context->ref_count, 1, memory_order_acq_rel) == 1) {
            // Activation buffers
            gptoss_metal_buffer_release(&context->residual_activation_buffer);
            gptoss_metal_buffer_release(&context->rmsnorm_activation_buffer);
            gptoss_metal_buffer_release(&context->qkv_activation_buffer);
            gptoss_metal_buffer_release(&context->sdpa_activation_buffer);
            gptoss_metal_buffer_release(&context->gate_activation_buffer);
            gptoss_metal_buffer_release(&context->expert_activation_buffer);
            gptoss_metal_buffer_release(&context->swiglu_activation_buffer);
            gptoss_metal_buffer_release(&context->moe_activation_buffer);
            gptoss_metal_buffer_release(&context->expert_offset_buffer);
            gptoss_metal_buffer_release(&context->token_to_expert_routing_buffer);
            gptoss_metal_buffer_release(&context->swiglu_input_buffer);

            // Input/output buffers
            gptoss_metal_buffer_release(&context->control_buffer);
            gptoss_metal_buffer_release(&context->token_buffer);
            gptoss_metal_buffer_release(&context->score_buffer);
            gptoss_metal_buffer_release(&context->prob_buffer);
            gptoss_metal_buffer_release(&context->sum_buffer);
            gptoss_metal_buffer_release(&context->argmax_buffer);
            gptoss_metal_buffer_release(&context->kvcache_buffer);

            gptoss_model_release(context->model);

            memset(context, 0, sizeof(struct gptoss_context));
            free(context);
        }
    }
    return gptoss_status_success;
}


================================================
FILE: gpt_oss/metal/source/convert.metal
================================================
#include <metal_integer>

#include <internal/kernel-args.h>

#pragma METAL fp math_mode(safe)
#pragma METAL fp contract(off)


kernel void gptoss_mf4_f32_convert(
    constant gptoss_convert_args& args [[ buffer(0) ]],
    const device uint4* blocks [[ buffer(1) ]],
    const device uchar* scales [[ buffer(2) ]],
    device float4* output [[ buffer(3) ]],
    uint gid [[threadgroup_position_in_grid]],
    uint tid [[thread_position_in_threadgroup]],
    uint threadgroup_size [[ threads_per_threadgroup ]])
{
    const ulong num_vecs_per_threadgroup = args.num_vecs_per_threadgroup;
    const ulong threadgroup_start = gid * num_vecs_per_threadgroup;
    const ulong threadgroup_end = metal::min(threadgroup_start + num_vecs_per_threadgroup, args.num_vecs);
    const ulong thread_start = threadgroup_start + tid;
    uint num_iter = static_cast<uint>((threadgroup_end - thread_start + (threadgroup_size - 1)) / threadgroup_size);

    blocks += thread_start;
    scales += thread_start;
    output += 8 * thread_start;
    for (; num_iter != 0; num_iter--) {
        const uint4 block = *blocks;
        const float scale = as_type<float>((static_cast<uint>(*scales) + 14) << 23);
        uint4 block02468ACEGIKMOQSU = block + block;
        uint4 block13579BDFHJLNPRTV = block >> 3;
        block02468ACEGIKMOQSU &= 0x1E1E1E1Eu;
        block13579BDFHJLNPRTV &= 0x1E1E1E1Eu;
        block02468ACEGIKMOQSU += 0x70707070u;
        block13579BDFHJLNPRTV += 0x70707070u;
        block02468ACEGIKMOQSU &= 0x8E8E8E8Eu;
        block13579BDFHJLNPRTV &= 0x8E8E8E8Eu;
        const uint4 block26AEIMQU = block02468ACEGIKMOQSU & 0xFF00FF00u;
        const uint4 block048CGKOS = (block02468ACEGIKMOQSU << 8) & 0xFF00FF00u;
        const uint4 block37BFJNRV = block13579BDFHJLNPRTV & 0xFF00FF00u;
        const uint4 block159DHLPT = (block13579BDFHJLNPRTV << 8) & 0xFF00FF00u;
        const float4 block048C = static_cast<float4>(as_type<half4>(block048CGKOS.xy)) * scale;
        const float4 blockGKOS = static_cast<float4>(as_type<half4>(block048CGKOS.zw)) * scale;
        const float4 block26AE = static_cast<float4>(as_type<half4>(block26AEIMQU.xy)) * scale;
        const float4 blockIMQU = static_cast<float4>(as_type<half4>(block26AEIMQU.zw)) * scale;
        const float4 block159D = static_cast<float4>(as_type<half4>(block159DHLPT.xy)) * scale;
        const float4 blockHLPT = static_cast<float4>(as_type<half4>(block159DHLPT.zw)) * scale;
        const float4 block37BF = static_cast<float4>(as_type<half4>(block37BFJNRV.xy)) * scale;
        const float4 blockJNRV = static_cast<float4>(as_type<half4>(block37BFJNRV.zw)) * scale;

        output[0] = (float4) { block048C.x, block159D.x, block26AE.x, block37BF.x };
        output[1] = (float4) { block048C.y, block159D.y, block26AE.y, block37BF.y };
        output[2] = (float4) { block048C.z, block159D.z, block26AE.z, block37BF.z };
        output[3] = (float4) { block048C.w, block159D.w, block26AE.w, block37BF.w };
        output[4] = (float4) { blockGKOS.x, blockHLPT.x, blockIMQU.x, blockJNRV.x };
        output[5] = (float4) { blockGKOS.y, blockHLPT.y, blockIMQU.y, blockJNRV.y };
        output[6] = (float4) { blockGKOS.z, blockHLPT.z, blockIMQU.z, blockJNRV.z };
        output[7] = (float4) { blockGKOS.w, blockHLPT.w, blockIMQU.w, blockJNRV.w };

        blocks += threadgroup_size;
        scales += threadgroup_size;
        output += 8 * threadgroup_size;
    }
}


================================================
FILE: gpt_oss/metal/source/embeddings.metal
================================================
#include <internal/kernel-args.h>

#pragma METAL fp math_mode(safe)
#pragma METAL fp contract(off)


kernel void gptoss_bf16_f32_embeddings(
    constant gptoss_embeddings_args& args [[ buffer(0) ]],
    const device uint* tokens [[ buffer(1) ]],
    const device bfloat4* weights [[ buffer(2) ]],
    device float4* output [[ buffer(3) ]],
    const device gptoss_control* control [[ buffer(4) ]],
    uint gid [[threadgroup_position_in_grid]],
    uint tid [[thread_position_in_threadgroup]],
    uint threadgroup_size [[ threads_per_threadgroup ]])
{
    if (control->abort != 0) {
        return;
    }

    const uint t = tokens[gid];

    weights += t * args.num_vecs;
    output += gid * args.num_vecs;
    for (uint i = tid; i < args.num_vecs; i += threadgroup_size) {
        const bfloat4 w = weights[i];
        output[i] = static_cast<float4>(w);
    }
}


================================================
FILE: gpt_oss/metal/source/expert_routing_metadata.metal
================================================
#include <internal/kernel-args.h>
#include <metal_integer>
#include <metal_math>
#include <metal_stdlib>

constant uint kMaxExperts = 128;

kernel void gptoss_f32_expert_routing_metadata(
    constant gptoss_expert_routing_metadata_args& args [[ buffer(0) ]],
    const device gptoss_expert_prediction* __restrict__ expert_predictions [[ buffer(1) ]],
    device uint* __restrict__ expert_offsets [[ buffer(2) ]],
    device uint* __restrict__ intra_expert_offsets [[ buffer(3) ]],
    uint tg_size [[threads_per_threadgroup]],
    uint tid [[thread_position_in_threadgroup]]) 
{
    assert(args.num_experts <= kMaxExperts);
    // Create threadgroup mem and initialize it to 0.
    threadgroup metal::atomic_uint tg_counts[kMaxExperts];
    for (uint e = tid; e < args.num_experts; e += tg_size) {
        metal::atomic_store_explicit(&tg_counts[e], 0u, metal::memory_order_relaxed);
    }

    threadgroup_barrier(metal::mem_flags::mem_threadgroup);

    for (uint i = tid; i < args.tokens; i += tg_size) {
        const uint e = expert_predictions[i].expert_id;
        const uint r = metal::atomic_fetch_add_explicit(&tg_counts[e], 1u, metal::memory_order_relaxed);
        intra_expert_offsets[i] = r;
    }
    threadgroup_barrier(metal::mem_flags::mem_threadgroup);

    if (tid == 0) {
        uint total = 0;
        for (uint e = 0; e < args.num_experts; ++e) {
            const uint bin = metal::atomic_load_explicit(&tg_counts[e], metal::memory_order_relaxed);
            expert_offsets[e] = total;
            total += bin;
        }
        expert_offsets[args.num_experts] = total;
    }
}

================================================
FILE: gpt_oss/metal/source/gather_and_accumulate.metal
================================================
#include <internal/kernel-args.h>
#include <metal_integer>
#include <metal_math>
#include <metal_stdlib>

// TODO(ibrahim): This is not optimal as each thread only gathers and accumulates a single float4. To amortize the
// cost of reading the expert, offset and scales for a token, we should let each thread gather and accumulate several
// float4s.
kernel void gptoss_f32_gather_and_accumulate_e4(
    constant gptoss_gather_args& args [[ buffer(0) ]],
    const device float* in [[ buffer(1) ]],
    const device gptoss_expert_prediction* __restrict__ expert_predictions [[ buffer(2) ]],
    const device uint* expert_offsets [[ buffer(3) ]],
    const device uint* intra_expert_offsets [[ buffer(4) ]],
    device float* out [[ buffer(5) ]],
    uint3 gid [[thread_position_in_grid]]) 
{
    const uint T = args.tokens;
    const uint k = args.active_experts_per_token;
    const uint D = args.token_stride;

    assert((D & 3u) == 0);
    assert(k == 4);

    const uint row = gid.y;
    if (row >= T) {
        return;
    }

    const uint col_vec4 = gid.x;
    const uint col = col_vec4 * 4u;
    if (col >= D) {
        return;
    }

    device float4* dst4 = reinterpret_cast<device float4*>(out + row * D + col);

    const uint base = row * k;
    const gptoss_expert_prediction expert0 = expert_predictions[base];
    const gptoss_expert_prediction expert1 = expert_predictions[base + 1];
    const gptoss_expert_prediction expert2 = expert_predictions[base + 2];
    const gptoss_expert_prediction expert3 = expert_predictions[base + 3];
    const uint expert0_id = expert0.expert_id;
    const uint expert1_id = expert1.expert_id;
    const uint expert2_id = expert2.expert_id;
    const uint expert3_id = expert3.expert_id;
    const float scale0 = expert0.score;
    const float scale1 = expert1.score;
    const float scale2 = expert2.score;
    const float scale3 = expert3.score;
    const uint4 current_intra_expert_offsets =
        *reinterpret_cast<const device uint4*>(&intra_expert_offsets[base]);
    // Get the row indices for the current expert ids
    const uint r0 = expert_offsets[expert0_id] + current_intra_expert_offsets.x;
    const uint r1 = expert_offsets[expert1_id] + current_intra_expert_offsets.y;
    const uint r2 = expert_offsets[expert2_id] + current_intra_expert_offsets.z;
    const uint r3 = expert_offsets[expert3_id] + current_intra_expert_offsets.w;

    const device float4* src0 =
        reinterpret_cast<const device float4*>(in + r0 * D + col);
    const device float4* src1 =
        reinterpret_cast<const device float4*>(in + r1 * D + col);
    const device float4* src2 =
        reinterpret_cast<const device float4*>(in + r2 * D + col);
    const device float4* src3 =
        reinterpret_cast<const device float4*>(in + r3 * D + col);

    float4 acc = *dst4;
    acc = metal::fma(*src0, scale0, acc);
    acc = metal::fma(*src1, scale1, acc);
    acc = metal::fma(*src2, scale2, acc);
    acc = metal::fma(*src3, scale3, acc);
    *dst4 = acc;
}

================================================
FILE: gpt_oss/metal/source/generate.c
================================================
#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <signal.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <mach/mach_time.h>

#include <gpt-oss.h>

#include "internal/model.h"

struct {
    atomic_uint_least64_t inference_bytes;
    atomic_size_t num_prefill_tokens;
    atomic_uint_least64_t prefill_microseconds;
    atomic_size_t num_generated_tokens;
    atomic_uint_least64_t generation_microseconds;
} globals = {
    .inference_bytes = 0,
    .num_prefill_tokens = 0,
    .prefill_microseconds = 0,
    .num_generated_tokens = 0,
    .generation_microseconds = 0,
};

struct options {
    const char* model;
    const char* prompt;
    size_t context_length;
    size_t max_tokens;
    float temperature;
    bool verbose;
};

static inline double mach_timestamp_diff_to_seconds(uint64_t start_timestamp, uint64_t end_timestamp) {
    static mach_timebase_info_data_t timebase_info = {0};
    if (timebase_info.denom == 0) {
        mach_timebase_info(&timebase_info);
    }
    const uint64_t elapsed_mach_time = end_timestamp - start_timestamp;
    return ((double) elapsed_mach_time * (double) timebase_info.numer) / ((double) timebase_info.denom * 1.0e+9);
}

static inline uint64_t mach_timestamp_diff_to_microseconds(uint64_t start_timestamp, uint64_t end_timestamp) {
    static mach_timebase_info_data_t timebase_info = {0};
    if (timebase_info.denom == 0) {
        mach_timebase_info(&timebase_info);
    }
    const uint64_t elapsed_mach_time = end_timestamp - start_timestamp;
    const uint64_t denominator = timebase_info.denom * UINT64_C(1000);
    return (elapsed_mach_time * timebase_info.numer + denominator / 2) / denominator;
}

static void print_usage(const char* program_name) {
    printf("Usage: %s <model-path> [-p <prompt>] [-n <tokens>]\n", program_name);
}

struct options parse_options(int argc, char** argv) {
    struct options options = (struct options) {
        .model = NULL,
        .prompt = NULL,
        .context_length = 0,
        .max_tokens = 0,
        .temperature = 0.0f,
        .verbose = false,
    };
    if (argc < 2) {
        fprintf(stderr, "Error: missing required command-line argument\n");
        print_usage(argv[0]);
        exit(EXIT_FAILURE);
    }
    for (int i = 1; i < argc; i++) {
        if (strcmp(argv[i], "--help") == 0) {
            print_usage(argv[0]);
            exit(EXIT_SUCCESS);
        } else if (strcmp(argv[i], "-p") == 0 || strcmp(argv[i], "--prompt") == 0) {
            if (i + 1 >= argc) {
                fprintf(stderr, "Error: missing argument for %s\n", argv[i]);
                print_usage(argv[0]);
                exit(EXIT_FAILURE);
            }
            options.prompt = argv[++i];
        } else if (strcmp(argv[i], "--context-length") == 0) {
            if (i + 1 >= argc) {
                fprintf(stderr, "Error: missing argument for --context-length\n");
                print_usage(argv[0]);
                exit(EXIT_FAILURE);
            }
            char* context_length_start = argv[++i];
            char* context_length_end = context_length_start;
            options.context_length = strtoul(context_length_start, &context_length_end, 10);
            if (context_length_end == context_length_start || *context_length_end != 0) {
                fprintf(stderr, "Error: failed to parse context length value \"%s\"\n", context_length_start);
                exit(EXIT_FAILURE);
            }
        } else if (strcmp(argv[i], "-n") == 0 || strcmp(argv[i], "--max-tokens") == 0) {
            if (i + 1 >= argc) {
                fprintf(stderr, "Error: missing argument for %s\n", argv[i]);
                print_usage(argv[0]);
                exit(EXIT_FAILURE);
            }
            char* max_tokens_start = argv[++i];
            char* max_tokens_end = max_tokens_start;
            options.max_tokens = strtoul(max_tokens_start, &max_tokens_end, 10);
            if (max_tokens_end == max_tokens_start || *max_tokens_end != 0) {
                fprintf(stderr, "Error: failed to max tokens value \"%s\"\n", max_tokens_start);
                exit(EXIT_FAILURE);
            }
            if (options.max_tokens == 0) {
                fprintf(stderr, "Error: invalid max tokens value %zu\n", options.max_tokens);
                exit(EXIT_FAILURE);
            }
        } else if (strcmp(argv[i], "-t") == 0 || strcmp(argv[i], "--temperature") == 0) {
            if (i + 1 >= argc) {
                fprintf(stderr, "Error: missing argument for %s\n", argv[i]);
                print_usage(argv[0]);
                exit(EXIT_FAILURE);
            }
            char* temperature_start = argv[++i];
            char* temperature_end = temperature_start;
            options.temperature = strtof(temperature_start, &temperature_end);
            if (temperature_end == temperature_start || *temperature_end != 0) {
                fprintf(stderr, "Error: failed to parse temperature value \"%s\"\n", temperature_start);
                exit(EXIT_FAILURE);
            }
            if (signbit(options.temperature) != 0 || !(options.temperature <= 2.0f)) {
                fprintf(stderr, "Error: invalid temperature value %f\n", options.temperature);
                exit(EXIT_FAILURE);
            }
        } else if (strcmp(argv[i], "-v") == 0 || strcmp(argv[i], "--verbose") == 0) {
            options.verbose = true;
        } else {
            if (options.model == NULL) {
                options.model = argv[i];
            } else {
                fprintf(stderr, "Error: unexpected command-line argument %s\n", argv[i]);
                print_usage(argv[0]);
                exit(EXIT_FAILURE);
            }
        }
    }
    if (options.model == NULL) {
        fprintf(stderr, "Error: missing required model argument\n");
        print_usage(argv[0]);
        exit(EXIT_FAILURE);
    }
    if (options.prompt == NULL) {
        fprintf(stderr, "Error: missing required prompt argument\n");
        print_usage(argv[0]);
        exit(EXIT_FAILURE);
    }
    return options;
}


static void print_profile() {
    const size_t num_prefill_tokens = atomic_load(&globals.num_prefill_tokens);
    const uint64_t prefill_microseconds = atomic_load(&globals.prefill_microseconds);
    const size_t num_generated_tokens = atomic_load(&globals.num_generated_tokens);
    const uint64_t generation_microseconds = atomic_load(&globals.generation_microseconds);
    const uint64_t inference_bytes = atomic_load(&globals.inference_bytes);
    if (num_prefill_tokens != 0 || num_generated_tokens != 0) {
        printf("\n");
    }
    if (num_prefill_tokens != 0) {
        printf("Prefill speed (%zu tokens): %.1f tokens/second\n",
            num_prefill_tokens,
            (double) num_prefill_tokens / (double) prefill_microseconds * 1.0e+6);
    }
    if (num_generated_tokens != 0) {
        printf("Generation speed (%zu tokens): %.1f tokens/second\n",
            num_generated_tokens,
            (double) num_generated_tokens / (double) generation_microseconds * 1.0e+6);
    }
}

static void ctrl_c_handler(int signum) {
    print_profile();
    exit(EXIT_SUCCESS);
}

int main(int argc, char *argv[]) {
    enum gptoss_status status;
    gptoss_model_t model = NULL;
    gptoss_tokenizer_t tokenizer = NULL;
    gptoss_context_t context = NULL;

    struct sigaction act;
    act.sa_handler = ctrl_c_handler;
    sigaction(SIGINT, &act, NULL);

    setvbuf(stdout, NULL, _IONBF, 0);

    struct options options = parse_options(argc, argv);

    const uint64_t load_start_time = mach_continuous_time();
    status = gptoss_model_create_from_file(options.model, &model);
    if (status != gptoss_status_success) {
        fprintf(stderr, "Error: failed to load model from file %s\n", options.model);
        goto error;
    }
    size_t max_model_context_length = 0;
    status = gptoss_model_get_max_context_length(model, &max_model_context_length);
    if (status != gptoss_status_success) {
        fprintf(stderr, "Error: failed to query maximum context length\n");
        goto error;
    }
    assert(max_model_context_length != 0);
    if (options.context_length == 0) {
        options.context_length = max_model_context_length;
    } else if (options.context_length > max_model_context_length) {
        fprintf(stderr, "Error: context length %zu exceeds maximum context length %zu supported by the model\n", options.context_length, max_model_context_length);
        goto error;
    }

    status = gptoss_model_get_tokenizer(model, &tokenizer);
    if (status != gptoss_status_success) {
        fprintf(stderr, "Error: failed to retrieve Tokenizer\n");
        goto error;
    }

    uint32_t return_token_id = UINT32_MAX;
    status = gptoss_tokenizer_get_special_token_id(tokenizer, gptoss_special_token_return, &return_token_id);
    if (status != gptoss_status_success) {
        fprintf(stderr, "Error: failed to query end-of-text token ID\n");
        goto error;
    }

    status = gptoss_context_create(model, options.context_length, /*max_batch_tokens=*/0, &context);
    if (status != gptoss_status_success) {
        fprintf(stderr, "Error: failed to create Context object\n");
        goto error;
    }
    if (options.verbose) {
        printf("Model weights size: %.2lf MB\n", (double) model->weights_size * 0x1.0p-20);
        printf("Model allocation size: %.2lf MB\n", (double) model->allocation_size * 0x1.0p-20);
        printf("Context allocation size: %.2lf MB\n", (double) context->allocation_size * 0x1.0p-20);
        printf("  Including KV cache: %.2lf MB\n", (double) context->kvcache_size * 0x1.0p-20);
    }

    const uint64_t load_end_time = mach_continuous_time();
    const double load_elapsed_seconds = mach_timestamp_diff_to_seconds(load_start_time, load_end_time);
    if (options.verbose) {
        printf("Loaded model in %.3f seconds\n", load_elapsed_seconds);
    }

    const uint64_t prefill_start_time = mach_continuous_time();
    size_t num_prefill_tokens = 0;
    status = gptoss_context_append_chars(context, options.prompt, strlen(options.prompt), &num_prefill_tokens);
    if (status != gptoss_status_success) {
        fprintf(stderr, "Error: failed to tokenize prompt \"%s\"\n", options.prompt);
        goto error;
    }
    atomic_store(&globals.num_prefill_tokens, num_prefill_tokens);
    status = gptoss_context_process(context);
    if (status != gptoss_status_success) {
        fprintf(stderr, "Error: failed to process Context object\n");
        goto error;
    }
    const uint64_t prefill_end_time = mach_continuous_time();

    while (options.max_tokens == 0 || atomic_load(&globals.num_generated_tokens) < options.max_tokens) {

        uint32_t predicted_token = UINT32_MAX;
        size_t num_predicted_tokens = 0;
        const uint64_t inference_start_timestamp = mach_continuous_time();
        status = gptoss_context_sample(context, options.temperature, /*rng_state=*/0, /*num_tokens=*/1, &predicted_token, &num_predicted_tokens);
        if (status != gptoss_status_success) {
            fprintf(stderr, "Error: failed to sample from the Context object\n");
            goto error;
        }
        const uint64_t inference_end_timestamp = mach_continuous_time();

        if (predicted_token == return_token_id) {
            // Yield token -> stop generation
            break;
        }

        // Unembedding: detokenize
        size_t token_size = 0;
        const void* token_ptr = NULL;
        status = gptoss_tokenizer_decode(tokenizer, predicted_token, &token_ptr, &token_size);
        if (status != gptoss_status_success) {
            fprintf(stderr, "Error: failed to detokenize predicted token %" PRIu32 "\n", predicted_token);
            goto error;
        }
        const size_t previous_num_generated_tokens = atomic_fetch_add(&globals.num_generated_tokens, 1);
        if (previous_num_generated_tokens == 0) {
            atomic_fetch_add(&globals.prefill_microseconds, mach_timestamp_diff_to_microseconds(prefill_start_time, prefill_end_time));
        } else {
            atomic_fetch_add(&globals.generation_microseconds, mach_timestamp_diff_to_microseconds(inference_start_timestamp, inference_end_timestamp));
        }
        printf("%.*s", (int) token_size, (const char*) token_ptr);

        status = gptoss_context_append_tokens(context, 1, &predicted_token);
        if (status != gptoss_status_success) {
            fprintf(stderr, "Error: failed to append predicted token %" PRIu32 " to context\n", predicted_token);
            goto error;
        }
    }

    print_profile();

    return EXIT_SUCCESS;

error:
    gptoss_context_release(context);
    gptoss_tokenizer_release(tokenizer);
    gptoss_model_release(model);
    return EXIT_FAILURE;
}


================================================
FILE: gpt_oss/metal/source/include/internal/datatype.h
================================================
#pragma once

#include <stdint.h>

#include <internal/macros.h>


typedef struct GPTOSS_DENSELY_PACKED_STRUCTURE {
    GPTOSS_ALIGN(2) uint16_t bits;
} gptoss_bfloat16;
static_assert(sizeof(gptoss_bfloat16) == 2, "bfloat16 size is not 2 bytes");


typedef struct GPTOSS_DENSELY_PACKED_STRUCTURE {
    GPTOSS_ALIGN(2) uint16_t bits;
} gptoss_float16;
static_assert(sizeof(gptoss_float16) == 2, "float16 size is not 2 bytes");


typedef struct GPTOSS_DENSELY_PACKED_STRUCTURE {
    GPTOSS_ALIGN(1) uint8_t bits;
} gptoss_float8ue8m0;
static_assert(sizeof(gptoss_float8ue8m0) == 1, "gptoss_float8ue8m0 size is not 1 bytes");


typedef struct GPTOSS_DENSELY_PACKED_STRUCTURE {
    GPTOSS_ALIGN(1) uint8_t bits;
} gptoss_float8e5m2;
static_assert(sizeof(gptoss_float8e5m2) == 1, "float8e5m2 size is not 1 bytes");


typedef struct GPTOSS_DENSELY_PACKED_STRUCTURE {
    GPTOSS_ALIGN(1) uint8_t bits;
} gptoss_float8e4m3;
static_assert(sizeof(gptoss_float8e4m3) == 1, "gptoss_float8e4m3 size is not 1 bytes");


typedef struct GPTOSS_DENSELY_PACKED_STRUCTURE {
    GPTOSS_ALIGN(1) uint8_t bits;
} gptoss_float4e2m1x2;
static_assert(sizeof(gptoss_float4e2m1x2) == 1, "gptoss_float4e2m1x2 size is not 1 bytes");


================================================
FILE: gpt_oss/metal/source/include/internal/datatype.hpp
================================================
#pragma once

#include <bit>

#include <internal/datatype.h>


namespace gptoss {

template <typename WideT, typename NarrowT>
WideT upcast(NarrowT);

template <>
inline float upcast<float>(gptoss_bfloat16 bf16_value) {
    const uint32_t bits = static_cast<uint32_t>(bf16_value.bits) << 16;
    return std::bit_cast<float>(bits);
}

template <>
inline float upcast<float>(gptoss_float16 fp16_value) {
    return static_cast<float>(std::bit_cast<_Float16>(fp16_value.bits));
}

template <>
inline float upcast<float>(gptoss_float8e4m3 fp8_value) {
    static constexpr uint16_t fp8e4m3_to_fp32[256] = {
        0x0000, 0x3B00, 0x3B80, 0x3BC0, 0x3C00, 0x3C20, 0x3C40, 0x3C60,
        0x3C80, 0x3C90, 0x3CA0, 0x3CB0, 0x3CC0, 0x3CD0, 0x3CE0, 0x3CF0,
        0x3D00, 0x3D10, 0x3D20, 0x3D30, 0x3D40, 0x3D50, 0x3D60, 0x3D70,
        0x3D80, 0x3D90, 0x3DA0, 0x3DB0, 0x3DC0, 0x3DD0, 0x3DE0, 0x3DF0,
        0x3E00, 0x3E10, 0x3E20, 0x3E30, 0x3E40, 0x3E50, 0x3E60, 0x3E70,
        0x3E80, 0x3E90, 0x3EA0, 0x3EB0, 0x3EC0, 0x3ED0, 0x3EE0, 0x3EF0,
        0x3F00, 0x3F10, 0x3F20, 0x3F30, 0x3F40, 0x3F50, 0x3F60, 0x3F70,
        0x3F80, 0x3F90, 0x3FA0, 0x3FB0, 0x3FC0, 0x3FD0, 0x3FE0, 0x3FF0,
        0x4000, 0x4010, 0x4020, 0x4030, 0x4040, 0x4050, 0x4060, 0x4070,
        0x4080, 0x4090, 0x40A0, 0x40B0, 0x40C0, 0x40D0, 0x40E0, 0x40F0,
        0x4100, 0x4110, 0x4120, 0x4130, 0x4140, 0x4150, 0x4160, 0x4170,
        0x4180, 0x4190, 0x41A0, 0x41B0, 0x41C0, 0x41D0, 0x41E0, 0x41F0,
        0x4200, 0x4210, 0x4220, 0x4230, 0x4240, 0x4250, 0x4260, 0x4270,
        0x4280, 0x4290, 0x42A0, 0x42B0, 0x42C0, 0x42D0, 0x42E0, 0x42F0,
        0x4300, 0x4310, 0x4320, 0x4330, 0x4340, 0x4350, 0x4360, 0x4370,
        0x4380, 0x4390, 0x43A0, 0x43B0, 0x43C0, 0x43D0, 0x43E0, 0x7FF0,
        0x8000, 0xBB00, 0xBB80, 0xBBC0, 0xBC00, 0xBC20, 0xBC40, 0xBC60,
        0xBC80, 0xBC90, 0xBCA0, 0xBCB0, 0xBCC0, 0xBCD0, 0xBCE0, 0xBCF0,
        0xBD00, 0xBD10, 0xBD20, 0xBD30, 0xBD40, 0xBD50, 0xBD60, 0xBD70,
        0xBD80, 0xBD90, 0xBDA0, 0xBDB0, 0xBDC0, 0xBDD0, 0xBDE0, 0xBDF0,
        0xBE00, 0xBE10, 0xBE20, 0xBE30, 0xBE40, 0xBE50, 0xBE60, 0xBE70,
        0xBE80, 0xBE90, 0xBEA0, 0xBEB0, 0xBEC0, 0xBED0, 0xBEE0, 0xBEF0,
        0xBF00, 0xBF10, 0xBF20, 0xBF30, 0xBF40, 0xBF50, 0xBF60, 0xBF70,
        0xBF80, 0xBF90, 0xBFA0, 0xBFB0, 0xBFC0, 0xBFD0, 0xBFE0, 0xBFF0,
        0xC000, 0xC010, 0xC020, 0xC030, 0xC040, 0xC050, 0xC060, 0xC070,
        0xC080, 0xC090, 0xC0A0, 0xC0B0, 0xC0C0, 0xC0D0, 0xC0E0, 0xC0F0,
        0xC100, 0xC110, 0xC120, 0xC130, 0xC140, 0xC150, 0xC160, 0xC170,
        0xC180, 0xC190, 0xC1A0, 0xC1B0, 0xC1C0, 0xC1D0, 0xC1E0, 0xC1F0,
        0xC200, 0xC210, 0xC220, 0xC230, 0xC240, 0xC250, 0xC260, 0xC270,
        0xC280, 0xC290, 0xC2A0, 0xC2B0, 0xC2C0, 0xC2D0, 0xC2E0, 0xC2F0,
        0xC300, 0xC310, 0xC320, 0xC330, 0xC340, 0xC350, 0xC360, 0xC370,
        0xC380, 0xC390, 0xC3A0, 0xC3B0, 0xC3C0, 0xC3D0, 0xC3E0, 0xFFF0,
    };
    const gptoss_bfloat16 bf16_value{.bits = fp8e4m3_to_fp32[fp8_value.bits]};
    return upcast<float>(bf16_value);
}

template <>
inline double upcast<double>(float fp32_value) {
    return static_cast<double>(fp32_value);
}

template <>
inline double upcast<double>(gptoss_bfloat16 bf16_value) {
    const float fp32_value = upcast<float>(bf16_value);
    return upcast<double>(fp32_value);
}

template <>
inline double upcast<double>(gptoss_float16 fp16_value) {
    const float fp32_value = upcast<float>(fp16_value);
    return upcast<double>(fp32_value);
}

template <>
inline double upcast<double>(gptoss_float8e4m3 fp8_value) {
    const float fp32_value = upcast<float>(fp8_value);
    return upcast<double>(fp32_value);
}

}  // namespace gptoss


================================================
FILE: gpt_oss/metal/source/include/internal/kernel-args.h
================================================
#pragma once

#if !defined(__METAL_VERSION__)
#include <stdint.h>
#endif

// TODO(ibahmed): specalize using metal function constants.
#define QKV_Bm 64
#define QKV_Bn 64
#define QKV_Bk 32
#define QKV_Sg_Bm 32
#define QKV_Sg_Bn 32

#define ATTN_OUTPUT_Bm 32
#define ATTN_OUTPUT_Bn 64
#define ATTN_OUTPUT_Bk 64
#define ATTN_OUTPUT_Sg_Bm 32
#define ATTN_OUTPUT_Sg_Bn 16

#define MLP_GATE_Bm 64
#define MLP_GATE_Bn 16
#define MLP_GATE_Bk 64
#define MLP_GATE_Sg_Bm 16
#define MLP_GATE_Sg_Bn 16

#define MOE_DENSE_MATMUL_SWIGLU_Bm 32
#define MOE_DENSE_MATMUL_SWIGLU_Bn 64
#define MOE_DENSE_MATMUL_SWIGLU_Bk 16
#define MOE_DENSE_MATMUL_SWIGLU_Sg_Bm 32
#define MOE_DENSE_MATMUL_SWIGLU_Sg_Bn 16

#define MOE_DENSE_MATMUL_Bm 32
#define MOE_DENSE_MATMUL_Bn 64
#define MOE_DENSE_MATMUL_Bk 16
#define MOE_DENSE_MATMUL_Sg_Bm 32
#define MOE_DENSE_MATMUL_Sg_Bn 16

struct gptoss_expert_prediction {
    uint32_t expert_id;
    float score;
};

struct gptoss_control {
    uint32_t abort;
};

struct gptoss_topk_args {
    uint32_t num_vecs_per_token;
};

struct gptoss_sdpa_args {
    uint32_t qkv_dim;
    uint32_t num_kv_tokens;
    uint32_t kv_stride;
    uint32_t window;
};

struct gptoss_u32_fill_random_args {
    uint64_t num_vecs_per_threadgroup;
    uint64_t num_vecs;
    uint64_t offset;
    uint64_t seed;
};

struct gptoss_f32_fill_random_args {
    uint64_t num_vecs_per_threadgroup;
    uint64_t num_vecs;
    uint64_t offset;
    uint64_t seed;
    float scale;
    float bias;
};

struct gptoss_accumulate_args {
    uint32_t num_vecs_per_expert;
    uint32_t num_vecs_per_threadgroup;
    uint32_t num_vecs;
};

struct gptoss_convert_args {
    uint64_t num_vecs_per_threadgroup;
    uint64_t num_vecs;
};

struct gptoss_embeddings_args {
    uint32_t num_vecs;
};

struct gptoss_rmsnorm_args {
    uint32_t num_vecs;
    float num_channels;
    float epsilon;
};

struct gptoss_matmul_args {
    uint32_t num_column_vecs;
    uint32_t num_rows;
    uint32_t add;
};

struct gptoss_dense_matmul_args {
    uint32_t m;
    uint32_t n;
    uint32_t k;
};

// Specialize qkv matmul args as it writes kv directly to the KV cache buffer.
struct gptoss_dense_matmul_qkv_args {
    uint32_t m;
    uint32_t n;
    uint32_t k;
    uint32_t max_tokens;
    uint32_t token_offset;
};

struct gptoss_scatter_args {
    uint32_t tokens;
    uint32_t active_experts_per_token;
    uint32_t token_stride;
};

struct gptoss_moe_dense_matmul_swiglu_args {
    uint32_t k;
    uint32_t n;
    uint32_t weight_blocks_expert_stride_bytes;
    uint32_t weight_scales_expert_stride_bytes;
    uint32_t bias_expert_stride_bytes;
    float swiglu_min;
    float swiglu_max;
};
struct gptoss_moe_dense_matmul_args {
    uint32_t k;
    uint32_t n;
    uint32_t weight_blocks_expert_stride_bytes;
    uint32_t weight_scales_expert_stride_bytes;
    uint32_t bias_expert_stride_bytes;
};

struct gptoss_expert_routing_metadata_args {
uint32_t tokens;
    uint32_t num_experts;
};

struct gptoss_gather_args {
    uint32_t tokens;
    uint32_t active_experts_per_token;
    uint32_t token_stride;
};

struct gptoss_unembedding_args {
    uint32_t num_column_vecs;
    uint32_t num_rows_per_threadgroup;
    uint32_t num_rows;
};

struct gptoss_moe_matmul_swiglu_args {
    uint32_t num_column_vecs;
    uint32_t num_rows;
    uint32_t num_active_experts;
    uint32_t weight_expert_stride;  // in bytes
    uint32_t output_expert_stride;  // in elements
    float swiglu_min;
    float swiglu_max;
};

struct gptoss_moe_matmul_args {
    uint32_t num_column_vecs;
    uint32_t num_rows;
    uint32_t num_active_experts;
    uint32_t input_expert_stride;  // in blocks of 32 elements
    uint32_t weight_expert_stride;  // in bytes
    uint32_t output_expert_stride;  // in elements
};

struct gptoss_rope_args {
    uint32_t token_stride;
    uint32_t token_offset;
    uint32_t max_tokens;
    float freq_scale;
    float interpolation_scale;
    float yarn_offset;
    float yarn_scale;
    float yarn_multiplier;
};

struct gptoss_qkv_args {
    uint32_t num_column_vecs;
    uint32_t num_rows;
    uint32_t token_offset;
    float freq_scale;
    float interpolation_scale;
    float yarn_offset;
    float yarn_scale;
    float yarn_multiplier;
    uint32_t max_tokens;
};

struct gptoss_softmax_args {
    uint32_t num_vecs;
    uint32_t num_vecs_per_threadgroup;
    uint32_t max_threadgroups;
    float temperature;
};

struct gptoss_sample_args {
    uint64_t rng_seed;
    uint32_t rng_offset;
    uint32_t num_blocks;
    uint32_t num_dims;
    uint32_t num_dims_per_block;
};


================================================
FILE: gpt_oss/metal/source/include/internal/log.h
================================================
#pragma once

#include <stdarg.h>


void gptoss_format_log(const char* format, va_list args);

__attribute__((__format__(__printf__, 1, 2)))
inline static void gptoss_log(const char* format, ...) {
    va_list args;
    va_start(args, format);
    gptoss_format_log(format, args);
    va_end(args);
}

#define GPTOSS_LOG_ERROR(message, ...) \
    gptoss_log("Error: " message "\n", ##__VA_ARGS__)

#define GPTOSS_LOG_WARNING(message, ...) \
    gptoss_log("Warning: " message "\n", ##__VA_ARGS__)


================================================
FILE: gpt_oss/metal/source/include/internal/macros.h
================================================
#pragma once

/***** Architecture detection macros *****/

#ifdef GPTOSS_ARCH_X86_64
    #if GPTOSS_ARCH_X86_64 != 0 && GPTOSS_ARCH_X86_64 != 1
        #error "Invalid GPTOSS_ARCH_X86_64 value: must be either 0 or 1"
    #endif
#else
    #if defined(__x86_64__) || defined(_M_X64) && !defined(_M_ARM64EC)
        #define GPTOSS_ARCH_X86_64 1
    #else
        #define GPTOSS_ARCH_X86_64 0
    #endif
#endif

#ifdef GPTOSS_ARCH_ARM64
    #if GPTOSS_ARCH_ARM64 != 0 && GPTOSS_ARCH_ARM64 != 1
        #error "Invalid GPTOSS_ARCH_ARM64 value: must be either 0 or 1"
    #endif
#else
    #if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC)
        #define GPTOSS_ARCH_ARM64 1
    #else
        #define GPTOSS_ARCH_ARM64 0
    #endif
#endif

#if GPTOSS_ARCH_X86_64 + GPTOSS_ARCH_ARM64 == 0
    #error "Unsupported architecture: neither x86-64 nor ARM64 detected"
#elif GPTOSS_ARCH_X86_64 + GPTOSS_ARCH_ARM64 != 1
    #error "Inconsistent architecture detection: both x86-64 and ARM64 detection macros are specified"
#endif

/***** Compiler portability macros *****/

#ifndef GPTOSS_LIKELY
    #if defined(__GNUC__)
        #define GPTOSS_LIKELY(condition) (__builtin_expect(!!(condition), 1))
    #else
        #define GPTOSS_LIKELY(condition) (!!(condition))
    #endif
#endif

#ifndef GPTOSS_UNLIKELY
    #if defined(__GNUC__)
        #define GPTOSS_UNLIKELY(condition) (__builtin_expect(!!(condition), 0))
    #else
        #define GPTOSS_UNLIKELY(condition) (!!(condition))
    #endif
#endif

#ifndef GPTOSS_UNPREDICTABLE
    #if defined(__has_builtin)
        #if __has_builtin(__builtin_unpredictable)
            #define GPTOSS_UNPREDICTABLE(condition) (__builtin_unpredictable(!!(condition)))
        #endif
    #endif
#endif
#ifndef GPTOSS_UNPREDICTABLE
    #if defined(__GNUC__) && (__GNUC__ >= 9) && !defined(__INTEL_COMPILER)
        #define GPTOSS_UNPREDICTABLE(condition) (__builtin_expect_with_probability(!!(condition), 0, 0.5))
    #else
        #define GPTOSS_UNPREDICTABLE(condition) (!!(condition))
    #endif
#endif

// Disable padding for structure members.
#ifndef GPTOSS_DENSELY_PACKED_STRUCTURE
    #if defined(__GNUC__)
        #define GPTOSS_DENSELY_PACKED_STRUCTURE __attribute__((__packed__))
    #else
        #error "Compiler-specific implementation of GPTOSS_DENSELY_PACKED_STRUCTURE required"
    #endif
#endif

#ifndef GPTOSS_ALIGN
    #if defined(__GNUC__)
        #define GPTOSS_ALIGN(alignment) __attribute__((__aligned__(alignment)))
    #elif defined(_MSC_VER)
        #define GPTOSS_ALIGN(alignment) __declspec(align(alignment))
    #else
        #error "Compiler-specific implementation of GPTOSS_ALIGN required"
    #endif
#endif

#ifndef GPTOSS_FORCE_INLINE
    #if defined(__GNUC__)
        #define GPTOSS_FORCE_INLINE inline __attribute__((__always_inline__))
    #elif defined(_MSC_VER)
        #define GPTOSS_FORCE_INLINE __forceinline
    #else
        #define GPTOSS_FORCE_INLINE inline
    #endif
#endif

/***** Symbol visibility macros *****/

#ifndef GPTOSS_INTERNAL_SYMBOL
    #if defined(__ELF__)
        #define GPTOSS_INTERNAL_SYMBOL __attribute__((__visibility__("internal")))
    #elif defined(__MACH__)
        #define GPTOSS_INTERNAL_SYMBOL __attribute__((__visibility__("hidden")))
    #else
        #define GPTOSS_INTERNAL_SYMBOL
    #endif
#endif


================================================
FILE: gpt_oss/metal/source/include/internal/math.h
================================================
#pragma once

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

inline static size_t math_ceil_div(size_t numer, size_t denom) {
    return (numer + denom - 1) / denom;
}

inline static size_t math_max(size_t a, size_t b) {
    return a >= b ? a : b;
}

inline static size_t math_min(size_t a, size_t b) {
    return a < b ? a : b;
}

inline static size_t math_sub_sat(size_t a, size_t b) {
    return a > b ? a - b : 0;
}

static size_t math_round_down_po2(size_t number, size_t multiple) {
    assert(multiple != 0);
    assert((multiple & (multiple - 1)) == 0);

    return number & -multiple;
}

static size_t math_round_up_po2(size_t number, size_t multiple) {
    assert(multiple != 0);
    assert((multiple & (multiple - 1)) == 0);

    const size_t multiple_mask = multiple - 1;
    if ((number & multiple_mask) != 0) {
        number |= multiple_mask;
        number += 1;
    }
    return number;
}


================================================
FILE: gpt_oss/metal/source/include/internal/metal-kernels.h
================================================
#pragma once

#include <stddef.h>
#include <stdint.h>

#include <internal/metal.h>

#ifdef __cplusplus
extern "C" {
#endif

#include <stddef.h>
#include <stdint.h>

#include <internal/kernel-args.h>
#include <internal/math.h>
#include <internal/metal.h>


enum gptoss_status gptoss_metal_command_buffer_encode_launch_u32_fill_random(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* u32_fill_random_fn,
    size_t threadgroup_size,
    size_t max_threadgroups,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    uint64_t num_elements,
    uint64_t rng_seed,
    uint64_t rng_offset);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_fill_random(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_fill_random_fn,
    size_t threadgroup_size,
    size_t max_threadgroups,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    uint64_t num_elements,
    uint64_t rng_seed,
    uint64_t rng_offset,
    float rng_min,
    float rng_max);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_bf16_fill_random(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* bf16_fill_random_fn,
    size_t threadgroup_size,
    size_t max_threadgroups,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    uint64_t num_elements,
    uint64_t rng_seed,
    uint64_t rng_offset,
    float rng_min,
    float rng_max);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_mf4_f32_convert(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* mf4_f32_convert_fn,
    size_t threadgroup_size,
    size_t max_threadgroups,
    const struct gptoss_metal_buffer* block_buffer,
    const struct gptoss_metal_buffer* scale_buffer,
    const struct gptoss_metal_buffer* output_buffer,
    uint64_t num_elements);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_bf16_f32_embeddings(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* bf16_f32_embeddings_fn,
    size_t threadgroup_size,
    const struct gptoss_metal_buffer* token_buffer,
    size_t token_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_channels);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_bf16w_rmsnorm(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_bf16w_rmsnorm_fn,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_channels,
    float epsilon);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_bf16w_matmul(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_bf16w_matmul_fn,
    size_t threadgroup_size,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_cols,
    uint32_t num_rows);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_bf16w_matmul_qkv(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_bf16w_matmul_qkv_fn,
    size_t threadgroup_size,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* kv_buffer,
    size_t kv_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_cols,
    uint32_t num_q_heads,
    uint32_t num_kv_heads,
    uint32_t attn_head_dim,
    uint32_t token_offset,
    uint32_t max_tokens,
    float rope_base,
    float interpolation_scale,
    float yarn_offset,
    float yarn_scale,
    float yarn_multiplier);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_bf16w_matmul_add(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_bf16w_matmul_fn,
    size_t threadgroup_size,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_cols,
    uint32_t num_rows);

enum gptoss_status
gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_qkv(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_bf16w_dense_matmul_fn,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* kv_buffer,
    size_t kv_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_cols,
    uint32_t num_rows,
    uint32_t max_tokens,
    uint32_t token_offset);

enum gptoss_status
gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_attn_output(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_bf16w_dense_matmul_fn,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_cols,
    uint32_t num_rows);

enum gptoss_status
gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_mlp_gate(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_bf16w_dense_matmul_fn,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_cols,
    uint32_t num_rows);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_bf16w_unembedding(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_bf16w_matmul_fn,
    size_t threadgroup_size,
    size_t max_threadgroups,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* argmax_buffer,
    size_t argmax_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_cols,
    uint32_t num_rows);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_mf4w_moe_matmul_swiglu(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_mf4w_moe_matmul_swiglu_fn,
    size_t threadgroup_size,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* expert_buffer,
    size_t expert_offset,
    const struct gptoss_metal_buffer* weight_block_buffer,
    size_t weight_block_offset,
    const struct gptoss_metal_buffer* weight_scale_buffer,
    size_t weight_scale_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    float swiglu_limit,
    uint32_t expert_stride,
    uint32_t num_tokens,
    uint32_t num_active_experts,
    uint32_t num_cols,
    uint32_t num_rows);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_mf4w_moe_matmul(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_mf4w_moe_matmul_fn,
    size_t threadgroup_size,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* expert_buffer,
    size_t expert_offset,
    const struct gptoss_metal_buffer* weight_block_buffer,
    size_t weight_block_offset,
    const struct gptoss_metal_buffer* weight_scale_buffer,
    size_t weight_scale_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t expert_stride,
    uint32_t num_tokens,
    uint32_t num_active_experts,
    uint32_t num_cols,
    uint32_t num_rows);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_rope(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_rope_fn,
    size_t threadgroup_size,
    const struct gptoss_metal_buffer* activations_buffer,
    size_t activations_offset,
    const struct gptoss_metal_buffer* kv_buffer,
    size_t kv_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    float rope_base,
    float interpolation_scale,
    float yarn_offset,
    float yarn_scale,
    float yarn_multiplier,
    uint32_t num_tokens,
    uint32_t num_q_heads,
    uint32_t num_kv_heads,
    uint32_t attn_head_dim,
    uint32_t max_tokens,
    uint32_t token_offset);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_accumulate(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_accumulate_fn,
    size_t threadgroup_size,
    size_t max_threadgroups,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* expert_buffer,
    size_t expert_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_channels,
    uint32_t num_tokens,
    uint32_t num_experts);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_expert_routing_metadata(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* expert_routing_metadata_fn,
    const struct gptoss_metal_buffer* expert_predictions_buffer,
    size_t expert_predictions_offset,
    const struct gptoss_metal_buffer* expert_offsets_buffer,
    size_t expert_offsets_offset,
    const struct gptoss_metal_buffer* intra_expert_offsets_buffer,
    size_t intra_expert_offsets_offset,
    uint32_t num_tokens,
    uint32_t num_experts);


enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_scatter(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_scatter_fn,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* expert_predictions_buffer,
    size_t expert_predictions_offset,
    const struct gptoss_metal_buffer* expert_offsets_buffer,
    size_t expert_offsets_offset,
    const struct gptoss_metal_buffer* intra_expert_offsets_buffer,
    size_t intra_expert_offsets_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    uint32_t num_channels,
    uint32_t num_tokens,
    uint32_t num_active_experts);
    
enum gptoss_status
gptoss_metal_command_buffer_encode_launch_f32_mf4w_moe_dense_matmul_swiglu(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_mf4w_moe_dense_matmul_swiglu_fn,
    const struct gptoss_metal_buffer* expert_offsets_buffer,
    size_t expert_offsets_offset,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_block_buffer,
    size_t weight_block_offset,
    const struct gptoss_metal_buffer* weight_scale_buffer,
    size_t weight_scale_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    float swiglu_limit,
    uint32_t expert_stride_bytes,
    uint32_t num_tokens,
    uint32_t num_experts,
    uint32_t num_cols,
    uint32_t num_rows);
    
enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_mf4w_moe_dense_matmul(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_mf4w_moe_dense_matmul_fn,
    const struct gptoss_metal_buffer* expert_offsets_buffer,
    size_t expert_offsets_offset,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_block_buffer,
    size_t weight_block_offset,
    const struct gptoss_metal_buffer* weight_scale_buffer,
    size_t weight_scale_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    uint32_t expert_stride_bytes,
    uint32_t num_tokens,
    uint32_t num_experts,
    uint32_t num_cols,
    uint32_t num_rows);
    
enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_gather_and_accumulate_e4(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_gather_and_accumulate_e4_fn,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* expert_predictions_buffer,
    size_t expert_predictions_offset,
    const struct gptoss_metal_buffer* expert_offsets_buffer,
    size_t expert_offsets_offset,
    const struct gptoss_metal_buffer* intra_expert_offsets_buffer,
    size_t intra_expert_offsets_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    uint32_t num_channels,
    uint32_t num_tokens,
    uint32_t num_active_experts);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_topk(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_topk_fn,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_experts,
    uint32_t num_active_experts);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_sdpa(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_sdpa_fn,
    const struct gptoss_metal_buffer* q_buffer,
    size_t q_offset,
    const struct gptoss_metal_buffer* kv_buffer,
    size_t kv_offset,
    const struct gptoss_metal_buffer* s_buffer,
    size_t s_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t window,
    uint32_t kv_stride,
    uint32_t num_q_tokens,
    uint32_t num_kv_tokens,
    uint32_t num_q_heads,
    uint32_t num_kv_heads,
    uint32_t head_dim);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_softmax(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_softmax_fn,
    size_t threadgroup_size,
    size_t max_threadgroups,
    const struct gptoss_metal_buffer* score_buffer,
    size_t score_offset,
    const struct gptoss_metal_buffer* argmax_buffer,
    size_t argmax_offset,
    const struct gptoss_metal_buffer* prob_buffer,
    size_t prob_offset,
    const struct gptoss_metal_buffer* sum_buffer,
    size_t sum_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_channels,
    uint32_t num_tokens,
    float temperature,
    uint32_t* num_threadgroups_out,
    uint32_t* num_channels_per_threadgroup_out);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_sample(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_sample_fn,
    size_t min_threadgroup_size,
    const struct gptoss_metal_buffer* prob_buffer,
    size_t prob_offset,
    const struct gptoss_metal_buffer* sum_buffer,
    size_t sum_offset,
    const struct gptoss_metal_buffer* token_buffer,
    size_t token_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint64_t rng_seed,
    uint32_t rng_offset,
    uint32_t num_blocks,
    uint32_t num_channels,
    uint32_t num_channels_per_block);

#ifdef __cplusplus
}  // extern "C"
#endif


================================================
FILE: gpt_oss/metal/source/include/internal/metal.h
================================================
#pragma once

#include <stddef.h>

#include <gpt-oss/types.h>

#ifdef __cplusplus
extern "C" {
#endif

struct gptoss_metal_device {
    void* object; // id<MTLDevice>
    size_t num_cores;
    size_t max_buffer_size;
    size_t max_threadgroup_memory;
    size_t max_threadgroup_threads_x;
    size_t max_threadgroup_threads_y;
    size_t max_threadgroup_threads_z;
};

enum gptoss_status gptoss_metal_device_create_system_default(
    struct gptoss_metal_device* device_out);

enum gptoss_status gptoss_metal_device_release(
    struct gptoss_metal_device* device);


struct gptoss_metal_library {
    void* object; // id<MTLLibrary>
};

enum gptoss_status gptoss_metal_library_create_default(
    const struct gptoss_metal_device* device,
    struct gptoss_metal_library* library_out);

enum gptoss_status gptoss_metal_library_release(
    struct gptoss_metal_library* library);

struct gptoss_metal_function {
    void* function_object; // id<MTLFunction>
    void* pipeline_state_object; // id<MTLComputePipelineState>
    size_t max_threadgroup_threads;
    size_t simdgroup_threads;
    size_t static_threadgroup_memory;
};

enum gptoss_status gptoss_metal_function_create(
    const struct gptoss_metal_library* library,
    const char* name,
    struct gptoss_metal_function* function_out);

enum gptoss_status gptoss_metal_function_release(
    struct gptoss_metal_function* function);

struct gptoss_metal_buffer {
    void* object; // id<MTLBuffer>
    size_t size;
    void* ptr;
};

enum gptoss_status gptoss_metal_buffer_create(
    const struct gptoss_metal_device* device,
    size_t size,
    const void* data,
    struct gptoss_metal_buffer* buffer_out);

enum gptoss_status gptoss_metal_buffer_wrap(
    const struct gptoss_metal_device* device,
    size_t size,
    const void* data,
    struct gptoss_metal_buffer* buffer_out);

enum gptoss_status gptoss_metal_buffer_release(
    struct gptoss_metal_buffer* buffer);

struct gptoss_metal_command_queue {
    void* object; // id<MTLCommandQueue>
};

enum gptoss_status gptoss_metal_command_queue_create(
    const struct gptoss_metal_device* device,
    struct gptoss_metal_command_queue* command_queue_out);

enum gptoss_status gptoss_metal_command_queue_release(
    struct gptoss_metal_command_queue* command_queue);

struct gptoss_metal_command_buffer {
    void* object; // id<MTLCommandBuffer>
};

enum gptoss_status gptoss_metal_command_buffer_create(
    const struct gptoss_metal_command_queue* command_queue,
    struct gptoss_metal_command_buffer* command_buffer_out);

enum gptoss_status gptoss_metal_command_buffer_encode_fill_buffer(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_buffer* buffer,
    size_t offset,
    size_t size,
    uint8_t fill_value);

enum gptoss_status gptoss_metal_command_buffer_encode_copy_buffer(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    size_t size);

enum gptoss_status gptoss_metal_command_buffer_encode_launch_kernel(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* function,
    size_t threadgroup_size_x,
    size_t threadgroup_size_y,
    size_t threadgroup_size_z,
    size_t num_threadgroups_x,
    size_t num_threadgroups_y,
    size_t num_threadgroups_z,
    size_t params_size,
    const void* params,
    size_t num_device_buffers,
    const struct gptoss_metal_buffer** device_buffers,
    const size_t* device_buffer_offsets,
    size_t threadgroup_buffer_size);

enum gptoss_status gptoss_metal_command_buffer_commit(
    const struct gptoss_metal_command_buffer* command_buffer);

enum gptoss_status gptoss_metal_command_buffer_wait_completion(
    const struct gptoss_metal_command_buffer* command_buffer,
    double* elapsed_seconds);

enum gptoss_status gptoss_metal_command_buffer_release(
    struct gptoss_metal_command_buffer* command_buffer);

#ifdef __cplusplus
}  // extern "C"
#endif


================================================
FILE: gpt_oss/metal/source/include/internal/metal.hpp
================================================
#pragma once

#include <array>
#include <initializer_list>
#include <cstring>
#include <stdexcept>
#include <vector>

#include <gpt-oss/types.h>
#include <internal/metal.h>
#include <internal/metal-kernels.h>


namespace gptoss {

inline void Check(gptoss_status s, const char* what) {
    if (s != gptoss_status_success) {
        throw std::runtime_error(what);
    }
}

inline std::size_t round_up(std::size_t p, std::size_t q) {
    const std::size_t r = p % q;
    if (r == 0) {
        return p;
    } else {
        return p - r + q;
    }
}

namespace metal {

class Device {
public:
    inline Device() {
        Check(gptoss_metal_device_create_system_default(&device_), "create Device");
    }

    inline ~Device() {
        gptoss_metal_device_release(&device_);
    }

    Device(const Device&) = delete;
    Device& operator=(const Device&) = delete;

    inline Device(Device&& other) noexcept {
        device_ = other.device_;
        std::memset(&other.device_, 0, sizeof(other.device_));
    }

    inline Device& operator=(Device&& other) noexcept {
        if (this != &other) {
            gptoss_metal_device_release(&device_);
            device_ = other.device_;
            std::memset(&other.device_, 0, sizeof(other.device_));
        }
        return *this;
    }

    inline const gptoss_metal_device* handle() const noexcept { return &device_; }

    inline size_t max_buffer_size() const noexcept { return device_.max_buffer_size; }
    inline size_t max_threadgroup_memory() const noexcept { return device_.max_threadgroup_memory; }
    inline size_t max_threadgroup_threads_x() const noexcept { return device_.max_threadgroup_threads_x; }
    inline size_t max_threadgroup_threads_y() const noexcept { return device_.max_threadgroup_threads_y; }
    inline size_t max_threadgroup_threads_z() const noexcept { return device_.max_threadgroup_threads_z; }

private:
    gptoss_metal_device device_{};
};

class Library {
public:
    inline explicit Library(const Device& dev) {
        Check(gptoss_metal_library_create_default(dev.handle(), &library_),
            "gptoss_metal_library_create_default");
    }

    inline ~Library() {
        gptoss_metal_library_release(&library_);
    }

    Library(const Library&) = delete;
    Library& operator=(const Library&) = delete;

    inline Library(Library&& other) noexcept {
        library_ = other.library_;
        std::memset(&other.library_, 0, sizeof(other.library_));
    }

    inline Library& operator=(Library&& other) noexcept {
        if (this != &other) {
            gptoss_metal_library_release(&library_);
            library_ = other.library_;
            std::memset(&other.library_, 0, sizeof(other.library_));
        }
        return *this;
    }

    inline const gptoss_metal_library* handle() const noexcept {
        return &library_;
    }

private:
    gptoss_metal_library library_{};
};

class Function {
public:
    inline Function(const Library& library, const char* name) {
        Check(gptoss_metal_function_create(library.handle(), name, &function_),
            "gptoss_metal_function_create");
    }

    inline ~Function() {
        gptoss_metal_function_release(&function_);
    }

    Function(const Function&) = delete;
    Function& operator=(const Function&) = delete;

    inline Function(Function&& other) noexcept {
        function_ = other.function_;
        std::memset(&other.function_, 0, sizeof(other.function_));
    }

    inline Function& operator=(Function&& other) noexcept {
        if (this != &other) {
            gptoss_metal_function_release(&function_);
            function_ = other.function_;
            std::memset(&other.function_, 0, sizeof(other.function_));
        }
        return *this;
    }

    inline const gptoss_metal_function* handle() const noexcept { return &function_; }

    inline size_t max_threadgroup_threads() const noexcept { return function_.max_threadgroup_threads; }
    inline size_t simdgroup_threads() const noexcept { return function_.simdgroup_threads; }
    inline size_t static_threadgroup_memory() const noexcept { return function_.static_threadgroup_memory; }

private:
    gptoss_metal_function function_{};
};

class Buffer {
public:
    inline Buffer(const Device& dev, size_t size, const void* data = nullptr) {
        Check(gptoss_metal_buffer_create(dev.handle(), size, data, &buffer_), "create buffer");
    }

    inline ~Buffer() {
        gptoss_metal_buffer_release(&buffer_);
    }

    Buffer(const Buffer&) = delete;
    Buffer& operator=(const Buffer&) = delete;

    inline Buffer(Buffer&& other) noexcept {
        buffer_ = other.buffer_;
        std::memset(&other.buffer_, 0, sizeof(other.buffer_));
    }

    inline Buffer& operator=(Buffer&& other) noexcept {
        if (this != &other) {
            gptoss_metal_buffer_release(&buffer_);
            buffer_ = other.buffer_;
            std::memset(&other.buffer_, 0, sizeof(other.buffer_));
        }
        return *this;
    }

    inline size_t size() const noexcept { return buffer_.size; }
    inline void* ptr() const noexcept { return buffer_.ptr; }

    inline const gptoss_metal_buffer* handle() const noexcept { return &buffer_; }

private:
    gptoss_metal_buffer buffer_{};
};

class CommandQueue {
public:
    inline explicit CommandQueue(const Device& dev) {
        Check(gptoss_metal_command_queue_create(dev.handle(), &command_queue_),
            "gptoss_metal_command_queue_create");
    }

    inline ~CommandQueue() {
        gptoss_metal_command_queue_release(&command_queue_);
    }

    CommandQueue(const CommandQueue&) = delete;
    CommandQueue& operator=(const CommandQueue&) = delete;

    inline CommandQueue(CommandQueue&& other) noexcept {
        command_queue_ = other.command_queue_;
        std::memset(&other.command_queue_, 0, sizeof(other.command_queue_));
    }

    inline CommandQueue& operator=(CommandQueue&& other) noexcept {
        if (this != &other) {
            gptoss_metal_command_queue_release(&command_queue_);
            command_queue_ = other.command_queue_;
            std::memset(&other.command_queue_, 0, sizeof(other.command_queue_));
        }
        return *this;
    }

    inline const gptoss_metal_command_queue* handle() const noexcept {
        return &command_queue_;
    }

private:
    gptoss_metal_command_queue command_queue_{};
};

class CommandBuffer {
public:
    inline explicit CommandBuffer(const CommandQueue& command_queue) {
        Check(gptoss_metal_command_buffer_create(command_queue.handle(), &command_buffer_),
            "gptoss_metal_command_buffer_create");
    }
    inline ~CommandBuffer() {
        gptoss_metal_command_buffer_release(&command_buffer_);
    }

    CommandBuffer(const CommandBuffer&)            = delete;
    CommandBuffer& operator=(const CommandBuffer&) = delete;

    inline CommandBuffer(CommandBuffer&& other) noexcept  {
        command_buffer_ = other.command_buffer_;
        std::memset(&other.command_buffer_, 0, sizeof(other.command_buffer_));
    }

    inline CommandBuffer& operator=(CommandBuffer&& other) noexcept {
        if (this != &other) {
            gptoss_metal_command_buffer_release(&command_buffer_);
            command_buffer_ = other.command_buffer_;
            std::memset(&other.command_buffer_, 0, sizeof(other.command_buffer_));
        }
        return *this;
    }

    inline void encode_launch_kernel(const Function& function,
                                     const std::array<size_t, 3>& threadgroup_size,
                                     const std::array<size_t, 3>& num_threadgroups,
                                     size_t params_size, const void* params,
                                     std::initializer_list<const Buffer*> device_buffers = {},
                                     size_t threadgroup_buffer_size = 0)
    {
        std::vector<const gptoss_metal_buffer*> buffer_handles(device_buffers.size());
        std::transform(device_buffers.begin(), device_buffers.end(), buffer_handles.begin(),
            [](const Buffer* buffer) -> const gptoss_metal_buffer* { return buffer->handle(); });
        Check(gptoss_metal_command_buffer_encode_launch_kernel(
                &command_buffer_, function.handle(),
                threadgroup_size[0], threadgroup_size[1], threadgroup_size[2],
                num_threadgroups[0], num_threadgroups[1], num_threadgroups[2],
                params_size, params,
                buffer_handles.size(),
                buffer_handles.data(),
                /*buffer_offsets=*/nullptr,
                threadgroup_buffer_size),
            "gptoss_metal_command_buffer_encode_launch_kernel");
    }

    inline void encode_launch_f32_fill_random(const Function& f32_fill_random_fn,
                                              size_t threadgroup_size,
                                              size_t num_threadgroups,
                                              const Buffer& output_buffer,
                                              size_t output_offset,
                                              size_t num_channels,
                                              uint64_t rng_seed,
                                              uint64_t rng_offset,
                                              float rng_min,
                                              float rng_max)
    {
        Check(gptoss_metal_command_buffer_encode_launch_f32_fill_random(
                &command_buffer_, f32_fill_random_fn.handle(),
                threadgroup_size, num_threadgroups,
                output_buffer.handle(), output_offset,
                num_channels,
                rng_seed, rng_offset, rng_min, rng_max),
            "gptoss_metal_command_buffer_encode_launch_f32_fill_random");
    }

    inline void encode_launch_bf16_fill_random(const Function& bf16_fill_random_fn,
                                               size_t threadgroup_size,
                                               size_t num_threadgroups,
                                               const Buffer& output_buffer,
                                               size_t output_offset,
                                               size_t num_channels,
                                               uint64_t rng_seed,
                                               uint64_t rng_offset,
                                               float rng_min,
                                               float rng_max)
    {
        Check(gptoss_metal_command_buffer_encode_launch_bf16_fill_random(
                &command_buffer_, bf16_fill_random_fn.handle(),
                threadgroup_size, num_threadgroups,
                output_buffer.handle(), output_offset,
                num_channels,
                rng_seed, rng_offset, rng_min, rng_max),
            "gptoss_metal_command_buffer_encode_launch_bf16_fill_random");
    }

    inline void encode_launch_u32_fill_random(const Function& u32_fill_random_fn,
                                              size_t threadgroup_size,
                                              size_t num_threadgroups,
                                              const Buffer& output_buffer,
                                              size_t output_offset,
                                              size_t num_channels,
                                              uint64_t rng_seed,
                                              uint64_t rng_offset)
    {
        Check(gptoss_metal_command_buffer_encode_launch_u32_fill_random(
                &command_buffer_, u32_fill_random_fn.handle(),
                threadgroup_size, num_threadgroups,
                output_buffer.handle(), output_offset,
                num_channels,
                rng_seed, rng_offset),
            "gptoss_metal_command_buffer_encode_launch_u32_fill_random");
    }

    inline void commit() {
        Check(gptoss_metal_command_buffer_commit(&command_buffer_), "commit");
    }

    inline double wait_completion() {
        double secs = 0.0;
        Check(gptoss_metal_command_buffer_wait_completion(&command_buffer_, &secs), "wait completion");
        return secs;
    }

    inline const gptoss_metal_command_buffer* handle() const noexcept { return &command_buffer_; }

private:
    gptoss_metal_command_buffer command_buffer_{};
};

} // namespace metal
} // namespace gptoss


================================================
FILE: gpt_oss/metal/source/include/internal/model.h
================================================
#pragma once

#ifndef __cplusplus
    #include <stdatomic.h>
#endif
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include "internal/metal.h"


struct gptoss_tokenizer {
#ifndef __cplusplus
    atomic_uint_least64_t ref_count;
#else
    uint_least64_t ref_count;
#endif

    void* mapping_ptr;
    size_t mapping_size;

    const char* regex_ptr;
    const char* tokens_ptr;

    uint32_t num_text_tokens;
    uint32_t num_special_tokens;

    uint32_t special_token_id[gptoss_special_token_max - 1];
};

struct gptoss_model {
#ifndef __cplusplus
    atomic_uint_least64_t ref_count;
#else
    uint_least64_t ref_count;
#endif

    struct gptoss_tokenizer* tokenizer;

    void* mapping_ptr;
    size_t mapping_size;

    uint32_t context_length;
    uint32_t num_blocks;
    uint32_t num_experts;
    uint32_t num_active_experts;
    uint32_t embedding_dim;
    uint32_t mlp_dim;
    float swiglu_limit;
    uint32_t head_dim;
    uint32_t num_heads;
    uint32_t num_kv_heads;
    uint32_t attention_window;
    float rope_theta;
    float interpolation_scale;
    float yarn_offset;
    float yarn_scale;
    float yarn_multiplier;
    float rmsnorm_epsilon;

    uint32_t vocabulary_size;

    bool lock_memory;

    size_t weights_size;
    size_t allocation_size;

    // Metal objects
    struct gptoss_metal_device device;
    size_t max_threadgroups;
    struct gptoss_metal_command_queue command_queue;
    struct gptoss_metal_library library;
    struct gptoss_metal_function bf16_f32_embeddings_fn;
    struct gptoss_metal_function f32_bf16w_rmsnorm_fn;
    struct gptoss_metal_function f32_bf16w_matmul_fn;
    struct gptoss_metal_function f32_bf16w_matmul_qkv_fn;
    struct gptoss_metal_function f32_bf16w_dense_matmul_qkv_fn;
    struct gptoss_metal_function f32_bf16w_dense_matmul_attn_output_fn;
    struct gptoss_metal_function f32_bf16w_dense_matmul_mlp_gate_fn;
    struct gptoss_metal_function f32_bf16w_unembedding_fn;
    struct gptoss_metal_function f32_rope_fn;
    struct gptoss_metal_function f32_mf4w_moe_matmul_swiglu_fn;
    struct gptoss_metal_function f32_mf4w_moe_matmul_fn;
    struct gptoss_metal_function f32_accumulate_e4_fn;
    struct gptoss_metal_function f32_scatter_e4_fn;
    struct gptoss_metal_function f32_mf4w_moe_dense_matmul_swiglu_fn;
    struct gptoss_metal_function f32_mf4w_moe_dense_matmul_fn;
    struct gptoss_metal_function f32_gather_and_accumulate_e4_fn;
    struct gptoss_metal_function f32_expert_routing_metadata_fn;
    struct gptoss_metal_function f32_topk_softmax_e32_k4_fn;
    struct gptoss_metal_function f32_topk_softmax_e128_k4_fn;
    struct gptoss_metal_function f32_sdpa_q8_d64_fn;
    struct gptoss_metal_function f32_softmax_fn;
    struct gptoss_metal_function f32_sample_fn;

    size_t per_block_shared_weights_size;
    size_t per_expert_block_weight_size;

    size_t embeddings_threadgroup_size;
    size_t attn_qkv_threadgroup_size;
    size_t attn_out_threadgroup_size;
    size_t mlp_gate_threadgroup_size;
    size_t mlp_swiglu_threadgroup_size;
    size_t mlp_out_threadgroup_size;
    size_t mlp_acc_threadgroup_size;
    size_t unembedding_threadgroup_size;

    size_t attn_rmsnorm_gain_offset;
    size_t attn_qkv_weight_offset;
    size_t attn_qkv_bias_offset;
    size_t attn_sdpa_sink_offset;
    size_t attn_out_weight_offset;
    size_t attn_out_bias_offset;
    size_t mlp_rmsnorm_gain_offset;
    size_t mlp_gate_weight_offset;
    size_t mlp_gate_bias_offset;
    size_t mlp_swiglu_scale_offset;
    size_t mlp_swiglu_bias_offset;
    size_t mlp_out_block_offset;
    size_t mlp_out_scale_offset;
    size_t mlp_out_bias_offset;
    size_t rmsnorm_weight_offset;
    size_t unembedding_weight_offset;

    // Buffer with non-MoE weights. Includes MoE gates, embeddings/unembeddings.
    struct gptoss_metal_buffer shared_weight_buffer;
    // num_blocks per-block buffers with MoE weights to follow.
    struct gptoss_metal_buffer block_weight_buffers[];
};

#define GPTOSS_DEFAULT_BATCH_SIZE 128

struct gptoss_context {
#ifndef __cplusplus
    atomic_uint_least64_t ref_count;
#else
    uint_least64_t ref_count;
#endif

    struct gptoss_model* model;
    // Number of tokens processed in the context.
    size_t num_tokens;
    // Number of tokens in the KV cache.
    size_t num_kv_tokens;
    // Length of the context.
    size_t max_tokens;
    // Maximum number of tokens that can be processed in a single batch.
    // Activation buffers are allocated with this size.
    size_t max_batch_tokens;


    size_t kvcache_size;
    size_t allocation_size;

    // Activation buffers.
    // TODO: merge into a single buffer.
    struct gptoss_metal_buffer residual_activation_buffer;  // Residual stream
    struct gptoss_metal_buffer rmsnorm_activation_buffer;  // Both attention & MLP RMSNorm output
    struct gptoss_metal_buffer qkv_activation_buffer;  // QKV projection output
    struct gptoss_metal_buffer sdpa_activation_buffer;  // SDPA output
    struct gptoss_metal_buffer gate_activation_buffer;  // MoE gating output
    struct gptoss_metal_buffer expert_activation_buffer;  // MoE expert predictions
    struct gptoss_metal_buffer expert_offset_buffer; // MoE expert histograms cumsum
    struct gptoss_metal_buffer token_to_expert_routing_buffer; // MoE token to expert routing
    struct gptoss_metal_buffer swiglu_input_buffer; // MLP+SwiGLU input for prefill.
    struct gptoss_metal_buffer swiglu_activation_buffer;  // MLP+SwiGLU output
    struct gptoss_metal_buffer moe_activation_buffer;  // MoE MLP output (per-active expert)

    // Input/output buffers.
    struct gptoss_metal_buffer control_buffer;
    struct gptoss_metal_buffer token_buffer;  // uint32 token IDs
    struct gptoss_metal_buffer score_buffer;  // unembedding outputs
    struct gptoss_metal_buffer prob_buffer;
    struct gptoss_metal_buffer sum_buffer;
    struct gptoss_metal_buffer argmax_buffer;
    struct gptoss_metal_buffer kvcache_buffer;
};


================================================
FILE: gpt_oss/metal/source/include/internal/rng.h
================================================
#pragma once

#include <stdint.h>

inline static uint32_t rng_squares32(uint64_t offset, uint64_t seed) {
    const uint64_t y = offset * seed;
    const uint64_t z = y + seed;

    /* Round 1 */
    uint64_t x = y * y + y;
    x = (x >> 32) | (x << 32);

    /* Round 2 */
    x = x * x + z;
    x = (x >> 32) | (x << 32);

    /* Round 3 */
    x = x * x + y;
    x = (x >> 32) | (x << 32);

    /* Round 4 */
    x = x * x + z;
    return (uint32_t) (x >> 32);
}


================================================
FILE: gpt_oss/metal/source/include/internal/rng.hpp
================================================
#pragma once

#include <cstdint>

namespace gptoss {

namespace rng {

inline static std::uint32_t squares32(std::uint64_t offset, std::uint64_t seed) {
    const std::uint64_t y = offset * seed;
    const std::uint64_t z = y + seed;

    /* Round 1 */
    std::uint64_t x = y * y + y;
    x = (x >> 32) | (x << 32);

    /* Round 2 */
    x = x * x + z;
    x = (x >> 32) | (x << 32);

    /* Round 3 */
    x = x * x + y;
    x = (x >> 32) | (x << 32);

    /* Round 4 */
    x = x * x + z;
    return static_cast<uint32_t>(x >> 32);
}

}  // namespace rng

}  // namespace gptoss


================================================
FILE: gpt_oss/metal/source/include/internal/storage.h
================================================
#pragma once

#include <stdbool.h>
#include <stdint.h>

struct gptoss_file_header {
    char magic[12];
    uint32_t zero;
};

struct gptoss_gptoss_model_header {
    uint32_t context_length;
    uint32_t num_blocks;
    uint32_t num_experts;
    uint32_t num_active_experts;
    uint32_t embedding_dim;
    uint32_t mlp_dim;
    float swiglu_limit;
    uint32_t head_dim;
    uint32_t num_heads;
    uint32_t num_kv_heads;
    uint32_t attention_window;
    float rope_theta;
    float interpolation_scale;
    float yarn_offset;
    float yarn_scale;
    float yarn_multiplier;
    float rmsnorm_epsilon;
};

struct gptoss_tiktoken_tokenizer_header {
    uint32_t num_special_tokens;
    uint32_t num_text_tokens;
    uint32_t regex_size;
    uint32_t tokens_size;
};


================================================
FILE: gpt_oss/metal/source/include/internal/uuid.h
================================================
#pragma once

#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#include "internal/macros.h"


struct GPTOSS_DENSELY_PACKED_STRUCTURE gptoss_uuid {
    uint8_t bytes[16];
};
static_assert(sizeof(struct gptoss_uuid) == 16, "UUID size is not 16 bytes");


#define UUID_FORMAT "%02X%02X%02X%02X-%02X%02X-%02X%02X-%02X%02X-%02X%02X%02X%02X%02X%02X"
#define UUID_ARGS(uuid) (uuid).bytes[0], (uuid).bytes[1], (uuid).bytes[2], (uuid).bytes[3], \
    (uuid).bytes[4], (uuid).bytes[5], (uuid).bytes[6], (uuid).bytes[7], (uuid).bytes[8], (uuid).bytes[9], \
    (uuid).bytes[10], (uuid).bytes[11], (uuid).bytes[12], (uuid).bytes[13], (uuid).bytes[14], (uuid).bytes[15]

static inline bool gptoss_is_gptoss_model_uuid(const struct gptoss_uuid* uuid) {
    return memcmp(
        &(struct gptoss_uuid) {0xDF, 0x52, 0xDC, 0x86, 0x17, 0x89, 0x4E, 0xD0, 0xA2, 0x95, 0x66, 0xF1, 0x05, 0x08, 0x14, 0x5B},
        uuid,
        sizeof(struct gptoss_uuid)) == 0;
}

static inline bool gptoss_is_applegpu_layout_uuid(const struct gptoss_uuid* uuid) {
    return memcmp(
        &(struct gptoss_uuid) {0x22, 0x91, 0x77, 0xA8, 0x57, 0x75, 0x42, 0x68, 0xBF, 0xD8, 0xD5, 0x88, 0xB3, 0x51, 0xC5, 0x6D},
        uuid,
        sizeof(struct gptoss_uuid)) == 0;
}

static inline bool gptoss_is_tiktoken_tokenizer_uuid(const struct gptoss_uuid* uuid) {
    return memcmp(
        &(struct gptoss_uuid) {0x74, 0x01, 0xAD, 0xED, 0x2A, 0x95, 0x40, 0xCB, 0xB7, 0x82, 0x9C, 0xCE, 0xBA, 0xAF, 0xE7, 0x2B},
        uuid,
        sizeof(struct gptoss_uuid)) == 0;
}

static inline enum gptoss_special_token gptoss_special_token_decode_uuid(const struct gptoss_uuid* uuid) {
    if (memcmp(
        &(struct gptoss_uuid) {0x55, 0xA7, 0x7C, 0x2F, 0x8A, 0x01, 0x4C, 0x54, 0x8A, 0xC2, 0x31, 0x3B, 0xFC, 0x7E, 0x20, 0x8D},
        uuid,
        sizeof(struct gptoss_uuid)) == 0)
    {
        return gptoss_special_token_start;
    } else if (memcmp(
        &(struct gptoss_uuid) {0x16, 0xE4, 0x04, 0x31, 0xF4, 0x7F, 0x4B, 0x22, 0xB5, 0x9B, 0x8B, 0x27, 0x8F, 0xC3, 0x0A, 0x54},
        uuid,
        sizeof(struct gptoss_uuid)) == 0)
    {
        return gptoss_special_token_message;
    } else if (memcmp(
        &(struct gptoss_uuid) {0xFC, 0xAC, 0x2F, 0x6D, 0x47, 0x05, 0x4F, 0x6B, 0xB2, 0x28, 0x64, 0x2A, 0xCC, 0xAC, 0x72, 0x38},
        uuid,
        sizeof(struct gptoss_uuid)) == 0)
    {
        return gptoss_special_token_end;
    } else if (memcmp(
        &(struct gptoss_uuid) {0xF7, 0x99, 0xFF, 0x69, 0x19, 0x92, 0x43, 0xC4, 0xA3, 0xD8, 0xD8, 0x31, 0xF4, 0x75, 0xDC, 0x75},
        uuid,
        sizeof(struct gptoss_uuid)) == 0)
    {
        return gptoss_special_token_return;
    } else if (memcmp(
        &(struct gptoss_uuid) {0xE1, 0x5B, 0xA7, 0x02, 0x28, 0xC4, 0x42, 0x92, 0xAB, 0x8F, 0xFF, 0xA4, 0x34, 0x70, 0x91, 0x28},
        uuid,
        sizeof(struct gptoss_uuid)) == 0)
    {
        return gptoss_special_token_refusal;
    } else if (memcmp(
        &(struct gptoss_uuid) {0xC0, 0xBB, 0x14, 0xC7, 0x60, 0x22, 0x49, 0xDA, 0xAD, 0x08, 0x79, 0x2D, 0x67, 0xE8, 0xB4, 0x70},
        uuid,
        sizeof(struct gptoss_uuid)) == 0)
    {
        return gptoss_special_token_constrain;
    } else if (memcmp(
        &(struct gptoss_uuid) {0xFD, 0x3D, 0xDA, 0x11, 0xC8, 0xAB, 0x40, 0x33, 0x87, 0x6E, 0xD9, 0x3D, 0xEB, 0x17, 0x2C, 0x93},
        uuid,
        sizeof(struct gptoss_uuid)) == 0)
    {
        return gptoss_special_token_channel;
    } else if (memcmp(
        &(struct gptoss_uuid) {0x12, 0x20, 0xF7, 0x96, 0xE3, 0x88, 0x4D, 0xE5, 0xB4, 0x87, 0xFE, 0x2E, 0xB5, 0xFE, 0x03, 0xC0},
        uuid,
        sizeof(struct gptoss_uuid)) == 0)
    {
        return gptoss_special_token_call;
    } else if (memcmp(
        &(struct gptoss_uuid) {0x07, 0xD7, 0xDA, 0x55, 0xB3, 0x46, 0x4C, 0xFF, 0x8B, 0x37, 0x7C, 0xEF, 0xAC, 0xF8, 0xA3, 0xE8},
        uuid,
        sizeof(struct gptoss_uuid)) == 0)
    {
        return gptoss_special_token_untrusted;
    } else if (memcmp(
        &(struct gptoss_uuid) {0xF2, 0x65, 0xBD, 0x9C, 0xC7, 0x17, 0x46, 0x9E, 0xA4, 0x47, 0x92, 0x06, 0x87, 0xD6, 0x5D, 0x90},
        uuid,
        sizeof(struct gptoss_uuid)) == 0)
    {
        return gptoss_special_token_end_untrusted;
    } else if (memcmp(
        &(struct gptoss_uuid) {0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
        uuid,
        sizeof(struct gptoss_uuid)) == 0)
    {
        // Suppress warning
        return gptoss_special_token_invalid;
    } else {
        GPTOSS_LOG_WARNING("unsupported special token " UUID_FORMAT, UUID_ARGS(*uuid));
        return gptoss_special_token_invalid;
    }
}


================================================
FILE: gpt_oss/metal/source/log.c
================================================
#include <assert.h>  // assert
#include <stdarg.h>  // va_list, va_copy, va_end
#include <stdio.h>  // vsnprintf
#include <stdlib.h>  // malloc, free

#include <unistd.h>  // STDERR_FILENO


#define GPTOSS_ON_STACK_FORMAT_BUFFER_SIZE 16384

void gptoss_format_log(const char* format, va_list args) {
    char stack_buffer[GPTOSS_ON_STACK_FORMAT_BUFFER_SIZE];
    char* heap_buffer = NULL;

    va_list args_copy;
    va_copy(args_copy, args);

    const int vsnprintf_result = vsnprintf(stack_buffer, GPTOSS_ON_STACK_FORMAT_BUFFER_SIZE, format, args);
    assert(vsnprintf_result >= 0);

    // At least a partially formatted buffer is ready.
    char* message_buffer = &stack_buffer[0];
    size_t message_size = (size_t) vsnprintf_result;
    if (message_size > GPTOSS_ON_STACK_FORMAT_BUFFER_SIZE) {
        heap_buffer = malloc(message_size);
        if (heap_buffer == NULL) {
            // Fall back to the truncated message in the on-stack buffer.
            message_size = GPTOSS_ON_STACK_FORMAT_BUFFER_SIZE;
        } else {
            // Use the full message in the in-heap buffer.
            vsnprintf(heap_buffer, message_size, format, args_copy);
            message_buffer = heap_buffer;
        }
    }

    ssize_t bytes_written;
    do {
        bytes_written = write(STDERR_FILENO, message_buffer, message_size);
        if (bytes_written > 0) {
            assert((size_t) bytes_written <= message_size);
            message_buffer += bytes_written;
            message_size -= bytes_written;
        }
    } while (bytes_written >= 0 && message_size != 0);

cleanup:
    free(heap_buffer);
    va_end(args_copy);
}


================================================
FILE: gpt_oss/metal/source/matmul.metal
================================================
#include <metal_atomic>
#include <metal_compute>
#include <metal_integer>
#include <metal_math>
#include <metal_simdgroup>
#include <metal_stdlib>

#include <internal/kernel-args.h>

#pragma METAL fp math_mode(safe)
#pragma METAL fp contract(off)


// Each simdgroup reduces all channels of the input and computes a single channel of the output
// + Efficient synchronization
// + Sequential memory access within a warp
// Each threadgroup computes (simdgroups_per_threadgroup) consecutive output channels
// + Reuse input vector from threadgroup memory
// + Avoid synchronization across warps when doing reduction

kernel void gptoss_f32_bf16w_matmul(
    constant gptoss_matmul_args& args [[ buffer(0) ]],
    const device float4* input [[ buffer(1) ]],
    const device bfloat4* weight [[ buffer(2) ]],
    const device bfloat* bias [[ buffer(3) ]],
    device float* output [[ buffer(4) ]],
    const device gptoss_control* control [[ buffer(5) ]],
    uint2 gid [[threadgroup_position_in_grid]],
    uint simdgroup_tid [[thread_index_in_simdgroup]],
    uint simdgroup_idx [[simdgroup_index_in_threadgroup]],
    uint num_simdgroups [[simdgroups_per_threadgroup]])
{
    const uint simdgroup_size = 32;
    if (control->abort != 0) {
        return;
    }

    const uint num_column_vecs = args.num_column_vecs;
    const uint row = gid.x * num_simdgroups + simdgroup_idx;

    input += gid.y * num_column_vecs + simdgroup_tid;
    weight += num_column_vecs * row + simdgroup_tid;
    bias += row;
    output += gid.y * args.num_rows + row;

    uint num_iter = (num_column_vecs - simdgroup_tid + (simdgroup_size - 1)) / simdgroup_size;

    float4 sum4 = 0.0f;
    do {
        const bfloat4 w = *weight;
        const float4 i = *input;
        sum4 = metal::fma(static_cast<float4>(w), i, sum4);

        weight += simdgroup_size;
        input += simdgroup_size;
    } while (--num_iter != 0);
    const float2 sum2 = sum4.xy + sum4.zw;
    float sum = sum2.x + sum2.y;
    sum = metal::simd_sum(sum);
    if (metal::simd_is_first()) {
        sum += static_cast<float>(*bias);
        if (args.add) {
            *output += sum;
        } else {
            *output = sum;
        }
    }
}

kernel void gptoss_f32_bf16w_matmul_qkv(
    constant gptoss_qkv_args& args [[ buffer(0) ]],
    const device float4* input [[ buffer(1) ]],
    const device bfloat4* weight [[ buffer(2) ]],
    const device bfloat* bias [[ buffer(3) ]],
    device float* q [[ buffer(4) ]],
    device float* kv [[ buffer(5) ]],
    const device gptoss_control* control [[ buffer(6) ]],
    threadgroup void* scratch [[ threadgroup(0) ]],
    uint2 gid [[threadgroup_position_in_grid]],
    uint simdgroup_tid [[thread_index_in_simdgroup]],
    uint simdgroup_idx [[simdgroup_index_in_threadgroup]],
    uint num_simdgroups [[simdgroups_per_threadgroup]])
{
    const uint simdgroup_size = 32;
    const uint head_dim = 64;
    const uint num_q_heads = 64;
    const uint num_kv_heads = 8;
    if (control->abort != 0) {
        return;
    }

    const uint num_column_vecs = args.num_column_vecs;
    const uint row = gid.x * num_simdgroups + simdgroup_idx;

    input += gid.y * num_column_vecs + simdgroup_tid;
    weight += num_column_vecs * row + simdgroup_tid;
    bias += row;
    q += gid.y * args.num_rows;

    uint num_iter = (num_column_vecs - simdgroup_tid + (simdgroup_size - 1)) / simdgroup_size;

    float4 sum4 = 0.0f;
    do {
        const bfloat4 w = *weight;
        const float4 i = *input;
        sum4 = metal::fma(static_cast<float4>(w), i, sum4);

        weight += simdgroup_size;
        input += simdgroup_size;
    } while (--num_iter != 0);
    const float2 sum2 = sum4.xy + sum4.zw;
    float sum = sum2.x + sum2.y;
    sum = metal::simd_sum(sum);
    if (metal::simd_is_first()) {
        sum += static_cast<float>(*bias);
        static_cast<threadgroup float*>(scratch)[simdgroup_idx] = sum;
    }
    metal::threadgroup_barrier(metal::mem_flags::mem_threadgroup);
    if (simdgroup_idx == 0) {
        const uint num_half_simdgroups = num_simdgroups / 2;
        if (simdgroup_tid < num_half_simdgroups) {
            float2 vals = static_cast<const threadgroup float2*>(scratch)[simdgroup_tid];
            const uint idx = gid.x * num_half_simdgroups + simdgroup_tid;
            const uint head_idx = idx / (head_dim / 2);
            const uint token_idx = args.token_offset + gid.y;
            const uint dim_idx = idx % (head_dim / 2);
            if (head_idx < num_q_heads + num_kv_heads) {
                const float dim_idx_val = static_cast<float>(dim_idx);
                const float inv_extrapolation_freq = metal::precise::exp(dim_idx_val * args.freq_scale);
                const float inv_interpolation_freq = inv_extrapolation_freq * args.interpolation_scale;
                const float alpha = metal::saturate(metal::fma(dim_idx_val, args.yarn_scale, args.yarn_offset));
                const float inv_freq = metal::mix(inv_extrapolation_freq, inv_interpolation_freq, alpha);

                const float phi = static_cast<float>(token_idx) * inv_freq;
                const float yarn_multiplier = args.yarn_multiplier;
                float cosphi;
                const float sinphi = metal::precise::sincos(phi, cosphi) * yarn_multiplier;
                cosphi *= yarn_multiplier;

                vals = (float2) {
                    vals.x * cosphi - vals.y * sinphi,
                    vals.x * sinphi + vals.y * cosphi,
                };
            }
            if (head_idx < num_q_heads) {
                reinterpret_cast<device float2*>(q)[idx] = vals;
            } else if (head_idx < num_q_heads + num_kv_heads) {
                const uint h = head_idx - num_q_heads;
                reinterpret_cast<device float2*>(kv + (h * args.max_tokens + token_idx) * 2 * head_dim)[dim_idx] = vals;
            } else {
                const uint h = head_idx - num_q_heads - num_kv_heads;
                reinterpret_cast<device float2*>(kv + (h * args.max_tokens + token_idx) * 2 * head_dim + head_dim)[dim_idx] = vals;
            }
        }
    }
}

kernel void gptoss_f32_bf16w_unembedding(
    constant gptoss_unembedding_args& args [[ buffer(0) ]],
    const device float4* input [[ buffer(1) ]],
    const device bfloat4* weight [[ buffer(2) ]],
    device float* output [[ buffer(3) ]],
    device metal::atomic_ulong* argmax [[ buffer(4) ]],
    const device gptoss_control* control [[ buffer(5) ]],
    uint2 gid [[threadgroup_position_in_grid]],
    uint simdgroup_tid [[thread_index_in_simdgroup]],
    uint simdgroup_idx [[simdgroup_index_in_threadgroup]],
    uint num_simdgroups [[simdgroups_per_threadgroup]])
{
    const uint simdgroup_size = 32;
    threadgroup uint2 threadgroup_buffer[32];
    if (control->abort != 0) {
        return;
    }

    const uint num_column_vecs = args.num_column_vecs;
    const uint row_start = gid.x * args.num_rows_per_threadgroup + simdgroup_idx;
    const uint row_end = metal::min(gid.x * args.num_rows_per_threadgroup + args.num_rows_per_threadgroup, args.num_rows);
    const uint num_iter = (num_column_vecs - simdgroup_tid + (simdgroup_size - 1)) / simdgroup_size;

    input += gid.y * num_column_vecs + simdgroup_tid;
    weight += num_column_vecs * row_start + simdgroup_tid;
    output += gid.y * args.num_rows + row_start;

    uint2 row_sum{0xFFFFFFFFul, 0xFFFFFFFFul};
    for (uint row = row_start; row < row_end; row += num_simdgroups) {
        uint n = num_iter;

        float4 sum4 = 0.0f;
        do {
            const bfloat4 w = *weight;
            const float4 i = *input;

            sum4 = metal::fma(static_cast<float4>(w), i, sum4);

            weight += simdgroup_size;
            input += simdgroup_size;
        } while (--n != 0);
        input -= num_iter * simdgroup_size;
        weight -= num_iter * simdgroup_size;

        const float2 sum2 = sum4.xy + sum4.zw;
        float sum = sum2.x + sum2.y;
        sum = metal::simd_sum(sum);
        uint sum_bits = as_type<uint>(sum);
        if (static_cast<int>(sum_bits) >= 0) {
            sum_bits ^= 0x7FFFFFFFu;
        }
        row_sum = as_type<uint2>(metal::min(as_type<ulong>(row_sum), as_type<ulong>(uint2{row, sum_bits})));
        if (metal::simd_is_first()) {
            *output = sum;
        }

        weight += num_column_vecs * num_simdgroups;
        output += num_simdgroups;
    }
    if (metal::simd_is_first()) {
        threadgroup_buffer[simdgroup_idx] = row_sum;
    }
    metal::threadgroup_barrier(metal::mem_flags::mem_threadgroup);
    if (simdgroup_idx == 0) {
        // Min-Reduce threadgroup_buffer
        if (simdgroup_tid < num_simdgroups) {
            row_sum = threadgroup_buffer[simdgroup_tid];
        }
        const uint sum_bits = row_sum.y;
        const uint sum_bits_min = metal::simd_min(sum_bits);
        const uint row_min = metal::simd_min(sum_bits == sum_bits_min ? row_sum.x : 0xFFFFFFFFu);
        if (metal::simd_is_first()) {
            const uint2 threadgroup_output{row_min, sum_bits_min};
            atomic_min_explicit(&argmax[gid.y], as_type<ulong>(threadgroup_output), metal::memory_order_relaxed);
        }
    }
}

// Current constraints for the dense matmul kernel:
//  1- All B* and Sg_* are a multiple of 8.
//  2- Bm is divisible by Sg_n and Bn is divisible by Sg_n.
//  3- M, N and K are all divisible by 8..
template <uint Bm, uint Bn, uint Bk, uint Sg_Bm, uint Sg_Bn, uint add = 0>
inline void _gptoss_f32_bf16w_dense_matmul_impl(
    constant gptoss_dense_matmul_args& args, const device float* lhs,
    const device bfloat* rhs, const device bfloat* __restrict__ bias,
    device float* out, const device gptoss_control* control, threadgroup float* scratch, threadgroup float* bias_tile,
    uint sg_id, uint sg_count_per_tg, uint3 gid, uint3 tg_id, uint3 local_tid,
    uint3 threadgroup_size) {

    if (control->abort != 0) {
        return;
    }

    // The kernel assumes that M, K, and N are divisible by 8.
    const uint M = args.m;
    const uint K = args.k;
    const uint N = args.n;
    static_assert((Bm % 8u) == 0u, "Bm must be a multiple of 8");
    static_assert((Bn % 8u) == 0u, "Bn must be a multiple of 8");
    static_assert((Bk % 8u) == 0u, "Bk must be a multiple of 8");
    static_assert((Sg_Bm % 8u) == 0u, "Bk must be a multiple of 8");
    static_assert((Sg_Bn % 8u) == 0u, "Bk must be a multiple of 8");
    static_assert((Bn % Sg_Bn) == 0u, "Bn must be a multiple of Sg_Bn");
    static_assert((Bm % Sg_Bm) == 0u, "Bm must be a multiple of Sg_Bm");

    // Get row and col tg.
    const uint row_tg = tg_id.y;
    const uint col_tg = tg_id.x;
    // Get row and col local tid.
    const uint row_tg_offset = row_tg * Bm;
    const uint col_tg_offset = col_tg * Bn;

    const uint sg_col_count = Bn / Sg_Bn;
    const uint row_sg = sg_id / sg_col_count;
    const uint col_sg = sg_id % sg_col_count;

    const uint row_sg_offset = row_sg * Sg_Bm;
    const uint col_sg_offset = col_sg * Sg_Bn;
    constexpr uint temp_result_size = (Sg_Bm / 8) * (Sg_Bn / 8);
    // Create an array of simdgroup_float8x8 to hold temp results.
    metal::simdgroup_float8x8 OutTiles[temp_result_size];
#pragma clang loop unroll(full)
    for (uint i = 0; i < temp_result_size; i++) {
        OutTiles[i] = metal::make_filled_simdgroup_matrix<float, 8, 8>(
            static_cast<float>(0.0));
    }

    for (uint k_offset = 0; k_offset < K; k_offset += Bk) {
#pragma clang loop unroll(full)
        for (uint k = 0; k < Bk; k += 8) {
#pragma clang loop unroll(full)
            for (uint m_subtile_ = 0; m_subtile_ < Sg_Bm; m_subtile_ += 8) {
                // const uint m_subtile = row_sg_offset + m_subtile_;
                // const uint row_index_in_out_tile = (m_subtile - row_sg_offset) / 8;
                const uint row_index_in_out_tile = m_subtile_ / 8;
                metal::simdgroup_float8x8 LHStile;
                const uint k_id = k + k_offset;
                const uint row_offset = row_tg_offset + row_sg_offset + m_subtile_;
                metal::simdgroup_load(LHStile, lhs, K, ulong2(k_id, row_offset));
                metal::simdgroup_bfloat8x8 RHStile;
#pragma clang loop unroll(full)
                for (uint n_subtile_ = 0; n_subtile_ < Sg_Bn; n_subtile_ += 8) {
                    const uint col_index_in_out_tile = n_subtile_ / 8;
                    const uint current_index_out_tile =
                        row_index_in_out_tile * (Sg_Bn / 8) + col_index_in_out_tile;
                    const uint col_offset = col_tg_offset + col_sg_offset + n_subtile_;
                    simdgroup_load(RHStile, rhs, K, ulong2(k_id, col_offset), /*transpose=*/true);
                    // If rhs was not transposed, use the following instead:
                    // simdgroup_load(RHStile, rhs, N, ulong2(col_offset, k_id));
                    simdgroup_multiply_accumulate(OutTiles[current_index_out_tile],
                                                  LHStile, RHStile,
                                                  OutTiles[current_index_out_tile]);
                }
            }
        }
    }
    // Epilogue.
#pragma clang loop unroll(full)
    for (uint n_subtile_ = 0; n_subtile_ < Sg_Bn; n_subtile_ += 8) {
        const uint col_index_in_out_tile = n_subtile_ / 8;
        const uint local_col_offset = col_sg_offset + n_subtile_;
#pragma clang loop unroll(full)
        for (uint m_subtile_ = 0; m_subtile_ < Sg_Bm; m_subtile_ += 8) {
            const uint row_index_in_out_tile = m_subtile_ / 8;
            const uint local_row_offset = row_sg_offset + m_subtile_;
            const uint current_index_out_tile =
                row_index_in_out_tile * (Sg_Bn / 8) + col_index_in_out_tile;
            simdgroup_store(OutTiles[current_index_out_tile], scratch, Bn,
                            ulong2(local_col_offset, local_row_offset));
        }
    }
    // TODO(ibahmed): vectorize these loads an maybe unroll the loop.
    const uint thread_count_per_tg =
        threadgroup_size.x * threadgroup_size.y * threadgroup_size.z;
    for (uint c_local = local_tid.x; c_local < Bn;
         c_local += thread_count_per_tg) {
        const uint c_global = col_tg_offset + c_local;
        bias_tile[c_local] =
            (c_global < N) ? static_cast<float>(bias[c_global]) : 0.0f;
    }

    metal::threadgroup_barrier(metal::mem_flags::mem_threadgroup);

    // TODO(ibahmed): vectorize these stores and maybe unroll the loop.
    for (uint idx = local_tid.x; idx < Bm * Bn; idx += thread_count_per_tg) {
        const uint r = idx / Bn;
        const uint c = idx % Bn;

        const uint out_row = row_tg_offset + r;
        const uint out_col = col_tg_offset + c;

        if (out_row < M && out_col < N) {
            float acc = scratch[idx] + bias_tile[c];
            if (add) {
                acc += out[out_row * N + out_col];
            }
            out[out_row * N + out_col] = acc;
        }
    }
}

kernel void gptoss_f32_bf16w_dense_matmul_qkv(
    constant gptoss_dense_matmul_qkv_args& args [[buffer(0)]],
    const device float* lhs [[buffer(1)]],
    const device bfloat* rhs [[buffer(2)]],
    const device bfloat* __restrict__ bias [[buffer(3)]],
    device float* out [[buffer(4)]],
    device float* kv [[buffer(5)]],
    const device gptoss_control* control [[buffer(6)]],
    uint sg_id [[simdgroup_index_in_threadgroup]],
    uint sg_count_per_tg [[dispatch_simdgroups_per_threadgroup]],
    uint3 gid [[thread_position_in_grid]],
    uint3 tg_id [[threadgroup_position_in_grid]],
    uint3 local_tid [[thread_position_in_threadgroup]],
    uint3 threadgroup_size [[threads_per_threadgroup]]) {
    threadgroup float scratch[QKV_Bm * QKV_Bn];
    threadgroup float bias_tile[QKV_Bn];
    if (control->abort != 0) {
        return;
    }

    // The kernel assumes that QKV_Bm, QKV_Bn, QKV_Bk, QKV_Sg_Bm, QKV_Sg_Bn are divisible by 8.
    const uint M = args.m;
    const uint K = args.k;
    const uint N = args.n;
    const uint Bm = QKV_Bm;
    const uint Bn = QKV_Bn;
    const uint Bk = QKV_Bk;
    const uint Sg_Bm = QKV_Sg_Bm;
    const uint Sg_Bn = QKV_Sg_Bn;
    static_assert((Bm % 8u) == 0u, "Bm must be a multiple of 8");
    static_assert((Bn % 8u) == 0u, "Bn must be a multiple of 8");
    static_assert((Bk % 8u) == 0u, "Bk must be a multiple of 8");
    static_assert((Sg_Bm % 8u) == 0u, "Bk must be a multiple of 8");
    static_assert((Sg_Bn % 8u) == 0u, "Bk must be a multiple of 8");
    static_assert((Bn % Sg_Bn) == 0u, "Bn must be a multiple of Sg_Bn");
    static_assert((Bm % Sg_Bm) == 0u, "Bm must be a multiple of Sg_Bm");

    // Get row and col tg.
    const uint row_tg = tg_id.y;
    const uint col_tg = tg_id.x;
    // Get row and col local tid.
    const uint row_tg_offset = row_tg * Bm;
    const uint col_tg_offset = col_tg * Bn;

    const uint sg_col_count = Bn / Sg_Bn;
    const uint row_sg = sg_id / sg_col_count;
    const uint col_sg = sg_id % sg_col_count;

    const uint row_sg_offset = row_sg * Sg_Bm;
    const uint col_sg_offset = col_sg * Sg_Bn;
    constexpr uint temp_result_size = (Sg_Bm / 8) * (Sg_Bn / 8);
    // Create an array of simdgroup_float8x8 to hold temp results.
    metal::simdgroup_float8x8 OutTiles[temp_result_size];
#pragma clang loop unroll(full)
    for (uint i = 0; i < temp_result_size; i++) {
        OutTiles[i] = metal::make_filled_simdgroup_matrix<float, 8, 8>(
            static_cast<float>(0.0));
    }

    for (uint k_offset = 0; k_offset < K; k_offset += Bk) {
#pragma clang loop unroll(full)
        for (uint k = 0; k < Bk; k += 8) {
#pragma clang loop unroll(full)
            for (uint m_subtile_ = 0; m_subtile_ < Sg_Bm; m_subtile_ += 8) {
                const uint row_index_in_out_tile = m_subtile_ / 8;
                metal::simdgroup_float8x8 LHStile;
                const uint k_id = k + k_offset;
                const uint row_offset = row_tg_offset + row_sg_offset + m_subtile_;
                metal::simdgroup_load(LHStile, lhs, K, ulong2(k_id, row_offset));
                metal::simdgroup_bfloat8x8 RHStile;
#pragma clang loop unroll(full)
                for (uint n_subtile_ = 0; n_subtile_ < Sg_Bn; n_subtile_ += 8) {
                    const uint col_index_in_out_tile = n_subtile_ / 8;
                    const uint current_index_out_tile =
                        row_index_in_out_tile * (Sg_Bn / 8) + col_index_in_out_tile;
                    const uint col_offset = col_tg_offset + col_sg_offset + n_subtile_;
                    simdgroup_load(RHStile, rhs, K, ulong2(k_id, col_offset), /*transpose=*/true);
                    simdgroup_multiply_accumulate(OutTiles[current_index_out_tile],
                                                  LHStile, RHStile,
                                                  OutTiles[current_index_out_tile]);
                }
            }
        }
    }
    // Epilogue.
#pragma clang loop unroll(full)
    for (uint n_subtile_ = 0; n_subtile_ < Sg_Bn; n_subtile_ += 8) {
        const uint col_index_in_out_tile = n_subtile_ / 8;
        const uint local_col_offset = col_sg_offset + n_subtile_;
#pragma clang loop unroll(full)
        for (uint m_subtile_ = 0; m_subtile_ < Sg_Bm; m_subtile_ += 8) {
            const uint row_index_in_out_tile = m_subtile_ / 8;
            const uint local_row_offset = row_sg_offset + m_subtile_;
            const uint current_index_out_tile =
                row_index_in_out_tile * (Sg_Bn / 8) + col_index_in_out_tile;
            simdgroup_store(OutTiles[current_index_out_tile], scratch, Bn,
                            ulong2(local_col_offset, local_row_offset));
        }
    }
    // TODO(ibahmed): vectorize these loads an maybe unroll the loop.
    const uint thread_count_per_tg =
        threadgroup_size.x * threadgroup_size.y * threadgroup_size.z;
    for (uint c_local = local_tid.x; c_local < Bn;
         c_local += thread_count_per_tg) {
        const uint c_global = col_tg_offset + c_local;
        bias_tile[c_local] =
            (c_global < N) ? static_cast<float>(bias[c_global]) : 0.0f;
    }

    metal::threadgroup_barrier(metal::mem_flags::mem_threadgroup);
    const uint q_heads = 64;
    const uint kv_heads = 8;
    const uint head_dim = 64;
    const uint q_cols = q_heads * head_dim;
    const uint k_cols = kv_heads * head_dim;

    // TODO(ibahmed): vectorize these stores and maybe unroll the loop.
    for (uint idx = local_tid.x; idx < Bm * Bn; idx += thread_count_per_tg) {
        const uint r = idx / Bn;
        const uint c = idx % Bn;

        const uint out_row = row_tg_offset + r;
        const uint out_col = col_tg_offset + c;

        if (out_row < M && out_col < N) {
            float acc = scratch[idx] + bias_tile[c];
            if ((out_col < q_cols + k_cols)) {
                out[out_row * N + out_col] = acc;
            } else {
                // Write v into kv cache.
                const uint v_col = out_col - q_cols - k_cols;
                const uint v_head = v_col / head_dim;
                const uint dim_idx = v_col % head_dim;
                const uint token_idx = args.token_offset + out_row;
                kv[(v_head * args.max_tokens + token_idx) * 2 * head_dim + head_dim + dim_idx] = acc;
            }
        }
    }
}

kernel void gptoss_f32_bf16w_dense_matmul_attn_output(
    constant gptoss_dense_matmul_args& args [[buffer(0)]],
    const device float* lhs [[buffer(1)]],
    const device bfloat* rhs [[buffer(2)]],
    const device bfloat* __restrict__ bias [[buffer(3)]],
    device float* out [[buffer(4)]],
    const device gptoss_control* control [[buffer(5)]],
    uint sg_id [[simdgroup_index_in_threadgroup]],
    uint sg_count_per_tg [[dispatch_simdgroups_per_threadgroup]],
    uint3 gid [[thread_position_in_grid]],
    uint3 tg_id [[threadgroup_position_in_grid]],
    uint3 local_tid [[thread_position_in_threadgroup]],
    uint3 threadgroup_size [[threads_per_threadgroup]]) {
    threadgroup float scratch[ATTN_OUTPUT_Bm * ATTN_OUTPUT_Bn];
    threadgroup float bias_tile[ATTN_OUTPUT_Bn];
    _gptoss_f32_bf16w_dense_matmul_impl<ATTN_OUTPUT_Bm, ATTN_OUTPUT_Bn,
                                        ATTN_OUTPUT_Bk, ATTN_OUTPUT_Sg_Bm,
                                        ATTN_OUTPUT_Sg_Bn, /*add=*/1>(
        args, lhs, rhs, bias, out, control, scratch, bias_tile, sg_id, sg_count_per_tg,
        gid, tg_id, local_tid, threadgroup_size);
}

kernel void gptoss_f32_bf16w_dense_matmul_mlp_gate(
    constant gptoss_dense_matmul_args& args [[buffer(0)]],
    const device float* lhs [[buffer(1)]],
    const device bfloat* rhs [[buffer(2)]],
    const device bfloat* __restrict__ bias [[buffer(3)]],
    device float* out [[buffer(4)]],
    const device gptoss_control* control [[buffer(5)]],
    uint sg_id [[simdgroup_index_in_threadgroup]],
    uint sg_count_per_tg [[dispatch_simdgroups_per_threadgroup]],
    uint3 gid [[thread_position_in_grid]],
    uint3 tg_id [[threadgroup_position_in_grid]],
    uint3 local_tid [[thread_position_in_threadgroup]],
    uint3 threadgroup_size [[threads_per_threadgroup]]) {
    threadgroup float scratch[MLP_GATE_Bm * MLP_GATE_Bn];
    threadgroup float bias_tile[MLP_GATE_Bn];
    _gptoss_f32_bf16w_dense_matmul_impl<MLP_GATE_Bm, MLP_GATE_Bn, MLP_GATE_Bk,
                                        MLP_GATE_Sg_Bm, MLP_GATE_Sg_Bn>(
        args, lhs, rhs, bias, out, control, scratch, bias_tile, sg_id, sg_count_per_tg,
        gid, tg_id, local_tid, threadgroup_size);
}


================================================
FILE: gpt_oss/metal/source/metal-kernels.c
================================================
#include <inttypes.h>
#include <stddef.h>
#include <stdint.h>
#include <math.h>

#include <internal/kernel-args.h>
#include <internal/log.h>
#include <internal/math.h>
#include <internal/metal.h>
#include <internal/metal-kernels.h>


enum gptoss_status gptoss_metal_command_buffer_encode_launch_u32_fill_random(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* u32_fill_random_fn,
    size_t threadgroup_size,
    size_t max_threadgroups,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    uint64_t num_elements,
    uint64_t rng_seed,
    uint64_t rng_offset)
{
    if (command_buffer->object == NULL || u32_fill_random_fn->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }

    if (threadgroup_size == 0) {
        threadgroup_size = u32_fill_random_fn->max_threadgroup_threads;
    } else if (threadgroup_size > u32_fill_random_fn->max_threadgroup_threads) {
        return gptoss_status_invalid_argument;
    }

    const size_t num_vecs = num_elements;
    const size_t num_vecs_per_threadgroup = math_ceil_div(num_vecs, max_threadgroups * threadgroup_size) * threadgroup_size;
    const size_t num_threadgroups = math_min(max_threadgroups, math_ceil_div(num_vecs, num_vecs_per_threadgroup));
    const struct gptoss_u32_fill_random_args args = {
        .num_vecs = num_vecs,
        .num_vecs_per_threadgroup = num_vecs_per_threadgroup,
        .seed = rng_seed,
        .offset = rng_offset,
    };

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, u32_fill_random_fn,
        threadgroup_size, 1, 1,
        num_threadgroups, 1, 1,
        sizeof(args), &args,
        1, &output_buffer, &output_offset,
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_fill_random(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_fill_random_fn,
    size_t threadgroup_size,
    size_t max_threadgroups,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    uint64_t num_elements,
    uint64_t rng_seed,
    uint64_t rng_offset,
    float rng_min,
    float rng_max)
{
    if (command_buffer->object == NULL || f32_fill_random_fn->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }

    if (threadgroup_size == 0) {
        threadgroup_size = f32_fill_random_fn->max_threadgroup_threads;
    } else if (threadgroup_size > f32_fill_random_fn->max_threadgroup_threads) {
        return gptoss_status_invalid_argument;
    }

    if (rng_min >= rng_max) {
        return gptoss_status_invalid_argument;
    }

    const size_t num_vecs = num_elements;
    const size_t num_vecs_per_threadgroup = math_ceil_div(num_vecs, max_threadgroups * threadgroup_size) * threadgroup_size;
    const size_t num_threadgroups = math_min(max_threadgroups, math_ceil_div(num_vecs, num_vecs_per_threadgroup));
    const struct gptoss_f32_fill_random_args args = {
        .num_vecs = num_vecs,
        .num_vecs_per_threadgroup = num_vecs_per_threadgroup,
        .seed = rng_seed,
        .offset = rng_offset,
        .scale = (rng_max - rng_min) * 0x1.0p-32f,
        .bias = (rng_min + rng_max) * 0.5f,
    };

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_fill_random_fn,
        threadgroup_size, 1, 1,
        num_threadgroups, 1, 1,
        sizeof(args), &args,
        1, &output_buffer, &output_offset,
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_bf16_fill_random(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* bf16_fill_random_fn,
    size_t threadgroup_size,
    size_t max_threadgroups,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    uint64_t num_elements,
    uint64_t rng_seed,
    uint64_t rng_offset,
    float rng_min,
    float rng_max)
{
    if (command_buffer->object == NULL || bf16_fill_random_fn->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }

    if (threadgroup_size == 0) {
        threadgroup_size = bf16_fill_random_fn->max_threadgroup_threads;
    } else if (threadgroup_size > bf16_fill_random_fn->max_threadgroup_threads) {
        return gptoss_status_invalid_argument;
    }

    if (rng_min >= rng_max) {
        return gptoss_status_invalid_argument;
    }

    const size_t num_vecs = num_elements;
    const size_t num_vecs_per_threadgroup = math_ceil_div(num_vecs, max_threadgroups * threadgroup_size) * threadgroup_size;
    const size_t num_threadgroups = math_min(max_threadgroups, math_ceil_div(num_vecs, num_vecs_per_threadgroup));
    const struct gptoss_f32_fill_random_args args = {
        .num_vecs = num_vecs,
        .num_vecs_per_threadgroup = num_vecs_per_threadgroup,
        .seed = rng_seed,
        .offset = rng_offset,
        .scale = (rng_max - rng_min) * 0x1.0p-32f,
        .bias = (rng_min + rng_max) * 0.5f,
    };

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, bf16_fill_random_fn,
        threadgroup_size, 1, 1,
        num_threadgroups, 1, 1,
        sizeof(args), &args,
        1, &output_buffer, &output_offset,
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_mf4_f32_convert(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* mf4_f32_convert_fn,
    size_t threadgroup_size,
    size_t max_threadgroups,
    const struct gptoss_metal_buffer* block_buffer,
    const struct gptoss_metal_buffer* scale_buffer,
    const struct gptoss_metal_buffer* output_buffer,
    uint64_t num_elements)
{
    if (command_buffer->object == NULL || mf4_f32_convert_fn->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }

    if (num_elements % 32 != 0) {
        return gptoss_status_invalid_argument;
    }

    if (threadgroup_size == 0) {
        threadgroup_size = mf4_f32_convert_fn->max_threadgroup_threads;
    } else if (threadgroup_size > mf4_f32_convert_fn->max_threadgroup_threads) {
        return gptoss_status_invalid_argument;
    }

    const size_t num_vecs = num_elements / 32;
    const size_t num_vecs_per_threadgroup = math_ceil_div(num_vecs, max_threadgroups * threadgroup_size) * threadgroup_size;
    const size_t num_threadgroups = math_min(max_threadgroups, math_ceil_div(num_vecs, num_vecs_per_threadgroup));
    const struct gptoss_convert_args args = {
        .num_vecs = num_vecs,
        .num_vecs_per_threadgroup = num_vecs_per_threadgroup,
    };

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, mf4_f32_convert_fn,
        threadgroup_size, 1, 1,
        num_threadgroups, 1, 1,
        sizeof(args), &args,
        3, (const struct gptoss_metal_buffer *[]) {block_buffer, scale_buffer, output_buffer}, NULL,
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_bf16_f32_embeddings(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* bf16_f32_embeddings_fn,
    size_t threadgroup_size,
    const struct gptoss_metal_buffer* token_buffer,
    size_t token_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_channels)
{
    if (command_buffer->object == NULL || bf16_f32_embeddings_fn->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }

    if (num_channels % 4 != 0) {
        return gptoss_status_invalid_argument;
    }

    if (threadgroup_size == 0) {
        threadgroup_size = bf16_f32_embeddings_fn->max_threadgroup_threads;
    } else if (threadgroup_size > bf16_f32_embeddings_fn->max_threadgroup_threads) {
        return gptoss_status_invalid_argument;
    }

    const uint32_t num_vecs = num_channels / 4;
    const struct gptoss_embeddings_args args = {
        .num_vecs = num_vecs,
    };

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, bf16_f32_embeddings_fn,
        threadgroup_size, 1, 1,
        num_tokens, 1, 1,
        sizeof(args), &args,
        4,
        (const struct gptoss_metal_buffer *[]) {token_buffer, weight_buffer, output_buffer, control_buffer},
        (const size_t[]) {token_offset, weight_offset, output_offset, control_offset},
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_bf16w_rmsnorm(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_bf16w_rmsnorm_fn,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_channels,
    float epsilon)
{
    if (command_buffer->object == NULL || f32_bf16w_rmsnorm_fn->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }

    if (num_channels % 4 != 0) {
        return gptoss_status_invalid_argument;
    }

    if (f32_bf16w_rmsnorm_fn->max_threadgroup_threads < 1024) {
        return gptoss_status_unsupported_system;
    }

    if (f32_bf16w_rmsnorm_fn->simdgroup_threads != 32) {
        return gptoss_status_unsupported_system;
    }

    const uint32_t num_vecs = num_channels / 4;
    const struct gptoss_rmsnorm_args args = {
        .num_vecs = num_vecs,
        .num_channels = (float) num_channels,
        .epsilon = epsilon,
    };

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_bf16w_rmsnorm_fn,
        /*threadgroup_size=*/1024, 1, 1,
        num_tokens, 1, 1,
        sizeof(args), &args,
        4,
        (const struct gptoss_metal_buffer *[]) {input_buffer, weight_buffer, output_buffer, control_buffer},
        (const size_t[]) {input_offset, weight_offset, output_offset, control_offset},
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_bf16w_matmul(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_bf16w_matmul_fn,
    size_t threadgroup_size,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_cols,
    uint32_t num_rows)
{
    if (command_buffer->object == NULL || f32_bf16w_matmul_fn->pipeline_state_object == NULL) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul kernel launch: invalid command buffer or pipeline state object");
        return gptoss_status_invalid_state;
    }

    if (threadgroup_size == 0) {
        threadgroup_size = f32_bf16w_matmul_fn->simdgroup_threads;
    } else if (threadgroup_size > f32_bf16w_matmul_fn->max_threadgroup_threads) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul kernel launch: threadgroup size (%zu) exceeds supported maximum (%zu)",
            threadgroup_size, f32_bf16w_matmul_fn->max_threadgroup_threads);
        return gptoss_status_invalid_argument;
    }

    if (num_cols % 4 != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul kernel launch: number of columns (%" PRIu32 ") is not divisible by 4",
            num_cols);
        return gptoss_status_invalid_argument;
    }
    const size_t num_simdgroups = threadgroup_size / f32_bf16w_matmul_fn->simdgroup_threads;
    if (num_rows % num_simdgroups != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul kernel launch: number of rows (%" PRIu32 ") is not divisible by the number of simdgroups (%zu)",
            num_rows, num_simdgroups);
        return gptoss_status_invalid_argument;
    }

    const struct gptoss_matmul_args args = {
        .num_column_vecs = num_cols / 4,
        .num_rows = num_rows,
        .add = 0,
    };

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_bf16w_matmul_fn,
        threadgroup_size, 1, 1,
        num_rows / num_simdgroups, num_tokens, 1,
        sizeof(args), &args,
        5,
        (const struct gptoss_metal_buffer *[]) {input_buffer, weight_buffer, bias_buffer, output_buffer, control_buffer},
        (const size_t[]) {input_offset, weight_offset, bias_offset, output_offset, control_offset},
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_bf16w_matmul_qkv(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_bf16w_matmul_qkv_fn,
    size_t threadgroup_size,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* kv_buffer,
    size_t kv_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_cols,
    uint32_t num_q_heads,
    uint32_t num_kv_heads,
    uint32_t attn_head_dim,
    uint32_t token_offset,
    uint32_t max_tokens,
    float rope_base,
    float interpolation_scale,
    float yarn_offset,
    float yarn_scale,
    float yarn_multiplier)
{
    if (command_buffer->object == NULL || f32_bf16w_matmul_qkv_fn->pipeline_state_object == NULL) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul_qkv kernel launch: invalid command buffer or pipeline state object");
        return gptoss_status_invalid_state;
    }

    if (threadgroup_size == 0) {
        threadgroup_size = f32_bf16w_matmul_qkv_fn->simdgroup_threads;
    } else if (threadgroup_size > f32_bf16w_matmul_qkv_fn->max_threadgroup_threads) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul_qkv kernel launch: threadgroup size (%zu) exceeds supported maximum (%zu)",
            threadgroup_size, f32_bf16w_matmul_qkv_fn->max_threadgroup_threads);
        return gptoss_status_invalid_argument;
    }

    if (num_cols % 4 != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul_qkv kernel launch: number of columns (%" PRIu32 ") is not divisible by 4",
            num_cols);
        return gptoss_status_invalid_argument;
    }

    if (num_q_heads != 64) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul_qkv kernel launch: number of Q heads (%" PRIu32 ") must be 64",
            num_q_heads);
        return gptoss_status_invalid_argument;
    }
    if (num_kv_heads != 8) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul_qkv kernel launch: number of KV heads (%" PRIu32 ") must be 8",
            num_kv_heads);
        return gptoss_status_invalid_argument;
    }
    if (attn_head_dim != 64) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul_qkv kernel launch: attention head dimension (%" PRIu32 ") must be 64",
            attn_head_dim);
        return gptoss_status_invalid_argument;
    }

    const size_t num_simdgroups = threadgroup_size / f32_bf16w_matmul_qkv_fn->simdgroup_threads;
    const uint32_t num_rows = (num_q_heads + 2 * num_kv_heads) * attn_head_dim;
    if (num_rows % num_simdgroups != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul_qkv kernel launch: number of rows (%" PRIu32 ") is not divisible by the number of simdgroups (%zu)",
            num_rows, num_simdgroups);
        return gptoss_status_invalid_argument;
    }

    const struct gptoss_qkv_args args = {
        .num_column_vecs = num_cols / 4,
        .num_rows = num_rows,
        .token_offset = token_offset,
        .freq_scale = -logf(rope_base) / (float) (int32_t) (attn_head_dim / 2),
        .interpolation_scale = interpolation_scale,
        .yarn_offset = yarn_offset,
        .yarn_scale = yarn_scale,
        .yarn_multiplier = yarn_multiplier,
        .max_tokens = max_tokens,
    };

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_bf16w_matmul_qkv_fn,
        threadgroup_size, 1, 1,
        num_rows / num_simdgroups, num_tokens, 1,
        sizeof(args), &args,
        6,
        (const struct gptoss_metal_buffer *[]) {input_buffer, weight_buffer, bias_buffer, output_buffer, kv_buffer, control_buffer},
        (const size_t[]) {input_offset, weight_offset, bias_offset, output_offset, kv_offset, control_offset},
        /*threadgroup_buffer_size=*/num_simdgroups * sizeof(float));
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_bf16w_matmul_add(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_bf16w_matmul_fn,
    size_t threadgroup_size,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_cols,
    uint32_t num_rows)
{
    if (command_buffer->object == NULL || f32_bf16w_matmul_fn->pipeline_state_object == NULL) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul_add kernel launch: invalid command buffer or pipeline state object");
        return gptoss_status_invalid_state;
    }

    if (threadgroup_size == 0) {
        threadgroup_size = f32_bf16w_matmul_fn->simdgroup_threads;
    } else if (threadgroup_size > f32_bf16w_matmul_fn->max_threadgroup_threads) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul_add kernel launch: threadgroup size (%zu) exceeds supported maximum (%zu)",
            threadgroup_size, f32_bf16w_matmul_fn->max_threadgroup_threads);
        return gptoss_status_invalid_argument;
    }

    if (num_cols % 4 != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul_add kernel launch: number of columns (%" PRIu32 ") is not divisible by 4",
            num_cols);
        return gptoss_status_invalid_argument;
    }
    const size_t num_simdgroups = threadgroup_size / f32_bf16w_matmul_fn->simdgroup_threads;
    if (num_rows % num_simdgroups != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul_add kernel launch: number of rows (%" PRIu32 ") is not divisible by the number of simdgroups (%zu)",
            num_rows, num_simdgroups);
        return gptoss_status_invalid_argument;
    }

    const struct gptoss_matmul_args args = {
        .num_column_vecs = num_cols / 4,
        .num_rows = num_rows,
        .add = 1,
    };

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_bf16w_matmul_fn,
        threadgroup_size, 1, 1,
        num_rows / num_simdgroups, num_tokens, 1,
        sizeof(args), &args,
        5,
        (const struct gptoss_metal_buffer *[]) {input_buffer, weight_buffer, bias_buffer, output_buffer, control_buffer},
        (const size_t[]) {input_offset, weight_offset, bias_offset, output_offset, control_offset},
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status _gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_impl(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_bf16w_dense_matmul_fn,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset, 
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_cols,
    uint32_t num_rows,
    uint32_t Bm,
    uint32_t Bn,
    uint32_t Bk,
    uint32_t Sg_Bm,
    uint32_t Sg_Bn)
{

    if (command_buffer->object == NULL || f32_bf16w_dense_matmul_fn->pipeline_state_object == NULL) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul kernel launch: invalid command buffer or pipeline state object");
        return gptoss_status_invalid_state;
    }

    if (num_cols % 8 != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul kernel launch: number of columns (%" PRIu32 ") is not divisible by 8",
                         num_cols);
        return gptoss_status_invalid_argument;
    }
    if (num_rows % 8 != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul kernel launch: number of rows (%" PRIu32 ") is not divisible by 8",
                         num_rows);
        return gptoss_status_invalid_argument;
    }

    const struct gptoss_dense_matmul_args args = {
        .m = num_tokens,
        .n = num_rows,
        .k = num_cols,
    };
    const size_t threads_per_simdgroup = f32_bf16w_dense_matmul_fn->simdgroup_threads;
    const uint32_t m = args.m;
    const uint32_t n = args.n;
    const uint32_t k = args.k;
    if (Bm % Sg_Bm != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul kernel launch: Bm (%" PRIu32 ") is not divisible by Sg_Bm (%" PRIu32 ")",
                         Bm, Sg_Bm);
        return gptoss_status_invalid_argument;
    }
    if (Bn % Sg_Bn != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul kernel launch: Bn (%" PRIu32 ") is not divisible by Sg_Bn (%" PRIu32 ")",
                         Bn, Sg_Bn);
        return gptoss_status_invalid_argument;
    }
    const size_t threadgroup_size_x = (Bm / Sg_Bm) * (Bn / Sg_Bn) * threads_per_simdgroup;
    const size_t threadgroup_size_y = 1;
    const size_t threadgroup_size_z = 1;
    const size_t total_threadgroup_size = threadgroup_size_x * threadgroup_size_y * threadgroup_size_z;
    if (total_threadgroup_size > f32_bf16w_dense_matmul_fn->max_threadgroup_threads) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul kernel launch: total threadgroup size (%zu) exceeds supported maximum (%zu)",
                         total_threadgroup_size, f32_bf16w_dense_matmul_fn->max_threadgroup_threads);
        return gptoss_status_invalid_argument;
    }
    if (n % Bn != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul kernel launch: n (%" PRIu32 ") is not divisible by Bn (%" PRIu32 ")",
                         n, Bn);
        return gptoss_status_invalid_argument;
    }
    if (k % Bk != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul kernel launch: k (%" PRIu32 ") is not divisible by Bk (%" PRIu32 ")",
                         k, Bk);
        return gptoss_status_invalid_argument;
    }
    const size_t grid_x = n / Bn;
    const size_t grid_y = math_ceil_div(m, Bm);
    const size_t grid_z = 1;

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_bf16w_dense_matmul_fn,
        threadgroup_size_x, threadgroup_size_y, threadgroup_size_z,
        grid_x, grid_y, grid_z,
        sizeof(args), &args,
        5,
        (const struct gptoss_metal_buffer *[]){input_buffer, weight_buffer, bias_buffer, output_buffer, control_buffer},
        (const size_t[]){input_offset, weight_offset, bias_offset, output_offset, control_offset},
        /*threadgroup_buffer_size=*/0);
    return gptoss_status_success;
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_qkv(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_bf16w_dense_matmul_fn,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* kv_buffer,
    size_t kv_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_cols,
    uint32_t num_rows,
    uint32_t max_tokens,
    uint32_t token_offset)
{
    if (command_buffer->object == NULL || f32_bf16w_dense_matmul_fn->pipeline_state_object == NULL) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul kernel launch: invalid command buffer or pipeline state object");
        return gptoss_status_invalid_state;
    }

    if (num_cols % 8 != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul kernel launch: number of columns (%" PRIu32 ") is not divisible by 8",
                         num_cols);
        return gptoss_status_invalid_argument;
    }
    if (num_rows % 8 != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul kernel launch: number of rows (%" PRIu32 ") is not divisible by 8",
                         num_rows);
        return gptoss_status_invalid_argument;
    }

    const struct gptoss_dense_matmul_qkv_args args = {
        .m = num_tokens,
        .n = num_rows,
        .k = num_cols,
        .max_tokens = max_tokens,
        .token_offset = token_offset,
    };
    const size_t threads_per_simdgroup = f32_bf16w_dense_matmul_fn->simdgroup_threads;
    const uint32_t m = args.m;
    const uint32_t n = args.n;
    const uint32_t k = args.k;
    if (QKV_Bm % QKV_Sg_Bm != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul kernel launch: Bm (%" PRIu32 ") is not divisible by Sg_Bm (%" PRIu32 ")",
                         QKV_Bm, QKV_Sg_Bm);
        return gptoss_status_invalid_argument;
    }
    if (QKV_Bn % QKV_Sg_Bn != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul kernel launch: Bn (%" PRIu32 ") is not divisible by Sg_Bn (%" PRIu32 ")",
                         QKV_Bn, QKV_Sg_Bn);
        return gptoss_status_invalid_argument;
    }
    const size_t threadgroup_size_x = (QKV_Bm / QKV_Sg_Bm) * (QKV_Bn / QKV_Sg_Bn) * threads_per_simdgroup;
    const size_t threadgroup_size_y = 1;
    const size_t threadgroup_size_z = 1;
    const size_t total_threadgroup_size = threadgroup_size_x * threadgroup_size_y * threadgroup_size_z;
    if (total_threadgroup_size > f32_bf16w_dense_matmul_fn->max_threadgroup_threads) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul kernel launch: total threadgroup size (%zu) exceeds supported maximum (%zu)",
                         total_threadgroup_size, f32_bf16w_dense_matmul_fn->max_threadgroup_threads);
        return gptoss_status_invalid_argument;
    }
    if (n % QKV_Bn != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul kernel launch: n (%" PRIu32 ") is not divisible by Bn (%" PRIu32 ")",
                         n, QKV_Bn);
        return gptoss_status_invalid_argument;
    }
    if (k % QKV_Bk != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_dense_matmul kernel launch: k (%" PRIu32 ") is not divisible by Bk (%" PRIu32 ")",
                         k, QKV_Bk);
        return gptoss_status_invalid_argument;
    }
    const size_t grid_x = n / QKV_Bn;
    const size_t grid_y = math_ceil_div(m, QKV_Bm);
    const size_t grid_z = 1;

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_bf16w_dense_matmul_fn,
        threadgroup_size_x, threadgroup_size_y, threadgroup_size_z,
        grid_x, grid_y, grid_z,
        sizeof(args), &args,
        6,
        (const struct gptoss_metal_buffer *[]){input_buffer, weight_buffer, bias_buffer, output_buffer, kv_buffer, control_buffer},
        (const size_t[]){input_offset, weight_offset, bias_offset, output_offset, kv_offset, control_offset},
        /*threadgroup_buffer_size=*/0);
    return gptoss_status_success;
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_attn_output(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_bf16w_dense_matmul_fn,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_cols,
    uint32_t num_rows)
{
    return _gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_impl(
        command_buffer, f32_bf16w_dense_matmul_fn, input_buffer, input_offset,
        weight_buffer, weight_offset, bias_buffer, bias_offset, output_buffer,
        output_offset, control_buffer, control_offset, num_tokens, num_cols, num_rows, ATTN_OUTPUT_Bm,
        ATTN_OUTPUT_Bn, ATTN_OUTPUT_Bk, ATTN_OUTPUT_Sg_Bm, ATTN_OUTPUT_Sg_Bn);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_mlp_gate(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_bf16w_dense_matmul_fn,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_cols,
    uint32_t num_rows)
{
    return _gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_impl(
        command_buffer, f32_bf16w_dense_matmul_fn, input_buffer, input_offset,
        weight_buffer, weight_offset, bias_buffer, bias_offset, output_buffer,
        output_offset, control_buffer, control_offset, num_tokens, num_cols,
        num_rows, MLP_GATE_Bm, MLP_GATE_Bn, MLP_GATE_Bk, MLP_GATE_Sg_Bm,
        MLP_GATE_Sg_Bn);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_bf16w_unembedding(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_bf16w_unembedding_fn,
    size_t threadgroup_size,
    size_t max_threadgroups,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_buffer,
    size_t weight_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* argmax_buffer,
    size_t argmax_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_cols,
    uint32_t num_rows)
{
    if (command_buffer->object == NULL || f32_bf16w_unembedding_fn->pipeline_state_object == NULL) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_unembedding kernel launch: invalid command buffer or pipeline state object");
        return gptoss_status_invalid_state;
    }

    if (threadgroup_size == 0) {
        threadgroup_size = f32_bf16w_unembedding_fn->simdgroup_threads;
    } else if (threadgroup_size > f32_bf16w_unembedding_fn->max_threadgroup_threads) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_unembedding kernel launch: threadgroup size (%zu) exceeds supported maximum (%zu)",
            threadgroup_size, f32_bf16w_unembedding_fn->max_threadgroup_threads);
        return gptoss_status_invalid_argument;
    }

    if (num_cols % 4 != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_unembedding kernel launch: number of columns (%" PRIu32 ") is not divisible by 4",
            num_cols);
        return gptoss_status_invalid_argument;
    }

    const size_t num_simdgroups = threadgroup_size / f32_bf16w_unembedding_fn->simdgroup_threads;
    const size_t num_rows_per_threadgroup = math_ceil_div(num_rows, max_threadgroups * num_simdgroups) * num_simdgroups;
    const size_t num_threadgroups = math_min(max_threadgroups, math_ceil_div(num_rows, num_rows_per_threadgroup));
    const struct gptoss_unembedding_args args = {
        .num_column_vecs = num_cols / 4,
        .num_rows_per_threadgroup = num_rows_per_threadgroup,
        .num_rows = num_rows,
    };

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_bf16w_unembedding_fn,
        threadgroup_size, 1, 1,
        num_threadgroups, num_tokens, 1,
        sizeof(args), &args,
        5,
        (const struct gptoss_metal_buffer *[]) {input_buffer, weight_buffer, output_buffer, argmax_buffer, control_buffer},
        (const size_t[]) {input_offset, weight_offset, output_offset, argmax_offset, control_offset},
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_mf4w_moe_matmul_swiglu(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_mf4w_moe_matmul_swiglu_fn,
    size_t threadgroup_size,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* expert_buffer,
    size_t expert_offset,
    const struct gptoss_metal_buffer* weight_block_buffer,
    size_t weight_block_offset,
    const struct gptoss_metal_buffer* weight_scale_buffer,
    size_t weight_scale_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    float swiglu_limit,
    uint32_t expert_stride,
    uint32_t num_tokens,
    uint32_t num_active_experts,
    uint32_t num_cols,
    uint32_t num_rows)
{
    if (command_buffer->object == NULL || f32_mf4w_moe_matmul_swiglu_fn->pipeline_state_object == NULL) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_matmul_swiglu kernel launch: invalid command buffer or pipeline state object");
        return gptoss_status_invalid_state;
    }

    if (threadgroup_size == 0) {
        threadgroup_size = 2 * f32_mf4w_moe_matmul_swiglu_fn->simdgroup_threads;
    } else if (threadgroup_size > f32_mf4w_moe_matmul_swiglu_fn->max_threadgroup_threads) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_matmul_swiglu kernel launch: threadgroup size (%zu) exceeds supported maximum (%zu)",
            threadgroup_size, f32_mf4w_moe_matmul_swiglu_fn->max_threadgroup_threads);
        return gptoss_status_invalid_argument;
    } else if (threadgroup_size % (2 * f32_mf4w_moe_matmul_swiglu_fn->simdgroup_threads)) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_matmul_swiglu kernel launch: threadgroup size (%zu) is not divisible by simdgroup size (%zu) multiplied by 2X",
            threadgroup_size, f32_mf4w_moe_matmul_swiglu_fn->simdgroup_threads);
        return gptoss_status_invalid_argument;
    }

    if (num_cols % 32 != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_matmul_swiglu kernel launch: number of columns (%" PRIu32 ") is not divisible by 32",
            num_cols);
        return gptoss_status_invalid_argument;
    }
    const size_t num_simdgroups = threadgroup_size / f32_mf4w_moe_matmul_swiglu_fn->simdgroup_threads;
    if ((2 * num_rows) % num_simdgroups != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_bf16w_matmul_add kernel launch: "
            "the number of rows (%" PRIu32 ") multiplied by 2X is not divisible by the number of simdgroups (%zu)",
            num_rows, num_simdgroups);
        return gptoss_status_invalid_argument;
    }

    const struct gptoss_moe_matmul_swiglu_args args = {
        .num_column_vecs = num_cols / 32,
        .num_rows = num_rows,
        .num_active_experts = num_active_experts,
        .weight_expert_stride = expert_stride,
        .output_expert_stride = num_rows * num_tokens,
        .swiglu_min = -swiglu_limit,
        .swiglu_max = swiglu_limit,
    };

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_mf4w_moe_matmul_swiglu_fn,
        threadgroup_size, 1, 1,
        (2 * num_rows) / num_simdgroups, num_tokens, num_active_experts,
        sizeof(args), &args,
        7,
        (const struct gptoss_metal_buffer *[]) {input_buffer, expert_buffer, weight_block_buffer, weight_scale_buffer, bias_buffer, output_buffer, control_buffer},
        (const size_t[]) {input_offset, expert_offset, weight_block_offset, weight_scale_offset, bias_offset, output_offset, control_offset},
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_mf4w_moe_matmul(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_mf4w_moe_matmul_fn,
    size_t threadgroup_size,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* expert_buffer,
    size_t expert_offset,
    const struct gptoss_metal_buffer* weight_block_buffer,
    size_t weight_block_offset,
    const struct gptoss_metal_buffer* weight_scale_buffer,
    size_t weight_scale_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t expert_stride,
    uint32_t num_tokens,
    uint32_t num_active_experts,
    uint32_t num_cols,
    uint32_t num_rows)
{
    if (command_buffer->object == NULL || f32_mf4w_moe_matmul_fn->pipeline_state_object == NULL) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_matmul kernel launch: invalid command buffer or pipeline state object");
        return gptoss_status_invalid_state;
    }

    if (threadgroup_size == 0) {
        threadgroup_size = f32_mf4w_moe_matmul_fn->simdgroup_threads;
    } else if (threadgroup_size > f32_mf4w_moe_matmul_fn->max_threadgroup_threads) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_matmul kernel launch: threadgroup size (%zu) exceeds supported maximum (%zu)",
            threadgroup_size, f32_mf4w_moe_matmul_fn->max_threadgroup_threads);
        return gptoss_status_invalid_argument;
    } else if (threadgroup_size % f32_mf4w_moe_matmul_fn->simdgroup_threads) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_matmul kernel launch: threadgroup size (%zu) is not divisible by simdgroup size (%zu)",
            threadgroup_size, f32_mf4w_moe_matmul_fn->simdgroup_threads);
        return gptoss_status_invalid_argument;
    }

    if (num_cols % 32 != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_matmul kernel launch: number of columns (%" PRIu32 ") is not divisible by 32",
            num_cols);
        return gptoss_status_invalid_argument;
    }
    const size_t num_simdgroups = threadgroup_size / f32_mf4w_moe_matmul_fn->simdgroup_threads;
    if (num_rows % num_simdgroups != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_matmul kernel launch: "
            "the number of rows (%" PRIu32 ") is not divisible by the number of simdgroups (%zu)",
            num_rows, num_simdgroups);
        return gptoss_status_invalid_argument;
    }

    const struct gptoss_moe_matmul_args args = {
        .num_column_vecs = num_cols / 32,
        .num_rows = num_rows,
        .num_active_experts = num_active_experts,
        .input_expert_stride = num_tokens * (num_cols / 32),
        .weight_expert_stride = expert_stride,
        .output_expert_stride = num_rows * num_tokens,
    };

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_mf4w_moe_matmul_fn,
        threadgroup_size, 1, 1,
        num_rows / num_simdgroups, num_tokens, num_active_experts,
        sizeof(args), &args,
        7,
        (const struct gptoss_metal_buffer *[]) {input_buffer, expert_buffer, weight_block_buffer, weight_scale_buffer, bias_buffer, output_buffer, control_buffer},
        (const size_t[]) {input_offset, expert_offset, weight_block_offset, weight_scale_offset, bias_offset, output_offset, control_offset},
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_rope(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_rope_fn,
    size_t threadgroup_size,
    const struct gptoss_metal_buffer* activations_buffer,
    size_t activations_offset,
    const struct gptoss_metal_buffer* kv_buffer,
    size_t kv_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    float rope_base,
    float interpolation_scale,
    float yarn_offset,
    float yarn_scale,
    float yarn_multiplier,
    uint32_t num_tokens,
    uint32_t num_q_heads,
    uint32_t num_kv_heads,
    uint32_t attn_head_dim,
    uint32_t max_tokens,
    uint32_t token_offset)
{
    if (command_buffer->object == NULL || f32_rope_fn->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }

    if (threadgroup_size == 0) {
        threadgroup_size = f32_rope_fn->max_threadgroup_threads;
    } else if (threadgroup_size > f32_rope_fn->max_threadgroup_threads) {
        return gptoss_status_invalid_argument;
    }

    const size_t num_simdgroups = threadgroup_size / f32_rope_fn->simdgroup_threads;
    const uint32_t num_qk_heads = num_q_heads + num_kv_heads;
    if (num_qk_heads % num_simdgroups != 0) {
        return gptoss_status_invalid_argument;
    }

    const struct gptoss_rope_args args = {
        .token_stride = (num_q_heads + 2 * num_kv_heads) * (attn_head_dim / 2),
        .token_offset = token_offset,
        .max_tokens = max_tokens,
        .freq_scale = -logf(rope_base) / (float) (int32_t) (attn_head_dim / 2),
        .interpolation_scale = interpolation_scale,
        .yarn_offset = yarn_offset,
        .yarn_scale = yarn_scale,
        .yarn_multiplier = yarn_multiplier,
    };

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_rope_fn,
        threadgroup_size, 1, 1,
        num_qk_heads / num_simdgroups, num_tokens, 1,
        sizeof(args), &args,
        3,
        (const struct gptoss_metal_buffer *[]) {activations_buffer, kv_buffer, control_buffer},
        (const size_t[]) {activations_offset, kv_offset, control_offset},
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_expert_routing_metadata(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* expert_routing_metadata_fn,
    const struct gptoss_metal_buffer* expert_predictions_buffer,
    size_t expert_predictions_offset,
    const struct gptoss_metal_buffer* expert_offsets_buffer,
    size_t expert_offsets_offset,
    const struct gptoss_metal_buffer* intra_expert_offsets_buffer,
    size_t intra_expert_offsets_offset,
    uint32_t num_tokens,
    uint32_t num_experts)
{
    if (command_buffer->object == NULL || expert_routing_metadata_fn->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }
    
    const struct gptoss_expert_routing_metadata_args args = {
        .tokens = num_tokens,
        .num_experts = num_experts,
    };
    const uint32_t threadgroup_size = 256;
    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, expert_routing_metadata_fn,
        threadgroup_size, 1, 1,
        /*num_threadgroups_x=*/1, /*num_threadgroups_y=*/1, /*num_threadgroups_z=*/1,
        sizeof(args), &args,
        3,
        (const struct gptoss_metal_buffer *[]) {expert_predictions_buffer, expert_offsets_buffer, intra_expert_offsets_buffer},
        (const size_t[]) {expert_predictions_offset, expert_offsets_offset, intra_expert_offsets_offset},
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_scatter(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_scatter_fn,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* expert_predictions_buffer,
    size_t expert_predictions_offset,
    const struct gptoss_metal_buffer* expert_offsets_buffer,
    size_t expert_offsets_offset,
    const struct gptoss_metal_buffer* intra_expert_offsets_buffer,
    size_t intra_expert_offsets_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    uint32_t num_channels,
    uint32_t num_tokens,
    uint32_t num_active_experts)
{
    if (command_buffer->object == NULL || f32_scatter_fn->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }

    if (num_channels % 4 != 0) {
        return gptoss_status_invalid_argument;
    }

    const size_t num_vecs = num_channels / 4;
    const size_t tgx = math_min(num_vecs, 64);
    const size_t tgy = 1;
    const size_t tgz = 1;
    const size_t grid_x = math_ceil_div(num_vecs, tgx);
    const size_t grid_y = num_tokens;
    const size_t grid_z = 1;
    const size_t total_threadgroup_size = tgx * tgy * tgz;
    if (total_threadgroup_size > f32_scatter_fn->max_threadgroup_threads) {
        return gptoss_status_invalid_argument;
    }
    const struct gptoss_scatter_args args = {
        .tokens = num_tokens,
        .active_experts_per_token = num_active_experts,
        .token_stride = num_channels,
    };

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_scatter_fn,
        tgx, tgy, tgz,
        grid_x, grid_y, grid_z,
        sizeof(args), &args,
        5,
        (const struct gptoss_metal_buffer *[]) {input_buffer, expert_predictions_buffer, expert_offsets_buffer, intra_expert_offsets_buffer, output_buffer},
        (const size_t[]) {input_offset, expert_predictions_offset, expert_offsets_offset, intra_expert_offsets_offset, output_offset},
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_gather_and_accumulate_e4(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_gather_and_accumulate_e4_fn,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* expert_predictions_buffer,
    size_t expert_predictions_offset,
    const struct gptoss_metal_buffer* expert_offsets_buffer,
    size_t expert_offsets_offset,
    const struct gptoss_metal_buffer* intra_expert_offsets_buffer,
    size_t intra_expert_offsets_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    uint32_t num_channels,
    uint32_t num_tokens,
    uint32_t num_active_experts) 
{
        if (command_buffer->object == NULL || f32_gather_and_accumulate_e4_fn->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }

    if (num_channels % 4 != 0) {
        return gptoss_status_invalid_argument;
    }

    const size_t num_vecs = num_channels / 4;
    const size_t tgx = math_min(num_vecs, 64);
    const size_t tgy = 1;
    const size_t tgz = 1;
    const size_t grid_x = math_ceil_div(num_vecs, tgx);
    const size_t grid_y = num_tokens;
    const size_t grid_z = 1;
    const size_t total_threadgroup_size = tgx * tgy * tgz;
    if (total_threadgroup_size > f32_gather_and_accumulate_e4_fn->max_threadgroup_threads) {
        return gptoss_status_invalid_argument;
    }
    const struct gptoss_gather_args args = {
        .tokens = num_tokens,
        .active_experts_per_token = num_active_experts,
        .token_stride = num_channels,
    };
    
    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_gather_and_accumulate_e4_fn,
        tgx, tgy, tgz,
        grid_x, grid_y, grid_z,
        sizeof(args), &args,
        5,
        (const struct gptoss_metal_buffer *[]) {input_buffer, expert_predictions_buffer, expert_offsets_buffer, intra_expert_offsets_buffer, output_buffer},
        (const size_t[]) {input_offset, expert_predictions_offset, expert_offsets_offset, intra_expert_offsets_offset, output_offset},
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_mf4w_moe_dense_matmul_swiglu(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_mf4w_moe_dense_matmul_swiglu_fn,
    const struct gptoss_metal_buffer* expert_offsets_buffer,
    size_t expert_offsets_offset,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_block_buffer,
    size_t weight_block_offset,
    const struct gptoss_metal_buffer* weight_scale_buffer,
    size_t weight_scale_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    float swiglu_limit,
    uint32_t expert_stride_bytes,
    uint32_t num_tokens,
    uint32_t num_experts,
    uint32_t num_cols,
    uint32_t num_rows)
{
    if (command_buffer->object == NULL || f32_mf4w_moe_dense_matmul_swiglu_fn->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }

    if (num_cols % 32 != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_dense_matmul_swiglu kernel launch: number of columns (%" PRIu32 ") is not divisible by 32",
            num_cols);
        return gptoss_status_invalid_argument;
    }

    const struct gptoss_moe_dense_matmul_swiglu_args args = {
        .n = num_rows,
        .k = num_cols,
        .weight_blocks_expert_stride_bytes = expert_stride_bytes,
        .weight_scales_expert_stride_bytes = expert_stride_bytes,
        .bias_expert_stride_bytes = expert_stride_bytes,
        .swiglu_min = -swiglu_limit,
        .swiglu_max = swiglu_limit,
    };
    const size_t threads_per_simdgroup = f32_mf4w_moe_dense_matmul_swiglu_fn->simdgroup_threads;
    const uint32_t m = num_tokens;
    const uint32_t n = args.n;
    const uint32_t k = args.k;
    const uint32_t Bm = MOE_DENSE_MATMUL_SWIGLU_Bm;
    const uint32_t Bn = MOE_DENSE_MATMUL_SWIGLU_Bn;
    const uint32_t Bk = MOE_DENSE_MATMUL_SWIGLU_Bk;
    const uint32_t Sg_Bm = MOE_DENSE_MATMUL_SWIGLU_Sg_Bm;
    const uint32_t Sg_Bn = MOE_DENSE_MATMUL_SWIGLU_Sg_Bn;
    if (Bm % Sg_Bm != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_dense_matmul_swiglu kernel launch: Bm (%" PRIu32 ") is not divisible by Sg_Bm (%" PRIu32 ")",
            Bm, Sg_Bm);
        return gptoss_status_invalid_argument;
    }
    if (Bn % Sg_Bn != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_dense_matmul_swiglu kernel launch: Bn (%" PRIu32 ") is not divisible by Sg_Bn (%" PRIu32 ")",
            Bn, Sg_Bn);
        return gptoss_status_invalid_argument;
    }

    const size_t threadgroup_size_x = (Bm / Sg_Bm) * (Bn / Sg_Bn) * threads_per_simdgroup;
    const size_t threadgroup_size_y = 1;
    const size_t threadgroup_size_z = 1;
    const size_t total_threadgroup_size = threadgroup_size_x * threadgroup_size_y * threadgroup_size_z;
    if (total_threadgroup_size > f32_mf4w_moe_dense_matmul_swiglu_fn->max_threadgroup_threads) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_dense_matmul_swiglu kernel launch: total threadgroup size (%zu) exceeds supported maximum (%zu)",
            total_threadgroup_size, f32_mf4w_moe_dense_matmul_swiglu_fn->max_threadgroup_threads);
        return gptoss_status_invalid_argument;
    }
    if (n % Bn != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_dense_matmul_swiglu kernel launch: n (%" PRIu32 ") is not divisible by Bn (%" PRIu32 ")",
            n, Bn);
        return gptoss_status_invalid_argument;
    }
    if (k % Bk != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_dense_matmul_swiglu kernel launch: k (%" PRIu32 ") is not divisible by Bk (%" PRIu32 ")",
            k, Bk);
        return gptoss_status_invalid_argument;
    }
    const size_t grid_x = n / Bn;
    const size_t grid_y = math_ceil_div(m, Bm);
    const size_t grid_z = num_experts;

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_mf4w_moe_dense_matmul_swiglu_fn,
        threadgroup_size_x, threadgroup_size_y, threadgroup_size_z,
        grid_x, grid_y, grid_z,
        sizeof(args), &args,
        6,
        (const struct gptoss_metal_buffer *[]) {expert_offsets_buffer, input_buffer, weight_block_buffer, weight_scale_buffer, bias_buffer, output_buffer},
        (const size_t[]) {expert_offsets_offset, input_offset, weight_block_offset, weight_scale_offset, bias_offset, output_offset},
        /*threadgroup_buffer_size=*/0);

    }

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_mf4w_moe_dense_matmul(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_mf4w_moe_dense_matmul_fn,
    const struct gptoss_metal_buffer* expert_offsets_buffer,
    size_t expert_offsets_offset,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* weight_block_buffer,
    size_t weight_block_offset,
    const struct gptoss_metal_buffer* weight_scale_buffer,
    size_t weight_scale_offset,
    const struct gptoss_metal_buffer* bias_buffer,
    size_t bias_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    uint32_t expert_stride_bytes,
    uint32_t num_tokens,
    uint32_t num_experts,
    uint32_t num_cols,
    uint32_t num_rows)
{
    if (command_buffer->object == NULL || f32_mf4w_moe_dense_matmul_fn->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }

    if (num_cols % 32 != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_dense_matmul kernel launch: number of columns (%" PRIu32 ") is not divisible by 32",
            num_cols);
        return gptoss_status_invalid_argument;
    }
    const struct gptoss_moe_dense_matmul_args args = {
        .k = num_cols,
        .n = num_rows,
        .weight_blocks_expert_stride_bytes = expert_stride_bytes,
        .weight_scales_expert_stride_bytes = expert_stride_bytes,
        .bias_expert_stride_bytes = expert_stride_bytes,
    };

    const size_t threads_per_simdgroup = f32_mf4w_moe_dense_matmul_fn->simdgroup_threads;
    const uint32_t m = num_tokens;
    const uint32_t n = args.n;
    const uint32_t k = args.k;
    const uint32_t Bm = MOE_DENSE_MATMUL_Bm;
    const uint32_t Bn = MOE_DENSE_MATMUL_Bn;
    const uint32_t Bk = MOE_DENSE_MATMUL_Bk;
    const uint32_t Sg_Bm = MOE_DENSE_MATMUL_Sg_Bm;
    const uint32_t Sg_Bn = MOE_DENSE_MATMUL_Sg_Bn;
    if (Bm % Sg_Bm != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_dense_matmul kernel launch: Bm (%" PRIu32 ") is not divisible by Sg_Bm (%" PRIu32 ")",
            Bm, Sg_Bm);
        return gptoss_status_invalid_argument;
    }
    if (Bn % Sg_Bn != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_dense_matmul kernel launch: Bn (%" PRIu32 ") is not divisible by Sg_Bn (%" PRIu32 ")",
            Bn, Sg_Bn);
        return gptoss_status_invalid_argument;
    }

    const size_t threadgroup_size_x = (Bm / Sg_Bm) * (Bn / Sg_Bn) * threads_per_simdgroup;
    const size_t threadgroup_size_y = 1;
    const size_t threadgroup_size_z = 1;
    const size_t total_threadgroup_size = threadgroup_size_x * threadgroup_size_y * threadgroup_size_z;
    if (total_threadgroup_size > f32_mf4w_moe_dense_matmul_fn->max_threadgroup_threads) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_dense_matmul kernel launch: total threadgroup size (%zu) exceeds supported maximum (%zu)",
            total_threadgroup_size, f32_mf4w_moe_dense_matmul_fn->max_threadgroup_threads);
        return gptoss_status_invalid_argument;
    }
    if (n % Bn != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_dense_matmul kernel launch: n (%" PRIu32 ") is not divisible by Bn (%" PRIu32 ")",
            n, Bn);
        return gptoss_status_invalid_argument;
    }
    if (k % Bk != 0) {
        GPTOSS_LOG_ERROR("failed to encode f32_mf4w_moe_dense_matmul kernel launch: k (%" PRIu32 ") is not divisible by Bk (%" PRIu32 ")",
            k, Bk);
        return gptoss_status_invalid_argument;
    }

    const size_t grid_y = math_ceil_div(m, Bm);
    const size_t grid_x = n / Bn;
    const size_t grid_z = num_experts;

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_mf4w_moe_dense_matmul_fn,
        threadgroup_size_x, threadgroup_size_y, threadgroup_size_z,
        grid_x, grid_y, grid_z,
        sizeof(args), &args,
        6,
        (const struct gptoss_metal_buffer *[]) {expert_offsets_buffer, input_buffer, weight_block_buffer, weight_scale_buffer, bias_buffer, output_buffer},
        (const size_t[]) {expert_offsets_offset, input_offset, weight_block_offset, weight_scale_offset, bias_offset, output_offset},
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_accumulate(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_accumulate_fn,
    size_t threadgroup_size,
    size_t max_threadgroups,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* expert_buffer,
    size_t expert_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_channels,
    uint32_t num_tokens,
    uint32_t num_experts)
{
    if (command_buffer->object == NULL || f32_accumulate_fn->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }

    if (num_channels% 4 != 0) {
        return gptoss_status_invalid_argument;
    }

    if (threadgroup_size == 0) {
        threadgroup_size = f32_accumulate_fn->max_threadgroup_threads;
    } else if (threadgroup_size > f32_accumulate_fn->max_threadgroup_threads) {
        return gptoss_status_invalid_argument;
    }

    const size_t num_vecs = num_channels / 4;
    const size_t num_vecs_per_expert = num_vecs * num_tokens;
    const size_t num_vecs_per_threadgroup = math_ceil_div(num_vecs, max_threadgroups * threadgroup_size) * threadgroup_size;
    const size_t num_threadgroups = math_min(max_threadgroups, math_ceil_div(num_vecs, num_vecs_per_threadgroup));
    const struct gptoss_accumulate_args args = {
        .num_vecs_per_expert = num_vecs_per_expert,
        .num_vecs_per_threadgroup = num_vecs_per_threadgroup,
        .num_vecs = num_vecs,
    };

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_accumulate_fn,
        threadgroup_size, 1, 1,
        num_threadgroups, num_tokens, 1,
        sizeof(args), &args,
        4,
        (const struct gptoss_metal_buffer *[]) {input_buffer, expert_buffer, output_buffer, control_buffer},
        (const size_t[]) {input_offset, expert_offset, output_offset, control_offset},
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_topk(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_topk_fn,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_tokens,
    uint32_t num_experts,
    uint32_t num_active_experts)
{
    if (command_buffer->object == NULL || f32_topk_fn->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }

    if (num_experts != 32  && num_experts != 128) {
        return gptoss_status_invalid_argument;
    }

    if (num_active_experts != 4) {
        return gptoss_status_invalid_argument;
    }

    const struct gptoss_topk_args args = { 0 };

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_topk_fn,
        /*threadgroup_size=*/32, 1, 1,
        num_tokens, 1, 1,
        sizeof(args), &args,
        3,
        (const struct gptoss_metal_buffer *[]) {input_buffer, output_buffer, control_buffer},
        (const size_t[]) {input_offset, output_offset, control_offset},
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_sdpa(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_sdpa_fn,
    const struct gptoss_metal_buffer* q_buffer,
    size_t q_offset,
    const struct gptoss_metal_buffer* kv_buffer,
    size_t kv_offset,
    const struct gptoss_metal_buffer* s_buffer,
    size_t s_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t window,
    uint32_t kv_stride,
    uint32_t num_q_tokens,
    uint32_t num_kv_tokens,
    uint32_t num_q_heads,
    uint32_t num_kv_heads,
    uint32_t head_dim)
{
    if (command_buffer->object == NULL || f32_sdpa_fn->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }

    if (num_q_heads != num_kv_heads * 8) {
        GPTOSS_LOG_ERROR("number of Q heads (%" PRIu32 ") must be 8 times the number of KV heads (%" PRIu32 ")",
            num_q_heads, num_kv_heads);
        return gptoss_status_invalid_argument;
    }

    if (head_dim != 64) {
        GPTOSS_LOG_ERROR("attention head dimension (%" PRIu32 ") must be 64", head_dim);
        return gptoss_status_invalid_argument;
    }

    const size_t max_context_tokens = math_min(num_q_tokens + num_kv_tokens + 1, window);
    const size_t threadgroup_size = math_min(f32_sdpa_fn->max_threadgroup_threads,
        max_context_tokens * f32_sdpa_fn->simdgroup_threads);
    const size_t half_threadgroup_size = math_round_down_po2(threadgroup_size / 2, f32_sdpa_fn->simdgroup_threads);

    const struct gptoss_sdpa_args args = {
        .qkv_dim = head_dim * (num_q_heads + 2 * num_kv_heads),
        .num_kv_tokens = num_kv_tokens,
        .kv_stride = kv_stride,
        .window = window,
    };

    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_sdpa_fn,
        threadgroup_size, 1, 1,
        num_q_tokens, num_kv_heads, 1,
        sizeof(args), &args,
        5,
        (const struct gptoss_metal_buffer *[]) {q_buffer, kv_buffer, s_buffer, output_buffer, control_buffer},
        (const size_t[]) {q_offset, kv_offset, s_offset, output_offset, control_offset},
        /*threadgroup_buffer_size=*/half_threadgroup_size * 8 * 4 * sizeof(float));
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_softmax(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_softmax_fn,
    size_t threadgroup_size,
    size_t max_threadgroups,
    const struct gptoss_metal_buffer* score_buffer,
    size_t score_offset,
    const struct gptoss_metal_buffer* argmax_buffer,
    size_t argmax_offset,
    const struct gptoss_metal_buffer* prob_buffer,
    size_t prob_offset,
    const struct gptoss_metal_buffer* sum_buffer,
    size_t sum_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint32_t num_channels,
    uint32_t num_tokens,
    float temperature,
    uint32_t* num_threadgroups_out,
    uint32_t* num_channels_per_threadgroup_out)
{
    *num_threadgroups_out = 0;
    *num_channels_per_threadgroup_out = 0;
    if (command_buffer->object == NULL || f32_softmax_fn->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }

    const size_t num_vecs = num_channels;
    const size_t num_vecs_per_threadgroup = math_ceil_div(num_vecs, max_threadgroups * threadgroup_size) * threadgroup_size;
    const size_t num_threadgroups = math_min(max_threadgroups, math_ceil_div(num_vecs, num_vecs_per_threadgroup));
    const struct gptoss_softmax_args args = {
        .num_vecs = num_vecs,
        .num_vecs_per_threadgroup = num_vecs_per_threadgroup,
        .max_threadgroups = max_threadgroups,
        .temperature = temperature,
    };

    *num_threadgroups_out = num_threadgroups;
    *num_channels_per_threadgroup_out = num_vecs_per_threadgroup;
    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_softmax_fn,
        threadgroup_size, 1, 1,
        num_threadgroups, num_tokens, 1,
        sizeof(args), &args,
        5,
        (const struct gptoss_metal_buffer *[]) {score_buffer, argmax_buffer, prob_buffer, sum_buffer, control_buffer},
        (const size_t[]) {score_offset, argmax_offset, prob_offset, sum_offset, control_offset},
        /*threadgroup_buffer_size=*/0);
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_f32_sample(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* f32_sample_fn,
    size_t min_threadgroup_size,
    const struct gptoss_metal_buffer* prob_buffer,
    size_t prob_offset,
    const struct gptoss_metal_buffer* sum_buffer,
    size_t sum_offset,
    const struct gptoss_metal_buffer* token_buffer,
    size_t token_offset,
    const struct gptoss_metal_buffer* control_buffer,
    size_t control_offset,
    uint64_t rng_seed,
    uint32_t rng_offset,
    uint32_t num_blocks,
    uint32_t num_channels,
    uint32_t num_channels_per_block)
{
    if (command_buffer->object == NULL || f32_sample_fn->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }

    if (min_threadgroup_size > f32_sample_fn->max_threadgroup_threads) {
        return gptoss_status_invalid_argument;
    }

    if (min_threadgroup_size % f32_sample_fn->simdgroup_threads != 0) {
        return gptoss_status_invalid_argument;
    }

    if (num_blocks > f32_sample_fn->max_threadgroup_threads) {
        return gptoss_status_invalid_argument;
    }

    const struct gptoss_sample_args args = {
        .rng_seed = rng_seed,
        .rng_offset = rng_offset,
        .num_blocks = num_blocks,
        .num_dims = num_channels,
        .num_dims_per_block = num_channels_per_block,
    };

    const size_t threadgroup_size = math_max(min_threadgroup_size,
        math_round_up_po2(num_blocks, f32_sample_fn->simdgroup_threads));
    return gptoss_metal_command_buffer_encode_launch_kernel(
        command_buffer, f32_sample_fn,
        threadgroup_size, 1, 1,
        1, 1, 1,
        sizeof(args), &args,
        4,
        (const struct gptoss_metal_buffer *[]) {prob_buffer, sum_buffer, token_buffer, control_buffer},
        (const size_t[]) {prob_offset, sum_offset, token_offset, control_offset},
        /*threadgroup_buffer_size=*/0);
}


================================================
FILE: gpt_oss/metal/source/metal.m
================================================
#import <Foundation/Foundation.h>
#import <Metal/Metal.h>

#include <dispatch/dispatch.h>
#include <mach-o/getsect.h>

#include <gpt-oss/types.h>

#include <internal/log.h>
#include <internal/metal.h>


static size_t gptoss_metal_device_get_core_count(id<MTLDevice> device) {
    if (!device) {
        return 0;
    }

    const uint64_t target_registry_id = [device registryID];

    io_iterator_t it = IO_OBJECT_NULL;
    const kern_return_t kr = IOServiceGetMatchingServices(
        kIOMainPortDefault,
        IOServiceMatching("IOAccelerator"),
        &it
    );
    if (kr != KERN_SUCCESS) {
        GPTOSS_LOG_ERROR("failed to find IOAccelerator objects: error %d", kr);
        return 0;
    }

    size_t result = 0;
    for (io_object_t obj = IOIteratorNext(it); obj != IO_OBJECT_NULL; obj = IOIteratorNext(it)) {
        uint64_t registry_id = 0;
        if (IORegistryEntryGetRegistryEntryID(obj, &registry_id) == KERN_SUCCESS &&
            registry_id == target_registry_id)
        {
            // Read "gpu-core-count" from this accelerator node
            const CFTypeRef value = IORegistryEntryCreateCFProperty(
                obj, CFSTR("gpu-core-count"), kCFAllocatorDefault, 0);
            if (value != NULL) {
                if (CFGetTypeID(value) == CFNumberGetTypeID()) {
                    int32_t n = -1;
                    if (CFNumberGetValue((CFNumberRef) value, kCFNumberSInt32Type, &n) && n > 0) {
                        result = (size_t) n;
                    }
                }
                CFRelease(value);
            }
            IOObjectRelease(obj);
            break;
        }
        IOObjectRelease(obj);
    }

    IOObjectRelease(it);
    return result;
}

enum gptoss_status gptoss_metal_device_create_system_default(
    struct gptoss_metal_device* device_out)
{
    id<MTLDevice> device_obj = MTLCreateSystemDefaultDevice();
    if (device_obj == nil) {
        GPTOSS_LOG_ERROR("failed to create Metal device");
        return gptoss_status_unsupported_system;
    }

    device_out->object = (void*) device_obj;
    device_out->num_cores = gptoss_metal_device_get_core_count(device_obj);
    device_out->max_buffer_size = (size_t) [device_obj maxBufferLength];
    device_out->max_threadgroup_memory = (size_t) [device_obj maxThreadgroupMemoryLength];
    const MTLSize max_threadgroup_threads = [device_obj maxThreadsPerThreadgroup];
    device_out->max_threadgroup_threads_x = (size_t) max_threadgroup_threads.width;
    device_out->max_threadgroup_threads_y = (size_t) max_threadgroup_threads.height;
    device_out->max_threadgroup_threads_z = (size_t) max_threadgroup_threads.depth;
    return gptoss_status_success;
}

enum gptoss_status gptoss_metal_device_release(
    struct gptoss_metal_device* device)
{
    if (device->object != NULL) {
        id<MTLDevice> device_obj = (id<MTLDevice>) device->object;
        [device_obj release];
    }
    memset(device, 0, sizeof(struct gptoss_metal_device));
    return gptoss_status_success;
}

extern const struct mach_header_64 __dso_handle;

enum gptoss_status gptoss_metal_library_create_default(
    const struct gptoss_metal_device* device,
    struct gptoss_metal_library* library_out)
{
    enum gptoss_status status = gptoss_status_success;
    id<MTLDevice> device_obj = (id<MTLDevice>) device->object;
    id<MTLLibrary> library_obj = nil;
    NSAutoreleasePool* autorelease_pool = nil;
    dispatch_data_t library_blob = NULL;

    unsigned long library_size = 0;
    uint8_t* library_data = getsectiondata(&__dso_handle, "__METAL", "__shaders", &library_size);
    if (library_data != NULL) {
        library_blob = dispatch_data_create(library_data, library_size, NULL, DISPATCH_DATA_DESTRUCTOR_DEFAULT);

        autorelease_pool = [[NSAutoreleasePool alloc] init];
        NSError* error_obj = nil;
        library_obj = [device_obj newLibraryWithData:library_blob error:&error_obj];
        if (library_obj == nil) {
            GPTOSS_LOG_ERROR("failed to create Metal library: %s", [[error_obj localizedDescription] UTF8String]);
            status = gptoss_status_unsupported_system;
            goto cleanup;
        }
    } else {
        // Fall-back to loading from the bundle
        library_obj = [device_obj newDefaultLibrary];
        if (library_obj == nil) {
            GPTOSS_LOG_ERROR("failed to create Metal default library");
            status = gptoss_status_unsupported_system;
            goto cleanup;
        }
    }

    *library_out = (struct gptoss_metal_library) {
        .object = (void*) library_obj,
    };

cleanup:
    if (library_blob != NULL) {
        dispatch_release(library_blob);
    }
    if (autorelease_pool != nil) {
        [autorelease_pool drain];
    }
    return status;
}

enum gptoss_status gptoss_metal_library_release(
    struct gptoss_metal_library* library)
{
    if (library->object != NULL) {
        id<MTLLibrary> library_obj = (id<MTLLibrary>) library->object;
        [library_obj release];
    }
    memset(library, 0, sizeof(struct gptoss_metal_library));
    return gptoss_status_success;
}

enum gptoss_status gptoss_metal_function_create(
    const struct gptoss_metal_library* library,
    const char* name,
    struct gptoss_metal_function* function_out)
{
    __block NSString* error_string_obj = nil;
    id<MTLFunction> function_obj = nil;
    MTLComputePipelineDescriptor* pipeline_descriptor_obj = nil;
    __block id<MTLComputePipelineState> pipeline_state_obj = nil;
    dispatch_semaphore_t pipeline_build_semaphore = NULL;
    enum gptoss_status status = gptoss_status_success;

    NSAutoreleasePool* autorelease_pool = [[NSAutoreleasePool alloc] init];
    id<MTLLibrary> library_obj = (id<MTLLibrary>) library->object;
    NSString* name_obj = [NSString stringWithUTF8String:name];
    function_obj = [library_obj newFunctionWithName:name_obj];
    if (function_obj == nil) {
        GPTOSS_LOG_ERROR("failed to create Metal function %s", name);
        status = gptoss_status_unsupported_system;
        goto cleanup;
    }
    id<MTLDevice> device_obj = [library_obj device];
    pipeline_descriptor_obj = [[MTLComputePipelineDescriptor alloc] init];
    [pipeline_descriptor_obj setComputeFunction:function_obj];
    [pipeline_descriptor_obj setThreadGroupSizeIsMultipleOfThreadExecutionWidth:YES];

    pipeline_build_semaphore = dispatch_semaphore_create(/*value=*/0);
    [device_obj newComputePipelineStateWithDescriptor:pipeline_descriptor_obj
                                              options:MTLPipelineOptionNone
                                    completionHandler:^(id<MTLComputePipelineState> _Nullable new_state,
                                                        MTLComputePipelineReflection* _Nullable reflection,
                                                        NSError* _Nullable error_obj) {
        if (new_state != nil) {
            pipeline_state_obj = [new_state retain];
        }
        if (error_obj != nil) {
            error_string_obj = [[error_obj localizedDescription] copy];
        }
        dispatch_semaphore_signal(pipeline_build_semaphore);
    }];
    dispatch_semaphore_wait(pipeline_build_semaphore, DISPATCH_TIME_FOREVER);

    if (pipeline_state_obj == nil) {
        const char* error_string = "unknown error";
        if (error_string_obj != nil) {
            error_string = [error_string_obj UTF8String];
        }
        GPTOSS_LOG_ERROR("failed to create Metal compute pipeline state for function %s: %s",
            name, error_string);
        status = gptoss_status_unsupported_system;
        goto cleanup;
    }

    // Commit
    function_out->function_object = function_obj;
    function_out->pipeline_state_object = pipeline_state_obj;
    function_out->max_threadgroup_threads = (size_t) [pipeline_state_obj maxTotalThreadsPerThreadgroup];
    function_out->simdgroup_threads = (size_t) [pipeline_state_obj threadExecutionWidth];
    function_out->static_threadgroup_memory = (size_t) [pipeline_state_obj staticThreadgroupMemoryLength];

    function_obj = nil;
    pipeline_state_obj = nil;

cleanup:
    if (function_obj != nil) {
        [function_obj release];
    }
    if (pipeline_descriptor_obj != nil) {
        [pipeline_descriptor_obj release];
    }
    if (error_string_obj != nil) {
        [error_string_obj release];
    }
    if (pipeline_build_semaphore != NULL) {
        dispatch_release(pipeline_build_semaphore);
    }
    if (autorelease_pool != nil) {
        [autorelease_pool drain];
    }
    return status;
}

enum gptoss_status gptoss_metal_function_release(
    struct gptoss_metal_function* function)
{
    if (function->pipeline_state_object != NULL) {
        id<MTLComputePipelineState> pipeline_state_obj = (id<MTLComputePipelineState>) function->pipeline_state_object;
        [pipeline_state_obj release];
    }
    if (function->function_object != NULL) {
        id<MTLFunction> function_obj = (id<MTLFunction>) function->function_object;
        [function_obj release];
    }
    memset(function, 0, sizeof(struct gptoss_metal_function));
    return gptoss_status_success;
}

enum gptoss_status gptoss_metal_buffer_create(
    const struct gptoss_metal_device* device,
    size_t size,
    const void* data,
    struct gptoss_metal_buffer* buffer_out)
{
    id<MTLDevice> device_obj = (id<MTLDevice>) device->object;
    id<MTLBuffer> buffer_obj = nil;
    if (data != NULL) {
        buffer_obj = [device_obj newBufferWithBytes:data length:size options:MTLResourceStorageModeShared];
    } else {
        buffer_obj = [device_obj newBufferWithLength:size options:MTLResourceStorageModeShared];
    }
    if (buffer_obj == nil) {
        GPTOSS_LOG_ERROR("failed to create Metal buffer of size %zu", size);
        return gptoss_status_unsupported_system;
    }
    buffer_out->object = (void*) buffer_obj;
    buffer_out->size = size;
    buffer_out->ptr = [buffer_obj contents];
    return gptoss_status_success;
}

enum gptoss_status gptoss_metal_buffer_wrap(
    const struct gptoss_metal_device* device,
    size_t size,
    const void* data,
    struct gptoss_metal_buffer* buffer_out)
{
    id<MTLDevice> device_obj = (id<MTLDevice>) device->object;
    id<MTLBuffer> buffer_obj = [device_obj newBufferWithBytesNoCopy:(void*) data length:size options:MTLResourceStorageModeShared deallocator:nil];
    if (buffer_obj == nil) {
        GPTOSS_LOG_ERROR("failed to wrap Metal buffer of size %zu", size);
        return gptoss_status_unsupported_system;
    }
    buffer_out->object = (void*) buffer_obj;
    buffer_out->size = size;
    buffer_out->ptr = (void*) data;
    return gptoss_status_success;
}

enum gptoss_status gptoss_metal_buffer_release(
    struct gptoss_metal_buffer* buffer)
{
    if (buffer->object != NULL) {
        id<MTLBuffer> buffer_obj = (id<MTLBuffer>) buffer->object;
        [buffer_obj release];
    }
    memset(buffer, 0, sizeof(struct gptoss_metal_buffer));
    return gptoss_status_success;
}

enum gptoss_status gptoss_metal_command_queue_create(
    const struct gptoss_metal_device* device,
    struct gptoss_metal_command_queue* command_queue_out)
{
    id<MTLDevice> device_obj = (id<MTLDevice>) device->object;
    id<MTLCommandQueue> command_queue_obj = [device_obj newCommandQueue];
    if (command_queue_obj == nil) {
        GPTOSS_LOG_ERROR("failed to create Metal command queue");
        return gptoss_status_unsupported_system;
    }
    command_queue_out->object = (void*) command_queue_obj;
    return gptoss_status_success;
}

enum gptoss_status gptoss_metal_command_queue_release(
    struct gptoss_metal_command_queue* command_queue)
{
    if (command_queue->object != NULL) {
        id<MTLCommandQueue> command_queue_obj = (id<MTLCommandQueue>) command_queue->object;
        [command_queue_obj release];
    }
    memset(command_queue, 0, sizeof(struct gptoss_metal_command_queue));
    return gptoss_status_success;
}

enum gptoss_status gptoss_metal_command_buffer_create(
    const struct gptoss_metal_command_queue* command_queue,
    struct gptoss_metal_command_buffer* command_buffer_out)
{
    id<MTLCommandQueue> command_queue_obj = (id<MTLCommandQueue>) command_queue->object;
    id<MTLCommandBuffer> command_buffer_obj = [command_queue_obj commandBuffer];
    if (command_buffer_obj == nil) {
        GPTOSS_LOG_ERROR("failed to create Metal command buffer");
        return gptoss_status_unsupported_system;
    }
    [command_buffer_obj retain];
    command_buffer_out->object = (void*) command_buffer_obj;
    return gptoss_status_success;
}

enum gptoss_status gptoss_metal_command_buffer_encode_fill_buffer(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_buffer* buffer,
    size_t offset,
    size_t size,
    uint8_t fill_value)
{
    if (command_buffer->object == NULL) {
        return gptoss_status_invalid_state;
    }
    if (buffer->object == NULL) {
        return gptoss_status_invalid_argument;
    }

    id<MTLCommandBuffer> command_buffer_obj = (id<MTLCommandBuffer>) command_buffer->object;
    id<MTLBuffer> buffer_obj = (id<MTLBuffer>) buffer->object;

    id<MTLBlitCommandEncoder> command_encoder_obj = [command_buffer_obj blitCommandEncoder];

    const NSRange range = NSMakeRange((NSUInteger) offset, (NSUInteger) size);
    [command_encoder_obj fillBuffer:buffer_obj range:range value:fill_value];
    [command_encoder_obj endEncoding];

    return gptoss_status_success;
}

enum gptoss_status gptoss_metal_command_buffer_encode_copy_buffer(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_buffer* input_buffer,
    size_t input_offset,
    const struct gptoss_metal_buffer* output_buffer,
    size_t output_offset,
    size_t size)
{
    if (command_buffer->object == NULL) {
        return gptoss_status_invalid_state;
    }
    if (input_buffer->object == NULL) {
        return gptoss_status_invalid_argument;
    }
    if (output_buffer->object == NULL) {
        return gptoss_status_invalid_argument;
    }

    id<MTLCommandBuffer> command_buffer_obj = (id<MTLCommandBuffer>) command_buffer->object;
    id<MTLBuffer> input_buffer_obj = (id<MTLBuffer>) input_buffer->object;
    id<MTLBuffer> output_buffer_obj = (id<MTLBuffer>) output_buffer->object;

    id<MTLBlitCommandEncoder> command_encoder_obj = [command_buffer_obj blitCommandEncoder];

    [command_encoder_obj copyFromBuffer:input_buffer_obj sourceOffset:(NSUInteger) input_offset
                         toBuffer:output_buffer_obj destinationOffset:(NSUInteger) output_offset
                         size:(NSUInteger) size];
    [command_encoder_obj endEncoding];

    return gptoss_status_success;
}

enum gptoss_status gptoss_metal_command_buffer_encode_launch_kernel(
    const struct gptoss_metal_command_buffer* command_buffer,
    const struct gptoss_metal_function* function,
    size_t threadgroup_size_x,
    size_t threadgroup_size_y,
    size_t threadgroup_size_z,
    size_t num_threadgroups_x,
    size_t num_threadgroups_y,
    size_t num_threadgroups_z,
    size_t params_size,
    const void* params,
    size_t num_device_buffers,
    const struct gptoss_metal_buffer** device_buffers,
    const size_t* device_buffer_offsets,
    size_t threadgroup_buffer_size)
{
    if (command_buffer->object == NULL || function->pipeline_state_object == NULL) {
        return gptoss_status_invalid_state;
    }

    id<MTLCommandBuffer> command_buffer_obj = (id<MTLCommandBuffer>) command_buffer->object;
    id<MTLComputePipelineState> pipeline_state_obj = (id<MTLComputePipelineState>) function->pipeline_state_object;

    id<MTLComputeCommandEncoder> command_encoder_obj = [command_buffer_obj computeCommandEncoder];

    // Set kernel arguments
    [command_encoder_obj setComputePipelineState:pipeline_state_obj];
    [command_encoder_obj setBytes:params length:params_size atIndex:0];
    for (size_t i = 0; i < num_device_buffers; ++i) {
        id<MTLBuffer> buffer_obj = (id<MTLBuffer>) device_buffers[i]->object;
        const NSUInteger offset = device_buffer_offsets == NULL ? 0 : (NSUInteger) device_buffer_offsets[i];
        [command_encoder_obj setBuffer:buffer_obj offset:offset atIndex:i + 1];
    }
    if (threadgroup_buffer_size != 0) {
        [command_encoder_obj setThreadgroupMemoryLength:threadgroup_buffer_size atIndex:0];
    }

    // Dispatch kernel
    const MTLSize threadgroup_size = MTLSizeMake(threadgroup_size_x, threadgroup_size_y, threadgroup_size_z);
    const MTLSize num_threadgroups = MTLSizeMake(num_threadgroups_x, num_threadgroups_y, num_threadgroups_z);
    [command_encoder_obj dispatchThreadgroups:num_threadgroups threadsPerThreadgroup:threadgroup_size];
    [command_encoder_obj endEncoding];

    return gptoss_status_success;
}

enum gptoss_status gptoss_metal_command_buffer_commit(
    const struct gptoss_metal_command_buffer* command_buffer)
{
    if (command_buffer->object == NULL) {
        return gptoss_status_invalid_state;
    }

    id<MTLCommandBuffer> command_buffer_obj = (id<MTLCommandBuffer>) command_buffer->object;
    [command_buffer_obj commit];
    return gptoss_status_success;
}

enum gptoss_status gptoss_metal_command_buffer_wait_completion(
    const struct gptoss_metal_command_buffer* command_buffer,
    double* elapsed_seconds)
{
    if (command_buffer->object == NULL) {
        return gptoss_status_invalid_state;
    }

    id<MTLCommandBuffer> command_buffer_obj = (id<MTLCommandBuffer>) command_buffer->object;
    [command_buffer_obj waitUntilCompleted];
    if (elapsed_seconds != NULL) {
        const CFTimeInterval start_time = [command_buffer_obj GPUStartTime];
        const CFTimeInterval end_time = [command_buffer_obj GPUEndTime];
        *elapsed_seconds = (double) end_time - (double) start_time;
    }
    return gptoss_status_success;
}

enum gptoss_status gptoss_metal_command_buffer_release(
    struct gptoss_metal_command_buffer* command_buffer)
{
    if (command_buffer->object != NULL) {
        id<MTLCommandBuffer> command_buffer_obj = (id<MTLCommandBuffer>) command_buffer->object;
        [command_buffer_obj release];
    }
    memset(command_buffer, 0, sizeof(struct gptoss_metal_command_buffer));
    return gptoss_status_success;
}


================================================
FILE: gpt_oss/metal/source/model.c
================================================
#include <assert.h>
#include <inttypes.h>
#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <errno.h>  // errno, EISDIR, ENOENT, ENOTDIR
#include <fcntl.h>  // open
#include <mach/vm_page_size.h>  // vm_page_size
#include <sys/mman.h>  // mmap, PROT_READ, MAP_PRIVATE
#include <sys/stat.h>  // fstat, stat
#include <sys/types.h>  // off_t, ssize_t
#include <unistd.h>  // close

#include <gpt-oss.h>

#include "internal/datatype.h"
#include "internal/kernel-args.h"  // gptoss_expert_prediction
#include "internal/log.h"
#include "internal/uuid.h"
#include "internal/storage.h"
#include "internal/math.h"
#include "internal/model.h"


static size_t round_up_to_page_size(size_t bytes) {
    const size_t page_size_mask = (size_t) vm_page_size - 1;
    if ((bytes & page_size_mask) != 0) {
        bytes |= page_size_mask;
        bytes += 1;
    }
    return bytes;
}

static size_t round_down_to_page_size(size_t bytes) {
    const size_t page_size_mask = (size_t) vm_page_size - 1;
    return bytes & ~page_size_mask;
}

static enum gptoss_status read_fd(int fd, void* data, size_t size, const char* path) {
    assert(fd != -1);
    assert(data != NULL);
    assert(size != 0);

    size_t bytes_to_read = size;
    char* current_byte = (char*) data;
    do {
        const ssize_t read_result = read(fd, current_byte, bytes_to_read);
        if (read_result < 0) {
            GPTOSS_LOG_ERROR("reading %zu bytes from file %s failed with error %d",
                size, path, errno);
            return gptoss_status_io_error;
        }
        current_byte += (size_t) read_result;
        bytes_to_read -= (size_t) read_result;
    } while (bytes_to_read != 0);
    return gptoss_status_success;
}

static void prefetch_fd(int fd, size_t offset, size_t size, const char* path) {
    // radvisory.ra_count is int, so we can't prefetch 2GB+ at once
    const size_t prefetch_max = round_down_to_page_size((size_t) INT_MAX);
    do {
        const size_t prefetch_size = math_min(size, prefetch_max);
        const struct radvisory ra = {
            .ra_offset = offset,
            .ra_count = (int) prefetch_size,
        };
        if (fcntl(fd, F_RDADVISE, &ra) == -1) {
            GPTOSS_LOG_WARNING("fcntl(%s, F_RDADVISE, .ra_offset=%zu, .ra_count=%d) failed with error %d\n",
                path, (size_t) ra.ra_offset, ra.ra_count, errno);
            return;
        }
        offset += prefetch_size;
        size -= prefetch_size;
    } while (size != 0);
}

enum gptoss_status GPTOSS_ABI gptoss_model_create_from_file(
    const char* path,
    gptoss_model_t* model_out)
{
    *model_out = NULL;

    enum gptoss_status status = gptoss_status_success;
    struct gptoss_model* model = NULL;
    struct gptoss_tokenizer* tokenizer = NULL;
    int fd = -1;
    size_t file_offset = 0;

    fd = open(path, O_RDONLY);
    if (fd == -1) {
        GPTOSS_LOG_ERROR("open(%s) failed with error %d", path, errno);
        switch (errno) {
            case EISDIR:
            case ENOENT:
            case ENOTDIR:
                status = gptoss_status_invalid_argument;
                break;
            default:
                status = gptoss_status_io_error;
                break;
        }
        goto cleanup;
    }

    struct gptoss_file_header file_header;
    status = read_fd(fd, &file_header, sizeof(file_header), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(file_header);

    if (file_header.magic[0] != 'G' ||
        file_header.magic[1] != 'P' ||
        file_header.magic[2] != 'T' ||
        file_header.magic[3] != '-' ||
        file_header.magic[4] != 'O' ||
        file_header.magic[5] != 'S' ||
        file_header.magic[6] != 'S' ||
        file_header.magic[7] != ' ' ||
        file_header.magic[8] != 'v' ||
        file_header.magic[9] != '1' ||
        file_header.magic[10] != '.' ||
        file_header.magic[11] != '0' ||
        file_header.zero != 0)
    {
        GPTOSS_LOG_ERROR("invalid magic in file %s", path);
        status = gptoss_status_invalid_argument;
        goto cleanup;
    }

    struct gptoss_uuid model_uuid;
    status = read_fd(fd, &model_uuid, sizeof(model_uuid), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(model_uuid);

    if (!gptoss_is_gptoss_model_uuid(&model_uuid)) {
        GPTOSS_LOG_ERROR("unsupported model UUID " UUID_FORMAT, UUID_ARGS(model_uuid));
        status = gptoss_status_invalid_argument;
        goto cleanup;
    }

    struct gptoss_gptoss_model_header model_header;
    status = read_fd(fd, &model_header, sizeof(model_header), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(model_header);

    struct gptoss_uuid layout_uuid;
    status = read_fd(fd, &layout_uuid, sizeof(layout_uuid), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(layout_uuid);

    if (!gptoss_is_applegpu_layout_uuid(&layout_uuid)) {
        GPTOSS_LOG_ERROR("unsupported layout UUID " UUID_FORMAT, UUID_ARGS(layout_uuid));
        status = gptoss_status_invalid_argument;
        goto cleanup;
    }

    const size_t model_size = sizeof(struct gptoss_model) + model_header.num_blocks * sizeof(struct gptoss_metal_buffer);
    model = malloc(model_size);
    if (model == NULL) {
        GPTOSS_LOG_ERROR("failed to allocate %zu bytes for model descriptor", model_size);
        status = gptoss_status_insufficient_memory;
        goto cleanup;
    }
    memset(model, 0, model_size);

    atomic_store_explicit(&model->ref_count, 1, memory_order_relaxed);
    model->context_length = model_header.context_length;
    model->num_blocks = model_header.num_blocks;
    model->num_experts = model_header.num_experts;
    model->num_active_experts = model_header.num_active_experts;
    model->embedding_dim = model_header.embedding_dim;
    model->mlp_dim = model_header.mlp_dim;
    model->swiglu_limit = model_header.swiglu_limit;
    model->head_dim = model_header.head_dim;
    model->num_heads = model_header.num_heads;
    model->num_kv_heads = model_header.num_kv_heads;
    model->attention_window = model_header.attention_window;
    model->rope_theta = model_header.rope_theta;
    model->interpolation_scale = model_header.interpolation_scale;
    model->yarn_offset = model_header.yarn_offset;
    model->yarn_scale = model_header.yarn_scale;
    model->yarn_multiplier = model_header.yarn_multiplier;
    model->rmsnorm_epsilon = model_header.rmsnorm_epsilon;

    struct gptoss_uuid tokenizer_uuid;
    status = read_fd(fd, &tokenizer_uuid, sizeof(tokenizer_uuid), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(tokenizer_uuid);

    if (!gptoss_is_tiktoken_tokenizer_uuid(&tokenizer_uuid)) {
        GPTOSS_LOG_ERROR("unsupported tokenizer UUID " UUID_FORMAT, UUID_ARGS(tokenizer_uuid));
        status = gptoss_status_invalid_argument;
        goto cleanup;
    }

    struct gptoss_tiktoken_tokenizer_header tokenizer_header;
    status = read_fd(fd, &tokenizer_header, sizeof(tokenizer_header), path);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    file_offset += sizeof(tokenizer_header);

    tokenizer = malloc(sizeof(struct gptoss_tokenizer));
    if (tokenizer == NULL) {
        GPTOSS_LOG_ERROR("failed to allocate %zu bytes for tokenizer descriptor", sizeof(struct gptoss_tokenizer));
        status = gptoss_status_insufficient_memory;
        goto cleanup;
    }
    memset(tokenizer, 0, sizeof(struct gptoss_tokenizer));
    // Initialize all special token IDs to UINT32_MAX (0xFF in all bytes)
    memset(tokenizer->special_token_id, 0xFF, sizeof(tokenizer->special_token_id));

    atomic_store_explicit(&tokenizer->ref_count, 1, memory_order_relaxed);
    tokenizer->num_special_tokens = tokenizer_header.num_special_tokens;
    tokenizer->num_text_tokens = tokenizer_header.num_text_tokens;
    model->vocabulary_size = tokenizer_header.num_special_tokens + tokenizer_header.num_text_tokens;
    for (uint32_t t = 0; t < tokenizer_header.num_special_tokens; t++) {
        struct gptoss_uuid token_uuid;
        status = read_fd(fd, &token_uuid, sizeof(token_uuid), path);
        if (status != gptoss_status_success) {
            goto cleanup;
        }
        file_offset += sizeof(token_uuid);

        const enum gptoss_special_token token = gptoss_special_token_decode_uuid(&token_uuid);
        if (token != gptoss_special_token_invalid) {
            tokenizer->special_token_id[token - 1] = tokenizer_header.num_text_tokens + t;
        }
    }

    const size_t tokenizer_start_offset = file_offset;
    const size_t tokenizer_end_offset = tokenizer_start_offset + tokenizer_header.regex_size + tokenizer_header.tokens_size;
    const size_t tokenizer_mapping_start = round_down_to_page_size(tokenizer_start_offset);
    const size_t tokenizer_mapping_size = round_up_to_page_size(tokenizer_end_offset) - tokenizer_mapping_start;
    void* tokenizer_mapping_ptr = mmap(NULL, tokenizer_mapping_size, PROT_READ, MAP_PRIVATE, fd, tokenizer_mapping_start);
    if (tokenizer_mapping_ptr == (void*) -1) {
        GPTOSS_LOG_ERROR("failed to mmap(%s) tokenizer at offset %zu size %zu",
            path, tokenizer_mapping_start, tokenizer_mapping_size);
        status = gptoss_status_io_error;
        goto cleanup;
    }
    tokenizer->mapping_ptr = tokenizer_mapping_ptr;
    tokenizer->mapping_size = tokenizer_mapping_size;
    tokenizer->regex_ptr = (const char*) tokenizer_mapping_ptr + (tokenizer_start_offset - tokenizer_mapping_start);
    tokenizer->tokens_ptr = tokenizer->regex_ptr + tokenizer_header.regex_size;

    if (madvise(tokenizer_mapping_ptr, tokenizer_mapping_size, MADV_RANDOM | MADV_WILLNEED) != 0) {
        GPTOSS_LOG_WARNING("madvise(%s, size=%zu) failed with error %d", path, tokenizer_mapping_size, errno);
    }

    prefetch_fd(fd, tokenizer_mapping_start, tokenizer_mapping_size, path);

    struct stat model_stat = {0};
    int stat_result = fstat(fd, &model_stat);
    if (stat_result != 0) {
        GPTOSS_LOG_ERROR("stat(%s) failed with error %d", path, errno);
        status = gptoss_status_io_error;
        goto cleanup;
    }

    const size_t model_mapping_start = round_up_to_page_size(tokenizer_end_offset);
    const size_t model_mapping_size = round_up_to_page_size((size_t) model_stat.st_size) - model_mapping_start;
    void* model_mapping_ptr = mmap(NULL, model_mapping_size, PROT_READ, MAP_PRIVATE, fd, model_mapping_start);
    if (model_mapping_ptr == (void*) -1) {
        GPTOSS_LOG_ERROR("failed to mmap(%s) model weights at offset %zu size %zu",
            path, model_mapping_start, model_mapping_size);
        status = gptoss_status_io_error;
        goto cleanup;
    }
    model->mapping_ptr = model_mapping_ptr;
    model->mapping_size = model_mapping_size;

    if (madvise(model_mapping_ptr, model_mapping_size, MADV_SEQUENTIAL | MADV_WILLNEED) != 0) {
        GPTOSS_LOG_WARNING("madvise(%s, size=%zu) failed with error %d", path, model_mapping_size, errno);
    }

    prefetch_fd(fd, model_mapping_start, model_mapping_size, path);

    if (mlock(model_mapping_ptr, model_mapping_size) != 0) {
        GPTOSS_LOG_WARNING("mlock(%s, size=%zu) failed with error %d", path, model_mapping_size, errno);
    } else {
        model->lock_memory = true;
    }

    // Initialize Metal
    status = gptoss_metal_device_create_system_default(&model->device);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    model->max_threadgroups = model->device.num_cores * 3;
    status = gptoss_metal_command_queue_create(&model->device, &model->command_queue);
    if (status != gptoss_status_success) {
        goto cleanup;
    }

    // Metal kernels
    status = gptoss_metal_library_create_default(&model->device, &model->library);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_bf16_f32_embeddings", &model->bf16_f32_embeddings_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_bf16w_rmsnorm", &model->f32_bf16w_rmsnorm_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_bf16w_matmul", &model->f32_bf16w_matmul_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_bf16w_matmul_qkv", &model->f32_bf16w_matmul_qkv_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_bf16w_dense_matmul_qkv", &model->f32_bf16w_dense_matmul_qkv_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_bf16w_dense_matmul_attn_output", &model->f32_bf16w_dense_matmul_attn_output_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_bf16w_dense_matmul_mlp_gate", &model->f32_bf16w_dense_matmul_mlp_gate_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_bf16w_unembedding", &model->f32_bf16w_unembedding_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_rope", &model->f32_rope_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_expert_routing_metadata", &model->f32_expert_routing_metadata_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_scatter_e4", &model->f32_scatter_e4_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_mf4w_moe_dense_matmul_swiglu", &model->f32_mf4w_moe_dense_matmul_swiglu_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_mf4w_moe_dense_matmul", &model->f32_mf4w_moe_dense_matmul_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_gather_and_accumulate_e4", &model->f32_gather_and_accumulate_e4_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_mf4w_moe_matmul_swiglu", &model->f32_mf4w_moe_matmul_swiglu_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_mf4w_moe_matmul", &model->f32_mf4w_moe_matmul_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_accumulate_e4", &model->f32_accumulate_e4_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_topk_softmax_e32_k4", &model->f32_topk_softmax_e32_k4_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_topk_softmax_e128_k4", &model->f32_topk_softmax_e128_k4_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_softmax", &model->f32_softmax_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_sample", &model->f32_sample_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }
    status = gptoss_metal_function_create(&model->library, "gptoss_f32_sdpa_q8_d64", &model->f32_sdpa_q8_d64_fn);
    if (status != gptoss_status_success) {
        goto cleanup;
    }

    // Kernel launch parameters
    model->embeddings_threadgroup_size = 512;
    model->attn_qkv_threadgroup_size = 1024;
    model->attn_out_threadgroup_size = 768;
    model->mlp_gate_threadgroup_size = 256;
    model->mlp_swiglu_threadgroup_size = 192;
    model->mlp_out_threadgroup_size = 192;
    model->mlp_acc_threadgroup_size = 768;
    model->unembedding_threadgroup_size = 416;

    // Weight buffers
    const char* current_ptr = (const char*) model->mapping_ptr;

    const size_t embedding_weight_size = math_round_up_po2(model->vocabulary_size * model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->attn_rmsnorm_gain_offset = embedding_weight_size;
    const size_t rmsnorm_weight_size = math_round_up_po2(model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->attn_qkv_weight_offset = model->attn_rmsnorm_gain_offset + rmsnorm_weight_size;
    const size_t attn_qkv_dim = model->head_dim * (model->num_heads + 2 * model->num_kv_heads);
    const size_t attn_qkv_weight_size = math_round_up_po2(attn_qkv_dim * model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->attn_qkv_bias_offset = model->attn_qkv_weight_offset + attn_qkv_weight_size;
    const size_t attn_qkv_bias_size = math_round_up_po2(attn_qkv_dim * sizeof(gptoss_bfloat16), 16);
    model->attn_sdpa_sink_offset = model->attn_qkv_bias_offset + attn_qkv_bias_size;
    const size_t attn_sink_weight_size = math_round_up_po2(model->num_heads * sizeof(gptoss_bfloat16), 16);
    model->attn_out_weight_offset = model->attn_sdpa_sink_offset + attn_sink_weight_size;
    const size_t attn_out_weight_size = math_round_up_po2(model->embedding_dim * model->num_heads * model->head_dim * sizeof(gptoss_bfloat16), 16);
    model->attn_out_bias_offset = model->attn_out_weight_offset + attn_out_weight_size;
    const size_t attn_out_bias_size = math_round_up_po2(model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->mlp_rmsnorm_gain_offset = model->attn_out_bias_offset + attn_out_bias_size;
    model->mlp_gate_weight_offset = model->mlp_rmsnorm_gain_offset + rmsnorm_weight_size;
    const size_t mlp_gate_weight_size = math_round_up_po2(model->num_experts * model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->mlp_gate_bias_offset = model->mlp_gate_weight_offset + mlp_gate_weight_size;
    const size_t mlp_gate_bias_size = math_round_up_po2(model->num_experts * sizeof(gptoss_bfloat16), 16);
    const size_t per_block_shared_weights_size =
        rmsnorm_weight_size + attn_qkv_weight_size + attn_qkv_bias_size + attn_sink_weight_size + attn_out_weight_size + attn_out_bias_size +
        rmsnorm_weight_size + mlp_gate_weight_size + mlp_gate_bias_size;
    model->rmsnorm_weight_offset = embedding_weight_size + model->num_blocks * per_block_shared_weights_size;
    model->unembedding_weight_offset = model->rmsnorm_weight_offset + rmsnorm_weight_size;
    const size_t unembedding_weight_size = math_round_up_po2(model->vocabulary_size * model->embedding_dim * sizeof(gptoss_bfloat16), 16);

    model->per_block_shared_weights_size = per_block_shared_weights_size;
    const size_t shared_weights_size =
        round_up_to_page_size(embedding_weight_size + rmsnorm_weight_size + unembedding_weight_size + model->num_blocks * per_block_shared_weights_size);

    status = gptoss_metal_buffer_wrap(&model->device, shared_weights_size, current_ptr, &model->shared_weight_buffer);
    if (status != gptoss_status_success) {
        GPTOSS_LOG_ERROR("failed to map expert-shared weight of size %zu onto a Metal buffer", shared_weights_size);
        goto cleanup;
    }
    current_ptr += shared_weights_size;
    model->weights_size += shared_weights_size;

    const size_t mlp_swiglu_weight_block_size = math_round_up_po2(2 * model->mlp_dim * model->embedding_dim / 2, 16);
    model->mlp_swiglu_scale_offset = mlp_swiglu_weight_block_size;
    const size_t mlp_swiglu_weight_scale_size = math_round_up_po2(2 * model->mlp_dim * model->embedding_dim / 32, 16);
    model->mlp_swiglu_bias_offset = model->mlp_swiglu_scale_offset + mlp_swiglu_weight_scale_size;
    const size_t mlp_swiglu_bias_size = math_round_up_po2(2 * model->mlp_dim * sizeof(gptoss_bfloat16), 16);
    model->mlp_out_block_offset = model->mlp_swiglu_bias_offset + mlp_swiglu_bias_size;
    const size_t mlp_out_weight_block_size = math_round_up_po2(model->embedding_dim * model->mlp_dim / 2, 16);
    model->mlp_out_scale_offset = model->mlp_out_block_offset + mlp_out_weight_block_size;
    const size_t mlp_out_weight_scale_size = math_round_up_po2(model->embedding_dim * model->mlp_dim / 32, 16);
    model->mlp_out_bias_offset = model->mlp_out_scale_offset + mlp_out_weight_scale_size;
    const size_t mlp_out_bias_size = math_round_up_po2(model->embedding_dim * sizeof(gptoss_bfloat16), 16);
    model->per_expert_block_weight_size =
        mlp_swiglu_weight_block_size + mlp_swiglu_weight_scale_size + mlp_swiglu_bias_size + mlp_out_weight_block_size + mlp_out_weight_scale_size + mlp_out_bias_size;
    const size_t moe_block_weight_size = round_up_to_page_size(model->num_experts * model->per_expert_block_weight_size);
    for (uint32_t n = 0; n < model->num_blocks; n++) {
        status = gptoss_metal_buffer_wrap(&model->device, moe_block_weight_size, current_ptr, &model->block_weight_buffers[n]);
        if (status != gptoss_status_success) {
            GPTOSS_LOG_ERROR("failed to map block #%" PRIu32 " MoE weight of size %zu onto a Metal buffer",
                n, moe_block_weight_size);
            goto cleanup;
        }
        current_ptr += moe_block_weight_size;
        model->weights_size += moe_block_weight_size;
    }

    // Commit tokenizer
    model->tokenizer = tokenizer;
    tokenizer = NULL;

    // Commit model
    *model_out = model;
    model = NULL;

cleanup:
    if (fd != -1) {
        close(fd);
        fd = -1;
    }
    gptoss_model_release(model);  // does nothing if model is NULL
    gptoss_tokenizer_release(tokenizer);  // does nothing if tokenizer is NULL
    return status;
}

enum gptoss_status GPTOSS_ABI gptoss_model_get_tokenizer(
    gptoss_model_t model,
    gptoss_tokenizer_t* tokenizer_out)
{
    gptoss_tokenizer_t tokenizer = model->tokenizer;
    atomic_fetch_add_explicit(&tokenizer->ref_count, 1, memory_order_relaxed);
    *tokenizer_out = tokenizer;
    return gptoss_status_success;
}

enum gptoss_status GPTOSS_ABI gptoss_model_get_max_context_length(
    gptoss_model_t model,
    size_t* max_context_length_out)
{
    *max_context_length_out = model->context_length;
    return gptoss_status_success;
}

enum gptoss_status GPTOSS_ABI gptoss_model_retain(
    gptoss_model_t model)
{
    atomic_fetch_add_explicit(&model->ref_count, 1, memory_order_relaxed);
    return gptoss_status_success;
}

enum gptoss_status GPTOSS_ABI gptoss_model_release(
    gptoss_model_t model)
{
    if (model != NULL) {
        if (atomic_fetch_sub_explicit(&model->ref_count, 1, memory_order_acq_rel) == 1) {
            gptoss_tokenizer_release(model->tokenizer);

            // Weight buffers
            gptoss_metal_buffer_release(&model->shared_weight_buffer);
            for (uint32_t n = 0; n < model->num_blocks; n++) {
                gptoss_metal_buffer_release(&model->block_weight_buffers[n]);
            }

            // Metal kernels
            gptoss_metal_function_release(&model->bf16_f32_embeddings_fn);
            gptoss_metal_function_release(&model->f32_bf16w_rmsnorm_fn);
            gptoss_metal_function_release(&model->f32_bf16w_matmul_fn);
            gptoss_metal_function_release(&model->f32_bf16w_matmul_qkv_fn);
            gptoss_metal_function_release(&model->f32_bf16w_dense_matmul_qkv_fn);
            gptoss_metal_function_release(&model->f32_bf16w_dense_matmul_attn_output_fn);
            gptoss_metal_function_release(&model->f32_bf16w_dense_matmul_mlp_gate_fn);
            gptoss_metal_function_release(&model->f32_bf16w_unembedding_fn);
            gptoss_metal_function_release(&model->f32_rope_fn);
            gptoss_metal_function_release(&model->f32_expert_routing_metadata_fn);
            gptoss_metal_function_release(&model->f32_scatter_e4_fn);
            gptoss_metal_function_release(&model->f32_mf4w_moe_dense_matmul_swiglu_fn);
            gptoss_metal_function_release(&model->f32_mf4w_moe_dense_matmul_fn);
            gptoss_metal_function_release(&model->f32_gather_and_accumulate_e4_fn);
            gptoss_metal_function_release(&model->f32_mf4w_moe_matmul_swiglu_fn);
            gptoss_metal_function_release(&model->f32_mf4w_moe_matmul_fn);
            gptoss_metal_function_release(&model->f32_accumulate_e4_fn);
            gptoss_metal_function_release(&model->f32_topk_softmax_e32_k4_fn);
            gptoss_metal_function_release(&model->f32_topk_softmax_e128_k4_fn);
            gptoss_metal_function_release(&model->f32_softmax_fn);
            gptoss_metal_function_release(&model->f32_sample_fn);
            gptoss_metal_function_release(&model->f32_sdpa_q8_d64_fn);
            gptoss_metal_library_release(&model->library);

            gptoss_metal_command_queue_release(&model->command_queue);
            gptoss_metal_device_release(&model->device);
            // Weight buffers

            if (model->mapping_ptr != NULL && model->mapping_size != 0) {
                if (model->lock_memory) {
                    if (munlock(model->mapping_ptr, model->mapping_size) != 0) {
                        GPTOSS_LOG_WARNING("munlock for model weight mapping failed with error %d", errno);
                    }
                }

                if (munmap(model->mapping_ptr, model->mapping_size) != 0) {
                    GPTOSS_LOG_WARNING("munmap for model weight mapping failed with error %d", errno);
                }
            }

            const size_t model_size = sizeof(struct gptoss_model) + model->num_blocks * sizeof(struct gptoss_metal_buffer);
            memset(model, 0, model_size);
            free(model);
        }
    }
    return gptoss_status_success;
}


================================================
FILE: gpt_oss/metal/source/moematmul.metal
================================================
#include <internal/kernel-args.h>
#include <metal_common>
#include <metal_compute>
#include <metal_math>
#include <metal_simdgroup>
#include <metal_stdlib>

#pragma METAL fp math_mode(safe)
#pragma METAL fp contract(off)
#define ceil_div(a, b) (((a) + (b) - 1) / (b))

// Each simdgroup reduces all channels of the input and computes a single channel of the output
// + Efficient synchronization
// + Sequential memory access within a warp
// Each threadgroup computes (simdgroups_per_threadgroup) consecutive output channels
// + Reuse input vector from threadgroup memory
// + Avoid synchronization across warps when doing reduction

kernel void gptoss_f32_mf4w_moe_matmul_swiglu(
    constant gptoss_moe_matmul_swiglu_args& args [[ buffer(0) ]],
    const device float4* input [[ buffer(1) ]],
    const device gptoss_expert_prediction* expert [[ buffer(2) ]],
    const device uint4* weight_blocks [[ buffer(3) ]],
    const device uchar* weight_scales [[ buffer(4) ]],
    const device bfloat* bias [[ buffer(5) ]],
    device float* output [[ buffer(6) ]],
    const device gptoss_control* control [[ buffer(7) ]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint tid [[thread_index_in_threadgroup]],
    uint simdgroup_tid [[thread_index_in_simdgroup]],
    uint simdgroup_idx [[simdgroup_index_in_threadgroup]],
    uint num_simdgroups [[simdgroups_per_threadgroup]])
{
    const uint simdgroup_size = 32;
    threadgroup float threadgroup_buffer[32];
    if (control->abort != 0) {
        return;
    }

    const uint num_column_vecs = args.num_column_vecs;
    const uint row = gid.x * num_simdgroups + simdgroup_idx;
    const uint expert_id = expert[gid.y * args.num_active_experts + gid.z].expert_id;

    input += 8 * (gid.y * num_column_vecs + simdgroup_tid);
    weight_blocks = (const device uint4*) ((uintptr_t) (weight_blocks + num_column_vecs * row + simdgroup_tid) + expert_id * args.weight_expert_stride);
    weight_scales = (const device uchar*) ((uintptr_t) (weight_scales + num_column_vecs * row + simdgroup_tid) + expert_id * args.weight_expert_stride);
    bias = (const device bfloat*) ((uintptr_t) (bias + row) + expert_id * args.weight_expert_stride);
    output += gid.y * args.num_rows + gid.x * (num_simdgroups / 2) + gid.z * args.output_expert_stride;

    uint num_iter = (num_column_vecs - simdgroup_tid + (simdgroup_size - 1)) / simdgroup_size;

    float4 sum4 = 0.0f;
    do {
        const uint4 wblock = *weight_blocks;
        const float wscale = as_type<float>(static_cast<uint>(*weight_scales) << 23);
        uint4 wblock02468ACEGIKMOQSU = wblock + wblock;
        uint4 wblock13579BDFHJLNPRTV = wblock >> 3;
        wblock02468ACEGIKMOQSU &= 0x1E1E1E1Eu;
        wblock13579BDFHJLNPRTV &= 0x1E1E1E1Eu;
        wblock02468ACEGIKMOQSU += 0x70707070u;
        wblock13579BDFHJLNPRTV += 0x70707070u;
        wblock02468ACEGIKMOQSU &= 0x8E8E8E8Eu;
        wblock13579BDFHJLNPRTV &= 0x8E8E8E8Eu;
        const uint4 wblock26AEIMQU = wblock02468ACEGIKMOQSU & 0xFF00FF00u;
        const uint4 wblock048CGKOS = (wblock02468ACEGIKMOQSU << 8) & 0xFF00FF00u;
        const uint4 wblock37BFJNRV = wblock13579BDFHJLNPRTV & 0xFF00FF00u;
        const uint4 wblock159DHLPT = (wblock13579BDFHJLNPRTV << 8) & 0xFF00FF00u;
        const float4 w048C = static_cast<float4>(as_type<half4>(wblock048CGKOS.xy));
        const float4 wGKOS = static_cast<float4>(as_type<half4>(wblock048CGKOS.zw));
        const float4 w26AE = static_cast<float4>(as_type<half4>(wblock26AEIMQU.xy));
        const float4 wIMQU = static_cast<float4>(as_type<half4>(wblock26AEIMQU.zw));
        const float4 w159D = static_cast<float4>(as_type<half4>(wblock159DHLPT.xy));
        const float4 wHLPT = static_cast<float4>(as_type<half4>(wblock159DHLPT.zw));
        const float4 w37BF = static_cast<float4>(as_type<half4>(wblock37BFJNRV.xy));
        const float4 wJNRV = static_cast<float4>(as_type<half4>(wblock37BFJNRV.zw));

        const float4 w0123 = (float4) { w048C.x, w159D.x, w26AE.x, w37BF.x };
        const float4 w4567 = (float4) { w048C.y, w159D.y, w26AE.y, w37BF.y };
        const float4 w89AB = (float4) { w048C.z, w159D.z, w26AE.z, w37BF.z };
        const float4 wCDEF = (float4) { w048C.w, w159D.w, w26AE.w, w37BF.w };
        const float4 wGHIJ = (float4) { wGKOS.x, wHLPT.x, wIMQU.x, wJNRV.x };
        const float4 wKLMN = (float4) { wGKOS.y, wHLPT.y, wIMQU.y, wJNRV.y };
        const float4 wOPQR = (float4) { wGKOS.z, wHLPT.z, wIMQU.z, wJNRV.z };
        const float4 wSTUV = (float4) { wGKOS.w, wHLPT.w, wIMQU.w, wJNRV.w };

        const float4 i0123 = input[0];
        const float4 i4567 = input[1];
        const float4 i89AB = input[2];
        const float4 iCDEF = input[3];
        const float4 iGHIJ = input[4];
        const float4 iKLMN = input[5];
        const float4 iOPQR = input[6];
        const float4 iSTUV = input[7];

        float4 psum0 = i0123 * w0123;
        float4 psum1 = i4567 * w4567;
        psum0 = metal::fma(i89AB, w89AB, psum0);
        psum1 = metal::fma(iCDEF, wCDEF, psum1);
        psum0 = metal::fma(iGHIJ, wGHIJ, psum0);
        psum1 = metal::fma(iKLMN, wKLMN, psum1);
        psum0 = metal::fma(iOPQR, wOPQR, psum0);
        psum1 = metal::fma(iSTUV, wSTUV, psum1);
        sum4 = metal::fma(psum0, wscale, sum4);
        sum4 = metal::fma(psum1, wscale, sum4);

        weight_blocks += simdgroup_size;
        weight_scales += simdgroup_size;
        input += 8 * simdgroup_size;
    } while (--num_iter != 0);
    const float2 sum2 = sum4.xy + sum4.zw;
    float sum = sum2.x + sum2.y;
    sum = metal::simd_sum(sum);
    if (metal::simd_is_first()) {
        sum += static_cast<float>(*bias);
        threadgroup_buffer[simdgroup_idx] = sum;
    }
    metal::threadgroup_barrier(metal::mem_flags::mem_threadgroup);
    if (tid * 2 < num_simdgroups) {
        const float2 x = reinterpret_cast<const threadgroup float2*>(threadgroup_buffer)[tid];
        const float swish_x = metal::min(x.x, args.swiglu_max);
        const float linear_x = metal::clamp(x.y, args.swiglu_min, args.swiglu_max);
        const float alpha = 1.702f;
        const float swish_y = swish_x / (1.0f + metal::precise::exp(-alpha * swish_x));
        const float swiglu_y = metal::fma(swish_y, linear_x, swish_y);
        output[tid] = swiglu_y;
    }
}

kernel void gptoss_f32_mf4w_moe_matmul(
    constant gptoss_moe_matmul_args& args [[ buffer(0) ]],
    const device float4* input [[ buffer(1) ]],
    const device gptoss_expert_prediction* expert [[ buffer(2) ]],
    const device uint4* weight_blocks [[ buffer(3) ]],
    const device uchar* weight_scales [[ buffer(4) ]],
    const device bfloat* bias [[ buffer(5) ]],
    device float* output [[ buffer(6) ]],
    const device gptoss_control* control [[ buffer(7) ]],
    uint3 gid [[threadgroup_position_in_grid]],
    uint tid [[thread_index_in_threadgroup]],
    uint simdgroup_tid [[thread_index_in_simdgroup]],
    uint simdgroup_idx [[simdgroup_index_in_threadgroup]],
    uint num_simdgroups [[simdgroups_per_threadgroup]])
{
    const uint simdgroup_size = 32;
    if (control->abort != 0) {
        return;
    }

    const uint num_column_vecs = args.num_column_vecs;
    const uint row = gid.x * num_simdgroups + simdgroup_idx;
    const uint expert_id = expert[gid.y * args.num_active_experts + gid.z].expert_id;

    input += 8 * (gid.y * num_column_vecs + simdgroup_tid + gid.z * args.input_expert_stride);
    weight_blocks = (const device uint4*) ((uintptr_t) (weight_blocks + num_column_vecs * row + simdgroup_tid) + expert_id * args.weight_expert_stride);
    weight_scales = (const device uchar*) ((uintptr_t) (weight_scales + num_column_vecs * row + simdgroup_tid) + expert_id * args.weight_expert_stride);
    bias = (const device bfloat*) ((uintptr_t) (bias + row) + expert_id * args.weight_expert_stride);
    output += gid.y * args.num_rows + row + gid.z * args.output_expert_stride;

    uint num_iter = (num_column_vecs - simdgroup_tid + (simdgroup_size - 1)) / simdgroup_size;

    float4 sum4 = 0.0f;
    do {
        const uint4 wblock = *weight_blocks;
        const float wscale = as_type<float>(static_cast<uint>(*weight_scales) << 23);
        uint4 wblock02468ACEGIKMOQSU = wblock + wblock;
        uint4 wblock13579BDFHJLNPRTV = wblock >> 3;
        wblock02468ACEGIKMOQSU &= 0x1E1E1E1Eu;
        wblock13579BDFHJLNPRTV &= 0x1E1E1E1Eu;
        wblock02468ACEGIKMOQSU += 0x70707070u;
        wblock13579BDFHJLNPRTV += 0x70707070u;
        wblock02468ACEGIKMOQSU &= 0x8E8E8E8Eu;
        wblock13579BDFHJLNPRTV &= 0x8E8E8E8Eu;
        const uint4 wblock26AEIMQU = wblock02468ACEGIKMOQSU & 0xFF00FF00u;
        const uint4 wblock048CGKOS = (wblock02468ACEGIKMOQSU << 8) & 0xFF00FF00u;
        const uint4 wblock37BFJNRV = wblock13579BDFHJLNPRTV & 0xFF00FF00u;
        const uint4 wblock159DHLPT = (wblock13579BDFHJLNPRTV << 8) & 0xFF00FF00u;
        const float4 w048C = static_cast<float4>(as_type<half4>(wblock048CGKOS.xy));
        const float4 wGKOS = static_cast<float4>(as_type<half4>(wblock048CGKOS.zw));
        const float4 w26AE = static_cast<float4>(as_type<half4>(wblock26AEIMQU.xy));
        const float4 wIMQU = static_cast<float4>(as_type<half4>(wblock26AEIMQU.zw));
        const float4 w159D = static_cast<float4>(as_type<half4>(wblock159DHLPT.xy));
        const float4 wHLPT = static_cast<float4>(as_type<half4>(wblock159DHLPT.zw));
        const float4 w37BF = static_cast<float4>(as_type<half4>(wblock37BFJNRV.xy));
        const float4 wJNRV = static_cast<float4>(as_type<half4>(wblock37BFJNRV.zw));

        const float4 w0123 = (float4) { w048C.x, w159D.x, w26AE.x, w37BF.x };
        const float4 w4567 = (float4) { w048C.y, w159D.y, w26AE.y, w37BF.y };
        const float4 w89AB = (float4) { w048C.z, w159D.z, w26AE.z, w37BF.z };
        const float4 wCDEF = (float4) { w048C.w, w159D.w, w26AE.w, w37BF.w };
        const float4 wGHIJ = (float4) { wGKOS.x, wHLPT.x, wIMQU.x, wJNRV.x };
        const float4 wKLMN = (float4) { wGKOS.y, wHLPT.y, wIMQU.y, wJNRV.y };
        const float4 wOPQR = (float4) { wGKOS.z, wHLPT.z, wIMQU.z, wJNRV.z };
        const float4 wSTUV = (float4) { wGKOS.w, wHLPT.w, wIMQU.w, wJNRV.w };

        const float4 i0123 = input[0];
        const float4 i4567 = input[1];
        const float4 i89AB = input[2];
        const float4 iCDEF = input[3];
        const float4 iGHIJ = input[4];
        const float4 iKLMN = input[5];
        const float4 iOPQR = input[6];
        const float4 iSTUV = input[7];

        float4 psum0 = i0123 * w0123;
        float4 psum1 = i4567 * w4567;
        psum0 = metal::fma(i89AB, w89AB, psum0);
        psum1 = metal::fma(iCDEF, wCDEF, psum1);
        psum0 = metal::fma(iGHIJ, wGHIJ, psum0);
        psum1 = metal::fma(iKLMN, wKLMN, psum1);
        psum0 = metal::fma(iOPQR, wOPQR, psum0);
        psum1 = metal::fma(iSTUV, wSTUV, psum1);
        sum4 = metal::fma(psum0, wscale, sum4);
        sum4 = metal::fma(psum1, wscale, sum4);

        weight_blocks += simdgroup_size;
        weight_scales += simdgroup_size;
        input += 8 * simdgroup_size;
    } while (--num_iter != 0);
    const float2 sum2 = sum4.xy + sum4.zw;
    float sum = sum2.x + sum2.y;
    sum = metal::simd_sum(sum);
    if (metal::simd_is_first()) {
        sum += static_cast<float>(*bias);
        *output = sum;
    }
}

kernel void gptoss_f32_mf4w_moe_dense_matmul_swiglu(
    constant gptoss_moe_dense_matmul_swiglu_args& params [[ buffer(0) ]],
    const device uint* __restrict__ expert_offsets [[ buffer(1) ]],
    const device float* lhs [[ buffer(2) ]],
    const device uint* weight_blocks [[ buffer(3) ]],
    const device uchar* weight_scales [[ buffer(4) ]],
    const device bfloat* __restrict__ bias [[ buffer(5) ]],
    device float* out [[ buffer(6) ]],
    uint sg_id [[simdgroup_index_in_threadgroup]],
    uint3 threads_per_tg [[threads_per_threadgroup]],
    uint sg_count_per_tg [[dispatch_simdgroups_per_threadgroup]],
    uint3 gid [[thread_position_in_grid]],
    uint3 tg_id [[threadgroup_position_in_grid]],
    uint3 local_tid [[thread_position_in_threadgroup]]) 
{
    constexpr uint Bm = MOE_DENSE_MATMUL_SWIGLU_Bm;
    constexpr uint Bn = MOE_DENSE_MATMUL_SWIGLU_Bn;
    constexpr uint Bk = MOE_DENSE_MATMUL_SWIGLU_Bk;
    constexpr uint Sg_Bm = MOE_DENSE_MATMUL_SWIGLU_Sg_Bm;
    constexpr uint Sg_Bn = MOE_DENSE_MATMUL_SWIGLU_Sg_Bn;

    // Assumptions about shapes.
    assert(Bm % 8 == 0);
    assert(Bn % 8 == 0);
    assert(Bk % 8 == 0);
    assert(Sg_Bm % 8 == 0);
    assert(Sg_Bn % 8 == 0);
    assert(Bm % Sg_Bm == 0);
    assert(Bn % Sg_Bn == 0);

    const uint K = params.k;
    const uint N = params.n;
    const uint M = expert_offsets[tg_id.z + 1] - expert_offsets[tg_id.z];
    assert((K % 32) == 0);
    assert((K % 8) == 0);
    assert(N % Bn == 0);
    assert(K % Bk == 0);
    // Get row and col tg.
    const uint row_tg = tg_id.y;
    const uint col_tg = tg_id.x;
    // Get row and col local tid.
    const uint row_tg_offset = row_tg * Bm;
    const uint col_tg_offset = col_tg * Bn;
    if (row_tg_offset >= M || col_tg_offset >= N) {
        return;
    }
    // Move lhs and output according to the passed offset.
    const uint expert_offset = expert_offsets[tg_id.z];
    lhs += expert_offset * K;
    const uint N_output = N / 2;
    out += expert_offset * N_output;

    const uint S = params.weight_blocks_expert_stride_bytes;
    const uint S_scales = params.weight_scales_expert_stride_bytes;
    const uint S_bias = params.bias_expert_stride_bytes;

    const device char* wb0 = reinterpret_cast<const device char*>(weight_blocks);
    const device char* sc0 = reinterpret_cast<const device char*>(weight_scales);
    const device char* bi0 = reinterpret_cast<const device char*>(bias);

    weight_blocks = reinterpret_cast<const device uint*>(wb0 + tg_id.z * S);
    weight_scales = reinterpret_cast<const device uchar*>(sc0 + tg_id.z * S_scales);
    bias = reinterpret_cast<const device bfloat*>(bi0 + tg_id.z * S_bias);

    const uint sg_col_count = Bn / Sg_Bn;
    const uint row_sg = sg_id / sg_col_count;
    const uint col_sg = sg_id % sg_col_count;

    const uint row_sg_offset = row_sg * Sg_Bm;
    const uint col_sg_offset = col_sg * Sg_Bn;
    // Declare threadgroup blocks.
    threadgroup float lhs_block[Bm * Bk];
    // rhs_block will hold the scaled fp32 weights.
    threadgroup float rhs_block[Bn * Bk];

    constexpr uint temp_result_size = (Sg_Bm / 8) * (Sg_Bn / 8);
    // Create an array of simdgroup_float8x8 to hold temp results.
    metal::simdgroup_float8x8 OutTiles[temp_result_size];
    for (uint i = 0; i < temp_result_size; i++) {
        OutTiles[i] = metal::make_filled_simdgroup_matrix<float, 8, 8>(0.0);
    }
    // Linear thread id within TG (we launch 1-D TGs)
    const uint lin_tid = local_tid.x;
    const uint thread_count_per_tg = threads_per_tg.x * threads_per_tg.y * threads_per_tg.z;

    // Iterate over all Bk blocks.
    for (uint k_offset = 0; k_offset < K; k_offset += Bk) {
        constexpr uint lhs_row_stride = Bk;
        constexpr uint lhs_vec_cols = Bk / 4;
        constexpr uint lhs_vec_total = Bm * lhs_vec_cols;

        const uint LHS_ITERS = ceil_div(lhs_vec_total, thread_count_per_tg);

        // #pragma clang loop unroll(full)
        for (uint t = 0; t < LHS_ITERS; ++t) {
            const uint i = t * thread_count_per_tg + lin_tid;
            if (i < lhs_vec_total) {
                const uint r = i / lhs_vec_cols;
                const uint c4 = i % lhs_vec_cols;

                const uint gr = row_tg_offset + r;
                const uint gc4 = (k_offset / 4) + c4;

                threadgroup float4* dst4 =
                    reinterpret_cast<threadgroup float4*>(lhs_block + r * lhs_row_stride + (c4 << 2));
                if (gr < M) {
                    const device float4* src4 =
                        reinterpret_cast<const device float4*>(lhs + gr * K + (gc4 << 2));

                    *dst4 = *src4;
                } else {
                    *dst4 = float4(0.0);
                }
            }
        }

        // Load weights with vector loads.
        constexpr uint rhs_row_stride = Bk;
        constexpr uint weights_per_elem = 8;
        constexpr uint rhs_loads_per_col = Bk / weights_per_elem;
        constexpr uint rhs_loads_total = Bn * rhs_loads_per_col;
        const uint RHS_ITERS = ceil_div(rhs_loads_total, thread_count_per_tg);
        // #pragma clang loop unroll(full)
        for (uint t = 0; t < RHS_ITERS; ++t) {
            const uint i = t * thread_count_per_tg + lin_tid;
            if (i < rhs_loads_total) {
                const uint r = i / rhs_loads_per_col;
                const uint c = i % rhs_loads_per_col;

                const uint gr = col_tg_offset + r;
                const uint gc = (k_offset / weights_per_elem) + c;
                const uint gc_scale = (k_offset / 32) + (c >> 2);

                const uint wblock = weight_blocks[gr * (K / weights_per_elem) + gc];
                const float scale =
                    as_type<float>(static_cast<uint>(weight_scales[gr * (K / 32) + gc_scale]) << 23);
                uint wblock0246 = (wblock + wblock);
                uint wblock1357 = (wblock >> 3);
                wblock0246 &= 0x1E1E1E1Eu;
                wblock1357 &= 0x1E1E1E1Eu;

                wblock0246 += 0x70707070u;
                wblock1357 += 0x70707070u;
                wblock0246 &= 0x8E8E8E8Eu;
                wblock1357 &= 0x8E8E8E8Eu;

                uint wblock26 = (wblock0246) & 0xFF00FF00u;
                uint wblock04 = ((wblock0246 << 8)) & 0xFF00FF00u;
                uint wblock37 = (wblock1357) & 0xFF00FF00u;
                uint wblock15 = ((wblock1357 << 8)) & 0xFF00FF00u;

                half4 wblock0426 = as_type<half4>(uint2(wblock04, wblock26));
                half4 wblock1537 = as_type<half4>(uint2(wblock15, wblock37));

                // Convert to float scalars and apply scale
                const float w0 = float(wblock0426.x) * scale;
                const float w1 = float(wblock1537.x) * scale;
                const float w2 = float(wblock0426.z) * scale;
                const float w3 = float(wblock1537.z) * scale;
                const float w4 = float(wblock0426.y) * scale;
                const float w5 = float(wblock1537.y) * scale;
                const float w6 = float(wblock0426.w) * scale;
                const float w7 = float(wblock1537.w) * scale;
                const uint rhs_offset = r * rhs_row_stride + c * 8;
                rhs_block[rhs_offset] = w0;
                rhs_block[rhs_offset + 1] = w1;
                rhs_block[rhs_offset + 2] = w2;
                rhs_block[rhs_offset + 3] = w3;
                rhs_block[rhs_offset + 4] = w4;
                rhs_block[rhs_offset + 5] = w5;
                rhs_block[rhs_offset + 6] = w6;
                rhs_block[rhs_offset + 7] = w7;
            }
        }
        threadgroup_barrier(metal::mem_flags::mem_threadgroup);
#pragma clang loop unroll(full)
        for (uint k = 0; k < Bk; k += 8) {
#pragma clang loop unroll(full)
            for (uint m_subtile_ = 0; m_subtile_ < Sg_Bm; m_subtile_ += 8) {
                const uint row_index_in_out_tile = m_subtile_ / 8;
                metal::simdgroup_float8x8 lhs_frag;

                simdgroup_load(lhs_frag, lhs_block, Bk, ulong2(k, m_subtile_ + row_sg_offset));
#pragma clang loop unroll(full)
                for (uint n_subtile_ = 0; n_subtile_ < Sg_Bn; n_subtile_ += 8) {
                    const uint col_index_in_out_tile = n_subtile_ / 8;
                    const uint current_index_out_tile =
                        row_index_in_out_tile * (Sg_Bn / 8) + col_index_in_out_tile;
                    metal::simdgroup_float8x8 rhs_frag;
                    simdgroup_load(rhs_frag, rhs_block, Bk, ulong2(k, n_subtile_ + col_sg_offset), true);

                    simdgroup_multiply_accumulate(OutTiles[current_index_out_tile], lhs_frag, rhs_frag,
                        OutTiles[current_index_out_tile]);
                }
            }
        }
        threadgroup_barrier(metal::mem_flags::mem_threadgroup);
    }

    // Epilogue.
    threadgroup float scratch[Bm * Bn];
#pragma clang loop unroll(full)
    for (uint n_subtile_ = 0; n_subtile_ < Sg_Bn; n_subtile_ += 8) {
        const uint col_index_in_out_tile = n_subtile_ / 8;
        const uint local_col_offset = col_sg_offset + n_subtile_;
#pragma clang loop unroll(full)
        for (uint m_subtile_ = 0; m_subtile_ < Sg_Bm; m_subtile_ += 8) {
            const uint row_index_in_out_tile = m_subtile_ / 8;
            const uint local_row_offset = row_sg_offset + m_subtile_;
            const uint current_index_out_tile =
                row_index_in_out_tile * (Sg_Bn / 8) + col_index_in_out_tile;
            simdgroup_store(OutTiles[current_index_out_tile], scratch, Bn,
                ulong2(local_col_offset, local_row_offset));
        }
    }
    threadgroup float bias_tile[Bn];
    // TODO(ibahmed): vectorize these loads an maybe unroll the loop.
    for (uint c_local = local_tid.x; c_local < Bn; c_local += thread_count_per_tg) {
        const uint c_global = col_tg_offset + c_local;
        bias_tile[c_local] = (c_global < N) ? static_cast<float>(bias[c_global]) : 0.0f;
    }

    threadgroup_barrier(metal::mem_flags::mem_threadgroup);
    const float alpha = 1.702f;
    // TODO(ibahmed): vectorize these stores and maybe unroll the loop.
    for (uint idx = local_tid.x; idx < Bm * Bn / 2; idx += thread_count_per_tg) {
        const uint idx_swish = idx * 2;
        const uint r = idx_swish / Bn;
        const uint c_swish = idx_swish % Bn;

        const uint out_row = row_tg_offset + r;
        const uint out_col = (col_tg_offset / 2) + (c_swish / 2);

        if (out_row < M && out_col < N_output) {
            float acc_swish = scratch[idx_swish] + bias_tile[c_swish];
            float acc_linear = scratch[idx_swish + 1] + bias_tile[c_swish + 1];
            const float swish = metal::min(acc_swish, params.swiglu_max);
            const float linear = metal::clamp(acc_linear, params.swiglu_min, params.swiglu_max);
            const float swish_y = swish / (1.0f + metal::precise::exp(-alpha * swish));
            const float swiglu_y = metal::fma(swish_y, linear, swish_y);
            out[out_row * N_output + out_col] = swiglu_y;
        }
    }
}

kernel void gptoss_f32_mf4w_moe_dense_matmul(
    constant gptoss_moe_dense_matmul_args& params [[ buffer(0) ]],
    const device uint* __restrict__ expert_offsets [[ buffer(1) ]],
    const device float* lhs [[ buffer(2) ]],
    const device uint* weight_blocks [[ buffer(3) ]],
    const device uchar* weight_scales [[ buffer(4) ]],
    const device bfloat* __restrict__ bias [[ buffer(5) ]],
    device float* out [[ buffer(6) ]],
    uint sg_id [[simdgroup_index_in_threadgroup]],
    uint3 threads_per_tg [[threads_per_threadgroup]],
    uint sg_count_per_tg [[dispatch_simdgroups_per_threadgroup]],
    uint3 gid [[thread_position_in_grid]],
    uint3 tg_id [[threadgroup_position_in_grid]],
    uint3 local_tid [[thread_position_in_threadgroup]]) 
{
    const uint Bm = MOE_DENSE_MATMUL_Bm;
    const uint Bn = MOE_DENSE_MATMUL_Bn;
    const uint Bk = MOE_DENSE_MATMUL_Bk;
    const uint Sg_Bm = MOE_DENSE_MATMUL_Sg_Bm;
    const uint Sg_Bn = MOE_DENSE_MATMUL_Sg_Bn;
    assert(Bm % 8 == 0);
    assert(Bn % 8 == 0);
    assert(Bk % 8 == 0);
    assert(Sg_Bm % 8 == 0);
    assert(Sg_Bn % 8 == 0);
    assert(Bm % Sg_Bm == 0);
    assert(Bn % Sg_Bn == 0);

    const uint K = params.k;
    const uint N = params.n;
    const uint M = expert_offsets[tg_id.z + 1] - expert_offsets[tg_id.z];
    assert((K % 32) == 0);
    assert((K % 8) == 0);
    assert(N % Bn == 0);
    assert(K % Bk == 0);
    // Get row and col tg.
    const uint row_tg = tg_id.y;
    const uint col_tg = tg_id.x;
    // Get row and col local tid.
    const uint row_tg_offset = row_tg * Bm;
    const uint col_tg_offset = col_tg * Bn;
    if (row_tg_offset >= M || col_tg_offset >= N) {
        return;
    }
    // Move lhs and output according to the passed offset.
    const uint expert_offset = expert_offsets[tg_id.z];
    lhs += expert_offset * K;
    out += expert_offset * N;

    const uint S = params.weight_blocks_expert_stride_bytes;
    const uint S_scales = params.weight_scales_expert_stride_bytes;
    const uint S_bias = params.bias_expert_stride_bytes;

    const device char* wb0 = reinterpret_cast<const device char*>(weight_blocks);
    const device char* sc0 = reinterpret_cast<const device char*>(weight_scales);
    const device char* bi0 = reinterpret_cast<const device char*>(bias);

    weight_blocks = reinterpret_cast<const device uint*>(wb0 + tg_id.z * S);
    weight_scales = reinterpret_cast<const device uchar*>(sc0 + tg_id.z * S_scales);
    bias = reinterpret_cast<const device bfloat*>(bi0 + tg_id.z * S_bias);

    const uint sg_col_count = Bn / Sg_Bn;
    const uint row_sg = sg_id / sg_col_count;
    const uint col_sg = sg_id % sg_col_count;

    const uint row_sg_offset = row_sg * Sg_Bm;
    const uint col_sg_offset = col_sg * Sg_Bn;
    // Declare threadgroup blocks.
    threadgroup float lhs_block[Bm * Bk];
    // rhs_block will hold the scaled fp32 weights.
    threadgroup float rhs_block[Bn * Bk];

    constexpr uint temp_result_size = (Sg_Bm / 8) * (Sg_Bn / 8);
    // Create an array of simdgroup_float8x8 to hold temp results.
    metal::simdgroup_float8x8 OutTiles[temp_result_size];
    for (uint i = 0; i < temp_result_size; i++) {
        OutTiles[i] = metal::make_filled_simdgroup_matrix<float, 8, 8>(0.0);
    }
    // Linear thread id within TG (we launch 1-D TGs)
    const uint lin_tid = local_tid.x;

    const uint thread_count_per_tg = threads_per_tg.x * threads_per_tg.y * threads_per_tg.z;
    // Iterate over all Bk blocks.
    for (uint k_offset = 0; k_offset < K; k_offset += Bk) {
        constexpr uint lhs_row_stride = Bk;
        constexpr uint lhs_vec_cols = Bk / 4;
        constexpr uint lhs_vec_total = Bm * lhs_vec_cols;

        const uint LHS_ITERS = ceil_div(lhs_vec_total, thread_count_per_tg);

        for (uint t = 0; t < LHS_ITERS; ++t) {
            const uint i = t * thread_count_per_tg + lin_tid;
            if (i < lhs_vec_total) {
                const uint r = i / lhs_vec_cols;
                const uint c4 = i % lhs_vec_cols;

                const uint gr = row_tg_offset + r;
                const uint gc4 = (k_offset / 4) + c4;

                threadgroup float4* dst4 =
                    reinterpret_cast<threadgroup float4*>(lhs_block + r * lhs_row_stride + (c4 << 2));
                if (gr < M) {
                    const device float4* src4 =
                        reinterpret_cast<const device float4*>(lhs + gr * K + (gc4 << 2));

                    *dst4 = *src4;
                } else {
                    *dst4 = float4(0.0);
                }
            }
        }

        // Load weights with vector loads.
        constexpr uint rhs_row_stride = Bk;
        constexpr uint weights_per_elem = 8;
        constexpr uint rhs_loads_per_col = Bk / weights_per_elem;
        constexpr uint rhs_loads_total = Bn * rhs_loads_per_col;
        const uint RHS_ITERS = ceil_div(rhs_loads_total, thread_count_per_tg);
        // #pragma clang loop unroll(full)
        for (uint t = 0; t < RHS_ITERS; ++t) {
            const uint i = t * thread_count_per_tg + lin_tid;
            if (i < rhs_loads_total) {
                const uint r = i / rhs_loads_per_col;
                const uint c = i % rhs_loads_per_col;

                const uint gr = col_tg_offset + r;
                const uint gc = (k_offset / weights_per_elem) + c;
                const uint gc_scale = (k_offset / 32) + (c >> 2);

                const uint wblock = weight_blocks[gr * (K / weights_per_elem) + gc];
                const float scale =
                    as_type<float>(static_cast<uint>(weight_scales[gr * (K / 32) + gc_scale]) << 23);

                uint wblock0246 = (wblock + wblock);
                uint wblock1357 = (wblock >> 3);
                wblock0246 &= 0x1E1E1E1Eu;
                wblock1357 &= 0x1E1E1E1Eu;

                wblock0246 += 0x70707070u;
                wblock1357 += 0x70707070u;
                wblock0246 &= 0x8E8E8E8Eu;
                wblock1357 &= 0x8E8E8E8Eu;

                uint wblock26 = (wblock0246) & 0xFF00FF00u;
                uint wblock04 = ((wblock0246 << 8)) & 0xFF00FF00u;
                uint wblock37 = (wblock1357) & 0xFF00FF00u;
                uint wblock15 = ((wblock1357 << 8)) & 0xFF00FF00u;

                half4 wblock0426 = as_type<half4>(uint2(wblock04, wblock26));
                half4 wblock1537 = as_type<half4>(uint2(wblock15, wblock37));

                const float w0 = float(wblock0426.x) * scale;
                const float w1 = float(wblock1537.x) * scale;
                const float w2 = float(wblock0426.z) * scale;
                const float w3 = float(wblock1537.z) * scale;
                const float w4 = float(wblock0426.y) * scale;
                const float w5 = float(wblock1537.y) * scale;
                const float w6 = float(wblock0426.w) * scale;
                const float w7 = float(wblock1537.w) * scale;
                const uint rhs_offset = r * rhs_row_stride + c * 8;
                rhs_block[rhs_offset] = w0;
                rhs_block[rhs_offset + 1] = w1;
                rhs_block[rhs_offset + 2] = w2;
                rhs_block[rhs_offset + 3] = w3;
                rhs_block[rhs_offset + 4] = w4;
                rhs_block[rhs_offset + 5] = w5;
                rhs_block[rhs_offset + 6] = w6;
                rhs_block[rhs_offset + 7] = w7;
            }
        }
        threadgroup_barrier(metal::mem_flags::mem_threadgroup);
#pragma clang loop unroll(full)
        for (uint k = 0; k < Bk; k += 8) {
#pragma clang loop unroll(full)
            for (uint m_subtile_ = 0; m_subtile_ < Sg_Bm; m_subtile_ += 8) {
                const uint row_index_in_out_tile = m_subtile_ / 8;
                metal::simdgroup_float8x8 lhs_frag;

                simdgroup_load(lhs_frag, lhs_block, Bk, ulong2(k, m_subtile_ + row_sg_offset));
#pragma clang loop unroll(full)
                for (uint n_subtile_ = 0; n_subtile_ < Sg_Bn; n_subtile_ += 8) {
                    const uint col_index_in_out_tile = n_subtile_ / 8;
                    const uint current_index_out_tile =
                        row_index_in_out_tile * (Sg_Bn / 8) + col_index_in_out_tile;
                    metal::simdgroup_float8x8 rhs_frag;
                    simdgroup_load(rhs_frag, rhs_block, Bk, ulong2(k, n_subtile_ + col_sg_offset), true);
                    simdgroup_multiply_accumulate(OutTiles[current_index_out_tile], lhs_frag, rhs_frag,
                        OutTiles[current_index_out_tile]);
                }
            }
        }
        threadgroup_barrier(metal::mem_flags::mem_threadgroup);
    }

    // Epilogue.
    threadgroup float scratch[Bm * Bn];
#pragma clang loop unroll(full)
    for (uint n_subtile_ = 0; n_subtile_ < Sg_Bn; n_subtile_ += 8) {
        const uint col_index_in_out_tile = n_subtile_ / 8;
        const uint local_col_offset = col_sg_offset + n_subtile_;
#pragma clang loop unroll(full)
        for (uint m_subtile_ = 0; m_subtile_ < Sg_Bm; m_subtile_ += 8) {
            const uint row_index_in_out_tile = m_subtile_ / 8;
            const uint local_row_offset = row_sg_offset + m_subtile_;
            const uint current_index_out_tile =
                row_index_in_out_tile * (Sg_Bn / 8) + col_index_in_out_tile;
            simdgroup_store(OutTiles[current_index_out_tile], scratch, Bn,
                ulong2(local_col_offset, local_row_offset));
        }
    }
    threadgroup float bias_tile[Bn];
    for (uint c_local = local_tid.x; c_local < Bn; c_local += thread_count_per_tg) {
        const uint c_global = col_tg_offset + c_local;
        bias_tile[c_local] = (c_global < N) ? static_cast<float>(bias[c_global]) : 0.0f;
    }

    threadgroup_barrier(metal::mem_flags::mem_threadgroup);
    for (uint idx = local_tid.x; idx < Bm * Bn; idx += thread_count_per_tg) {
        const uint r = idx / Bn;
        const uint c = idx % Bn;

        const uint out_row = row_tg_offset + r;
        const uint out_col = col_tg_offset + c;

        if (out_row < M && out_col < N) {
            float acc = scratch[idx] + bias_tile[c];
            out[out_row * N + out_col] = acc;
        }
    }
}


================================================
FILE: gpt_oss/metal/source/random.metal
================================================
#include <metal_integer>
#include <metal_math>

#include <internal/kernel-args.h>

#pragma METAL fp math_mode(safe)
#pragma METAL fp contract(off)


inline static uint rng_squares32(ulong offset, ulong seed) {
    const ulong y = offset * seed;
    const ulong z = y + seed;

    /* Round 1 */
    ulong x = y * y + y;
    x = metal::rotate(x, 32ul);

    /* Round 2 */
    x = x * x + z;
    x = metal::rotate(x, 32ul);

    /* Round 3 */
    x = x * x + y;
    x = metal::rotate(x, 32ul);

    /* Round 4 */
    x = x * x + z;
    return as_type<uint2>(x).y;
}

kernel void gptoss_u32_fill_random(
    constant gptoss_u32_fill_random_args& args [[ buffer(0) ]],
    device uint* output [[ buffer(1) ]],
    uint gid [[threadgroup_position_in_grid]],
    uint tid [[thread_position_in_threadgroup]],
    uint threadgroup_size [[ threads_per_threadgroup ]])
{
    const ulong num_vecs_per_threadgroup = args.num_vecs_per_threadgroup;
    const ulong threadgroup_start = gid * num_vecs_per_threadgroup;
    const ulong threadgroup_end = metal::min(threadgroup_start + num_vecs_per_threadgroup, args.num_vecs);
    const ulong thread_start = threadgroup_start + tid;
    uint num_iter = static_cast<uint>((threadgroup_end - thread_start + (threadgroup_size - 1)) / threadgroup_size);

    output += thread_start;
    ulong offset = args.offset + thread_start;
    for (; num_iter != 0; num_iter--) {
        *output = rng_squares32(offset, args.seed);
        output += threadgroup_size;
        offset += threadgroup_size;
    }
}

kernel void gptoss_f32_fill_random(
    constant gptoss_f32_fill_random_args& args [[ buffer(0) ]],
    device float* output [[ buffer(1) ]],
    uint gid [[threadgroup_position_in_grid]],
    uint tid [[thread_position_in_threadgroup]],
    uint threadgroup_size [[ threads_per_threadgroup ]])
{
    const ulong num_vecs_per_threadgroup = args.num_vecs_per_threadgroup;
    const ulong threadgroup_start = gid * num_vecs_per_threadgroup;
    const ulong threadgroup_end = metal::min(threadgroup_start + num_vecs_per_threadgroup, args.num_vecs);
    const ulong thread_start = threadgroup_start + tid;
    uint num_iter = static_cast<uint>((threadgroup_end - thread_start + (threadgroup_size - 1)) / threadgroup_size);

    output += thread_start;
    ulong offset = args.offset + thread_start;
    for (; num_iter != 0; num_iter--) {
        const uint word = rng_squares32(offset, args.seed);
        *output = metal::fma(static_cast<float>(as_type<int>(word)), args.scale, args.bias);
        output += threadgroup_size;
        offset += threadgroup_size;
    }
}

kernel void gptoss_bf16_fill_random(
    constant gptoss_f32_fill_random_args& args [[ buffer(0) ]],
    device bfloat* output [[ buffer(1) ]],
    uint gid [[threadgroup_position_in_grid]],
    uint tid [[thread_position_in_threadgroup]],
    uint threadgroup_size [[ threads_per_threadgroup ]])
{
    const ulong num_vecs_per_threadgroup = args.num_vecs_per_threadgroup;
    const ulong threadgroup_start = gid * num_vecs_per_threadgroup;
    const ulong threadgroup_end = metal::min(threadgroup_start + num_vecs_per_threadgroup, args.num_vecs);
    const ulong thread_start = threadgroup_start + tid;
    uint num_iter = static_cast<uint>((threadgroup_end - thread_start + (threadgroup_size - 1)) / threadgroup_size);

    output += thread_start;
    ulong offset = args.offset + thread_start;
    for (; num_iter != 0; num_iter--) {
        const uint word = rng_squares32(offset, args.seed);
        *output = static_cast<bfloat>(metal::fma(static_cast<float>(as_type<int>(word)), args.scale, args.bias));
        output += threadgroup_size;
        offset += threadgroup_size;
    }
}


================================================
FILE: gpt_oss/metal/source/rmsnorm.metal
================================================
#include <metal_compute>
#include <metal_math>
#include <metal_simdgroup>

#include <internal/kernel-args.h>

#pragma METAL fp math_mode(safe)
#pragma METAL fp contract(off)


[[max_total_threads_per_threadgroup(1024)]]
kernel void gptoss_f32_bf16w_rmsnorm(
    constant gptoss_rmsnorm_args& args [[ buffer(0) ]],
    const device float4* input [[ buffer(1) ]],
    const device bfloat4* weights [[ buffer(2) ]],
    device float4* output [[ buffer(3) ]],
    const device gptoss_control* control [[ buffer(4) ]],
    uint gid [[threadgroup_position_in_grid]],
    uint tid [[thread_position_in_threadgroup]],
    uint threadgroup_size [[ threads_per_threadgroup ]])
{
    const uint simdgroup_size = 32;
    threadgroup float threadgroup_buffer[32];
    if (control->abort != 0) {
        return;
    }

    input += gid * args.num_vecs;
    output += gid * args.num_vecs;

    float4 sumsq4 = 0.0f;
    for (uint i = tid; i < args.num_vecs; i += threadgroup_size) {
        const float4 val = input[i];
        sumsq4 = metal::fma(val, val, sumsq4);
    }

    // Tree-reduce sumsq within thread, then all-reduce within threadgroup.
    const float2 sumsq2 = sumsq4.xy + sumsq4.zw;
    float sumsq = sumsq2.x + sumsq2.y;
    // Warning: this all-reduce works only for simdgroup of 32 threads and threadgroup of 32*32=1024 threads.
    sumsq = metal::simd_sum(sumsq);
    if (metal::simd_is_first()) {
        const uint simdgroup_idx = tid / simdgroup_size;
        threadgroup_buffer[simdgroup_idx] = sumsq;
    }
    metal::threadgroup_barrier(metal::mem_flags::mem_threadgroup);
    const uint simdgroup_tid = tid % simdgroup_size;
    sumsq = threadgroup_buffer[simdgroup_tid];
    sumsq = metal::simd_sum(sumsq);

    const float avgsq = sumsq / args.num_channels;
    const float scale = metal::precise::rsqrt(avgsq + args.epsilon);
    for (uint i = tid; i < args.num_vecs; i += threadgroup_size) {
        const float4 val = input[i] * scale;
        const float4 weight_val = static_cast<float4>(weights[i]);
        output[i] = val * weight_val;
    }
}


================================================
FILE: gpt_oss/metal/source/rope.metal
================================================
#include <metal_common>
#include <metal_math>

#include <internal/kernel-args.h>

#pragma METAL fp math_mode(safe)
#pragma METAL fp contract(off)


// Each thread handles 2 head elements.
// Each simdgroup handles one head (64 head elements).

kernel void gptoss_f32_rope(
    constant gptoss_rope_args& args [[ buffer(0) ]],
    device float2* activations [[ buffer(1) ]],
    device float2* kv [[ buffer(2) ]],
    const device gptoss_control* control [[ buffer(3) ]],
    uint2 gid [[thread_position_in_grid]])
{
    const uint num_head_dims = 64;
    if (control->abort != 0) {
        return;
    }

    const float dim_idx = static_cast<float>(gid.x % (num_head_dims / 2));
    const uint token_idx = args.token_offset + gid.y;
    activations += gid.y * args.token_stride + gid.x;

    const float2 input_vals = *activations;
    const float inv_extrapolation_freq = metal::precise::exp(dim_idx * args.freq_scale);
    const float inv_interpolation_freq = inv_extrapolation_freq * args.interpolation_scale;
    const float alpha = metal::saturate(metal::fma(dim_idx, args.yarn_scale, args.yarn_offset));
    const float inv_freq = metal::mix(inv_extrapolation_freq, inv_interpolation_freq, alpha);

    const float phi = static_cast<float>(token_idx) * inv_freq;
    const float yarn_multiplier = args.yarn_multiplier;
    float cosphi;
    const float sinphi = metal::precise::sincos(phi, cosphi) * yarn_multiplier;
    cosphi *= yarn_multiplier;

    const float output_re = input_vals.x * cosphi - input_vals.y * sinphi;
    const float output_im = input_vals.x * sinphi + input_vals.y * cosphi;
    *activations = (float2) { output_re, output_im };

    const uint head_dim = 64;
    const uint num_q_heads = 64;
    const uint num_kv_heads = 8;
    const uint head_idx = gid.x / (head_dim / 2);
    float2 vals = (float2) { output_re, output_im };
    if ((head_idx < num_q_heads)) {
        *activations = vals;
    } else if (head_idx < num_q_heads + num_kv_heads) {
        // Write k and v directly to the kv cache.
        const uint kv_head_idx = head_idx - num_q_heads;
        const uint dim_pair_idx = gid.x % (head_dim / 2);
        kv[(kv_head_idx * args.max_tokens + token_idx) * head_dim + dim_pair_idx] = vals;
    }
}

================================================
FILE: gpt_oss/metal/source/sample.metal
================================================
#include <metal_compute>
#include <metal_integer>
#include <metal_math>
#include <metal_simdgroup>

#include <internal/kernel-args.h>

#pragma METAL fp math_mode(safe)
#pragma METAL fp contract(off)


inline static uint rng_squares32(ulong offset, ulong seed) {
    const ulong y = offset * seed;
    const ulong z = y + seed;

    /* Round 1 */
    ulong x = y * y + y;
    x = metal::rotate(x, 32ul);

    /* Round 2 */
    x = x * x + z;
    x = metal::rotate(x, 32ul);

    /* Round 3 */
    x = x * x + y;
    x = metal::rotate(x, 32ul);

    /* Round 4 */
    x = x * x + z;
    return as_type<uint2>(x).y;
}

kernel void gptoss_f32_softmax(
    constant gptoss_softmax_args& args [[ buffer(0) ]],
    const device float* score [[ buffer(1) ]],
    const device uint2* argmax [[ buffer(2) ]],
    device float* prob [[ buffer(3) ]],
    device float* sum [[ buffer(4) ]],
    const device gptoss_control* control [[ buffer(5) ]],
    uint tidx [[thread_index_in_threadgroup]],
    uint2 gid [[threadgroup_position_in_grid]],
    uint2 threadgroup_size [[threads_per_threadgroup]],
    uint simdgroup_tid [[thread_index_in_simdgroup]],
    uint simdgroup_idx [[simdgroup_index_in_threadgroup]],
    uint num_simdgroups [[simdgroups_per_threadgroup]])
{
    threadgroup float threadgroup_sumexp[32];
    if (control->abort != 0) {
        return;
    }

    score += gid.y * args.num_vecs + gid.x * args.num_vecs_per_threadgroup;
    prob += gid.y * args.num_vecs + gid.x * args.num_vecs_per_threadgroup;
    sum += gid.y * args.max_threadgroups;

    uint max_bits = argmax[gid.y].y;
    if (static_cast<int>(max_bits) >= 0) {
        max_bits ^= 0x7FFFFFFFu;
    }
    const float max_val = as_type<float>(max_bits);
    float sum_exp = 0.0f;
    const uint num_vecs_per_threadgroup = metal::min(args.num_vecs - gid.x * args.num_vecs_per_threadgroup, args.num_vecs_per_threadgroup);
    for (uint i = tidx; i < num_vecs_per_threadgroup; i += threadgroup_size.x) {
        const float score_val = score[i];
        const float prob_val = metal::precise::exp((score_val - max_val) * args.temperature);
        prob[i] = prob_val;
        sum_exp += prob_val;
    }
    sum_exp = metal::simd_sum(sum_exp);
    if (metal::simd_is_first()) {
        threadgroup_sumexp[simdgroup_idx] = sum_exp;
    }
    metal::threadgroup_barrier(metal::mem_flags::mem_threadgroup);
    if (simdgroup_idx == 0) {
        // Sum-Reduce threadgroup_sumexp
        sum_exp = 0.0f;
        if (simdgroup_tid < num_simdgroups) {
            sum_exp = threadgroup_sumexp[simdgroup_tid];
        }
        sum_exp = metal::simd_sum(sum_exp);
        if (metal::simd_is_first()) {
            sum[gid.x] = sum_exp;
        }
    }
}

[[max_total_threads_per_threadgroup(1024)]]
kernel void gptoss_f32_sample(
    constant gptoss_sample_args& args [[ buffer(0) ]],
    device const float* prob [[ buffer(1) ]],
    device const float* sum [[ buffer(2) ]],
    device uint* prediction [[ buffer(3) ]],
    device gptoss_control* control [[ buffer(4) ]],
    uint tid [[thread_position_in_threadgroup]],
    uint threadgroup_size [[threads_per_threadgroup]],
    uint simdgroup_tid [[thread_index_in_simdgroup]],
    uint simdgroup_idx [[simdgroup_index_in_threadgroup]],
    uint num_simdgroups [[simdgroups_per_threadgroup]])
{
    threadgroup float threadgroup_sum_buffer[32];
    threadgroup uint threadgroup_idx_buffer[32];
    threadgroup float threadgroup_cumsum_buffer[32];
    if (control->abort != 0) {
        return;
    }

    const uint sample_word = rng_squares32(args.rng_offset, args.rng_seed);
    float sample_cdf = static_cast<float>(sample_word & 0x00FFFFFFu) * 0x1.0p-24f;

    float cumsum = 0.0f;
    if (tid < args.num_blocks) {
        cumsum = sum[tid];
    }
    cumsum = metal::simd_prefix_inclusive_sum(cumsum);
    if (simdgroup_tid == 31) {
        threadgroup_sum_buffer[simdgroup_idx] = cumsum;
    }
    metal::threadgroup_barrier(metal::mem_flags::mem_threadgroup);
    float threadgroup_cumsum = 0.0f, threadgroup_sum = 0.0f;
    if (simdgroup_tid < num_simdgroups) {
        threadgroup_sum = threadgroup_sum_buffer[simdgroup_tid];
        if (simdgroup_tid < simdgroup_idx) {
            threadgroup_cumsum = threadgroup_sum;
        }
    }
    threadgroup_sum = metal::simd_sum(threadgroup_sum);
    cumsum += metal::simd_sum(threadgroup_cumsum);

    sample_cdf *= threadgroup_sum;
    sample_cdf = metal::max(sample_cdf, 0x1.0p-149f);

    // Find the block: the smallest tid where sample_cdf >= s
    uint block_idx = args.num_blocks;
    float block_sum = cumsum;
    if (tid >= args.num_blocks - 1) {
        block_idx = args.num_blocks - 1;
        block_sum = 0.0f;
    } else if (cumsum >= sample_cdf) {
        block_idx = tid;
        block_sum = 0.0f;
    }
    block_idx = metal::simd_min(block_idx);
    block_sum = metal::simd_max(block_sum);
    if (simdgroup_tid == 0) {
        threadgroup_idx_buffer[simdgroup_idx] = block_idx;
        threadgroup_cumsum_buffer[simdgroup_idx] = block_sum;
    }
    metal::threadgroup_barrier(metal::mem_flags::mem_threadgroup);
    if (simdgroup_tid < num_simdgroups) {
        block_idx = threadgroup_idx_buffer[simdgroup_tid];
        block_sum = threadgroup_cumsum_buffer[simdgroup_tid];
    }
    block_idx = metal::simd_min(block_idx);
    block_sum = metal::simd_max(block_sum);

    const uint block_start = args.num_dims_per_block * block_idx;
    const uint block_end = metal::min(block_start + args.num_dims_per_block, args.num_dims);
    uint offset = block_start + tid;
    float accumulated_sum = block_sum;
    uint sample_idx;

    // This loop must be threadgroup-uniform.
    do {
        // Find the token: the smallest tid where sample_cdf >= s
        float cumsum = 0.0f;
        if (offset < block_end) {
            cumsum = prob[offset];
        }
        cumsum = metal::simd_prefix_inclusive_sum(cumsum);
        if (simdgroup_tid == 31) {
            threadgroup_sum_buffer[simdgroup_idx] = cumsum;
        }
        metal::threadgroup_barrier(metal::mem_flags::mem_threadgroup);
        float threadgroup_cumsum = 0.0f, threadgroup_sum = 0.0f;
        if (simdgroup_tid < num_simdgroups) {
            threadgroup_sum = threadgroup_sum_buffer[simdgroup_tid];
            if (simdgroup_tid < simdgroup_idx) {
                threadgroup_cumsum = threadgroup_sum;
            }
        }
        threadgroup_sum = metal::simd_sum(threadgroup_sum);
        cumsum += metal::simd_sum(threadgroup_cumsum);
        cumsum += accumulated_sum;

        sample_idx = block_end;
        if (offset >= block_end) {
            // Trigger loop exit, with the last token in the block being sampled if no other candidate was found.
            sample_idx = block_end - 1;
        } else if (cumsum >= sample_cdf) {
            sample_idx = offset;
        }
        sample_idx = metal::simd_min(sample_idx);
        if (simdgroup_tid == 0) {
            threadgroup_idx_buffer[simdgroup_idx] = sample_idx;
        }
        metal::threadgroup_barrier(metal::mem_flags::mem_threadgroup);
        if (simdgroup_tid < num_simdgroups) {
            sample_idx = threadgroup_idx_buffer[simdgroup_tid];
        }
        sample_idx = metal::simd_min(sample_idx);

        offset += threadgroup_size;
        accumulated_sum += threadgroup_sum;
    } while (sample_idx == block_end);

    if (tid == 0) {
        *prediction = sample_idx;
    }
}


================================================
FILE: gpt_oss/metal/source/scatter.metal
================================================
#include <internal/kernel-args.h>
#include <metal_integer>
#include <metal_math>
#include <metal_stdlib>

// TODO(ibrahim): This is not optimal as each thread only scatters a single float4. To amortize the
// cost of reading the expert id and offset for a token, we should let each thread scatter several
// float4s.
kernel void gptoss_f32_scatter_e4(
    constant gptoss_scatter_args& args [[ buffer(0) ]],
    const device float* in [[ buffer(1) ]],
    const device gptoss_expert_prediction* __restrict__ expert_predictions [[ buffer(2) ]],
    const device uint* __restrict__ expert_offsets [[ buffer(3) ]],
    const device uint* __restrict__ intra_expert_offsets [[ buffer(4) ]],
    device float* out [[ buffer(5) ]],
    uint3 gid [[thread_position_in_grid]]) 
{
    const uint total_tokens = args.tokens;
    const uint active_experts_per_token = args.active_experts_per_token;
    const uint embedding_dim = args.token_stride;
    assert(embedding_dim % 4 == 0);
    // Hard coded to top4 for now.
    assert(active_experts_per_token == 4);
    const uint row_in = gid.y;
    if (row_in >= total_tokens) {
        return;
    }
    // Consecutive threads in a tg read consecutive columns of the input.
    const uint col_in_vec4 = gid.x;
    const uint col_in = col_in_vec4 * 4;
    if (col_in >= embedding_dim) {
        return;
    }
    // Pointer to the piece of the input that we will copy to the top4 experts.
    const device float4* src4 =
        reinterpret_cast<const device float4*>(in + row_in * embedding_dim + col_in);

    // Get the 4 destinations -- 4 experts.
    const uint base = row_in * active_experts_per_token;
    const uint expert0_id = expert_predictions[base].expert_id;
    const uint expert1_id = expert_predictions[base + 1].expert_id;
    const uint expert2_id = expert_predictions[base + 2].expert_id;
    const uint expert3_id = expert_predictions[base + 3].expert_id;
    const uint expert0_offset = expert_offsets[expert0_id];
    const uint expert1_offset = expert_offsets[expert1_id];
    const uint expert2_offset = expert_offsets[expert2_id];
    const uint expert3_offset = expert_offsets[expert3_id];
    const uint expert0_intra_expert_offset = intra_expert_offsets[base];
    const uint expert1_intra_expert_offset = intra_expert_offsets[base + 1];
    const uint expert2_intra_expert_offset = intra_expert_offsets[base + 2];
    const uint expert3_intra_expert_offset = intra_expert_offsets[base + 3];
    device float4* dst4_0 = reinterpret_cast<device float4*>(
        out + (expert0_offset + expert0_intra_expert_offset) * embedding_dim + col_in);
    device float4* dst4_1 = reinterpret_cast<device float4*>(
        out + (expert1_offset + expert1_intra_expert_offset) * embedding_dim + col_in);
    device float4* dst4_2 = reinterpret_cast<device float4*>(
        out + (expert2_offset + expert2_intra_expert_offset) * embedding_dim + col_in);
    device float4* dst4_3 = reinterpret_cast<device float4*>(
        out + (expert3_offset + expert3_intra_expert_offset) * embedding_dim + col_in);
    const float4 data = *src4;
    *dst4_0 = data;
    *dst4_1 = data;
    *dst4_2 = data;
    *dst4_3 = data;
}


================================================
FILE: gpt_oss/metal/source/sdpa.metal
================================================
#include <metal_geometric>
#include <metal_integer>
#include <metal_math>
#include <metal_compute>
#include <metal_simdgroup>

#include <internal/kernel-args.h>

#pragma METAL fp math_mode(safe)
#pragma METAL fp contract(off)

// Each threadgroup handles 8 Q heads / 1 KV head for 1 token

kernel void gptoss_f32_sdpa_q8_d64(
    constant gptoss_sdpa_args& args [[ buffer(0) ]],
    const device float* q [[ buffer(1) ]],
    const device float* kv [[ buffer(2) ]],
    const device bfloat* s [[ buffer(3) ]],
    device float* output [[ buffer(4) ]],
    const device gptoss_control* control [[ buffer(6) ]],
    threadgroup void* threadgroup_buffer [[ threadgroup(0) ]],
    uint2 gid [[threadgroup_position_in_grid]],
    uint2 tid [[thread_position_in_threadgroup]],
    uint simdgroup_tid [[thread_index_in_simdgroup]],
    uint simdgroup_idx [[simdgroup_index_in_threadgroup]],
    uint num_simdgroups [[simdgroups_per_threadgroup]])
{
    const uint simdgroup_size = 32;
    if (control->abort != 0) {
        return;
    }

    const uint num_q_heads = 64;
    const uint head_dim = 64;
    const uint qmul = 8;

    const uint token_stride = 2 * head_dim;

    const uint qt = gid.x;  // Q token index
    const uint h = gid.y;   // KV head index

    q += qt * args.qkv_dim + h * (qmul * head_dim);
    kv += h * args.kv_stride;
    output += qt * (num_q_heads * head_dim) + h * (qmul * head_dim);

    float m0 = static_cast<float>(s[h * qmul + 0]);
    float m1 = static_cast<float>(s[h * qmul + 1]);
    float m2 = static_cast<float>(s[h * qmul + 2]);
    float m3 = static_cast<float>(s[h * qmul + 3]);
    float m4 = static_cast<float>(s[h * qmul + 4]);
    float m5 = static_cast<float>(s[h * qmul + 5]);
    float m6 = static_cast<float>(s[h * qmul + 6]);
    float m7 = static_cast<float>(s[h * qmul + 7]);

    float l0 = simdgroup_idx == 0 ? 1.0f : 0.0f;
    float l1 = simdgroup_idx == 0 ? 1.0f : 0.0f;
    float l2 = simdgroup_idx == 0 ? 1.0f : 0.0f;
    float l3 = simdgroup_idx == 0 ? 1.0f : 0.0f;
    float l4 = simdgroup_idx == 0 ? 1.0f : 0.0f;
    float l5 = simdgroup_idx == 0 ? 1.0f : 0.0f;
    float l6 = simdgroup_idx == 0 ? 1.0f : 0.0f;
    float l7 = simdgroup_idx == 0 ? 1.0f : 0.0f;

    float2 out0 = 0.0f;
    float2 out1 = 0.0f;
    float2 out2 = 0.0f;
    float2 out3 = 0.0f;
    float2 out4 = 0.0f;
    float2 out5 = 0.0f;
    float2 out6 = 0.0f;
    float2 out7 = 0.0f;

    float2 q0 = reinterpret_cast<const device float2*>(q + 0 * head_dim)[simdgroup_tid];
    float2 q1 = reinterpret_cast<const device float2*>(q + 1 * head_dim)[simdgroup_tid];
    float2 q2 = reinterpret_cast<const device float2*>(q + 2 * head_dim)[simdgroup_tid];
    float2 q3 = reinterpret_cast<const device float2*>(q + 3 * head_dim)[simdgroup_tid];
    float2 q4 = reinterpret_cast<const device float2*>(q + 4 * head_dim)[simdgroup_tid];
    float2 q5 = reinterpret_cast<const device float2*>(q + 5 * head_dim)[simdgroup_tid];
    float2 q6 = reinterpret_cast<const device float2*>(q + 6 * head_dim)[simdgroup_tid];
    float2 q7 = reinterpret_cast<const device float2*>(q + 7 * head_dim)[simdgroup_tid];

    const uint kt_end = qt + args.num_kv_tokens + 1;
    const uint kt_start = metal::subsat(kt_end, args.window) + simdgroup_idx;
    kv += token_stride * kt_start;
    for (uint kt = kt_start; kt < kt_end; kt += num_simdgroups) {
        const float2 kval = reinterpret_cast<const device float2*>(kv)[simdgroup_tid];

        float qk0 = metal::dot(q0, kval);
        float qk1 = metal::dot(q1, kval);
        float qk2 = metal::dot(q2, kval);
        float qk3 = metal::dot(q3, kval);
        float qk4 = metal::dot(q4, kval);
        float qk5 = metal::dot(q5, kval);
        float qk6 = metal::dot(q6, kval);
        float qk7 = metal::dot(q7, kval);

        qk0 = metal::simd_sum(qk0);
        qk1 = metal::simd_sum(qk1);
        qk2 = metal::simd_sum(qk2);
        qk3 = metal::simd_sum(qk3);
        qk4 = metal::simd_sum(qk4);
        qk5 = metal::simd_sum(qk5);
        qk6 = metal::simd_sum(qk6);
        qk7 = metal::simd_sum(qk7);

        const float new_m0 = metal::max(m0, qk0);
        const float new_m1 = metal::max(m1, qk1);
        const float new_m2 = metal::max(m2, qk2);
        const float new_m3 = metal::max(m3, qk3);
        const float new_m4 = metal::max(m4, qk4);
        const float new_m5 = metal::max(m5, qk5);
        const float new_m6 = metal::max(m6, qk6);
        const float new_m7 = metal::max(m7, qk7);

        const float alpha0 = metal::fast::exp(m0 - new_m0);
        const float alpha1 = metal::fast::exp(m1 - new_m1);
        const float alpha2 = metal::fast::exp(m2 - new_m2);
        const float alpha3 = metal::fast::exp(m3 - new_m3);
        const float alpha4 = metal::fast::exp(m4 - new_m4);
        const float alpha5 = metal::fast::exp(m5 - new_m5);
        const float alpha6 = metal::fast::exp(m6 - new_m6);
        const float alpha7 = metal::fast::exp(m7 - new_m7);

        qk0 = metal::fast::exp(qk0 - new_m0);
        qk1 = metal::fast::exp(qk1 - new_m1);
        qk2 = metal::fast::exp(qk2 - new_m2);
        qk3 = metal::fast::exp(qk3 - new_m3);
        qk4 = metal::fast::exp(qk4 - new_m4);
        qk5 = metal::fast::exp(qk5 - new_m5);
        qk6 = metal::fast::exp(qk6 - new_m6);
        qk7 = metal::fast::exp(qk7 - new_m7);

        l0 = metal::fma(l0, alpha0, qk0);
        l1 = metal::fma(l1, alpha1, qk1);
        l2 = metal::fma(l2, alpha2, qk2);
        l3 = metal::fma(l3, alpha3, qk3);
        l4 = metal::fma(l4, alpha4, qk4);
        l5 = metal::fma(l5, alpha5, qk5);
        l6 = metal::fma(l6, alpha6, qk6);
        l7 = metal::fma(l7, alpha7, qk7);

        m0 = new_m0;
        m1 = new_m1;
        m2 = new_m2;
        m3 = new_m3;
        m4 = new_m4;
        m5 = new_m5;
        m6 = new_m6;
        m7 = new_m7;

        const float2 vval = reinterpret_cast<const device float2*>(kv + head_dim)[simdgroup_tid];
        kv += token_stride * num_simdgroups;
        out0 = metal::fma(vval, qk0, out0 * alpha0);
        out1 = metal::fma(vval, qk1, out1 * alpha1);
        out2 = metal::fma(vval, qk2, out2 * alpha2);
        out3 = metal::fma(vval, qk3, out3 * alpha3);
        out4 = metal::fma(vval, qk4, out4 * alpha4);
        out5 = metal::fma(vval, qk5, out5 * alpha5);
        out6 = metal::fma(vval, qk6, out6 * alpha6);
        out7 = metal::fma(vval, qk7, out7 * alpha7);
    }
    if (num_simdgroups > 1) {
        if (metal::simd_is_first()) {
            static_cast<threadgroup float*>(threadgroup_buffer)[0 * num_simdgroups + simdgroup_idx] = m0;
            static_cast<threadgroup float*>(threadgroup_buffer)[1 * num_simdgroups + simdgroup_idx] = m1;
            static_cast<threadgroup float*>(threadgroup_buffer)[2 * num_simdgroups + simdgroup_idx] = m2;
            static_cast<threadgroup float*>(threadgroup_buffer)[3 * num_simdgroups + simdgroup_idx] = m3;
            static_cast<threadgroup float*>(threadgroup_buffer)[4 * num_simdgroups + simdgroup_idx] = m4;
            static_cast<threadgroup float*>(threadgroup_buffer)[5 * num_simdgroups + simdgroup_idx] = m5;
            static_cast<threadgroup float*>(threadgroup_buffer)[6 * num_simdgroups + simdgroup_idx] = m6;
            static_cast<threadgroup float*>(threadgroup_buffer)[7 * num_simdgroups + simdgroup_idx] = m7;

            static_cast<threadgroup float*>(threadgroup_buffer)[ 8 * num_simdgroups + simdgroup_idx] = l0;
            static_cast<threadgroup float*>(threadgroup_buffer)[ 9 * num_simdgroups + simdgroup_idx] = l1;
            static_cast<threadgroup float*>(threadgroup_buffer)[10 * num_simdgroups + simdgroup_idx] = l2;
            static_cast<threadgroup float*>(threadgroup_buffer)[11 * num_simdgroups + simdgroup_idx] = l3;
            static_cast<threadgroup float*>(threadgroup_buffer)[12 * num_simdgroups + simdgroup_idx] = l4;
            static_cast<threadgroup float*>(threadgroup_buffer)[13 * num_simdgroups + simdgroup_idx] = l5;
            static_cast<threadgroup float*>(threadgroup_buffer)[14 * num_simdgroups + simdgroup_idx] = l6;
            static_cast<threadgroup float*>(threadgroup_buffer)[15 * num_simdgroups + simdgroup_idx] = l7;
        }
        metal::threadgroup_barrier(metal::mem_flags::mem_threadgroup);
        // Note: simdgroup refers not to the thread's current simdgroup, but to one with simdgroup_idx == thread's simdgroup_tid.
        float simdgroup_m0 = m0;
        float simdgroup_m1 = m1;
        float simdgroup_m2 = m2;
        float simdgroup_m3 = m3;
        float simdgroup_m4 = m4;
        float simdgroup_m5 = m5;
        float simdgroup_m6 = m6;
        float simdgroup_m7 = m7;
        if (simdgroup_tid < num_simdgroups) {
            simdgroup_m0 = static_cast<const threadgroup float*>(threadgroup_buffer)[0 * num_simdgroups + simdgroup_tid];
            simdgroup_m1 = static_cast<const threadgroup float*>(threadgroup_buffer)[1 * num_simdgroups + simdgroup_tid];
            simdgroup_m2 = static_cast<const threadgroup float*>(threadgroup_buffer)[2 * num_simdgroups + simdgroup_tid];
            simdgroup_m3 = static_cast<const threadgroup float*>(threadgroup_buffer)[3 * num_simdgroups + simdgroup_tid];
            simdgroup_m4 = static_cast<const threadgroup float*>(threadgroup_buffer)[4 * num_simdgroups + simdgroup_tid];
            simdgroup_m5 = static_cast<const threadgroup float*>(threadgroup_buffer)[5 * num_simdgroups + simdgroup_tid];
            simdgroup_m6 = static_cast<const threadgroup float*>(threadgroup_buffer)[6 * num_simdgroups + simdgroup_tid];
            simdgroup_m7 = static_cast<const threadgroup float*>(threadgroup_buffer)[7 * num_simdgroups + simdgroup_tid];
        }

        const float threadgroup_m0 = metal::simd_max(simdgroup_m0);
        const float threadgroup_m1 = metal::simd_max(simdgroup_m1);
        const float threadgroup_m2 = metal::simd_max(simdgroup_m2);
        const float threadgroup_m3 = metal::simd_max(simdgroup_m3);
        const float threadgroup_m4 = metal::simd_max(simdgroup_m4);
        const float threadgroup_m5 = metal::simd_max(simdgroup_m5);
        const float threadgroup_m6 = metal::simd_max(simdgroup_m6);
        const float threadgroup_m7 = metal::simd_max(simdgroup_m7);

        out0 *= metal::fast::exp(m0 - threadgroup_m0);
        out1 *= metal::fast::exp(m1 - threadgroup_m1);
        out2 *= metal::fast::exp(m2 - threadgroup_m2);
        out3 *= metal::fast::exp(m3 - threadgroup_m3);
        out4 *= metal::fast::exp(m4 - threadgroup_m4);
        out5 *= metal::fast::exp(m5 - threadgroup_m5);
        out6 *= metal::fast::exp(m6 - threadgroup_m6);
        out7 *= metal::fast::exp(m7 - threadgroup_m7);

        if (simdgroup_idx == 0) {
            l0 = 0.0f;
            l1 = 0.0f;
            l2 = 0.0f;
            l3 = 0.0f;
            l4 = 0.0f;
            l5 = 0.0f;
            l6 = 0.0f;
            l7 = 0.0f;
            if (simdgroup_tid < num_simdgroups) {
                l0 = static_cast<const threadgroup float*>(threadgroup_buffer)[ 8 * num_simdgroups + simdgroup_tid];
                l1 = static_cast<const threadgroup float*>(threadgroup_buffer)[ 9 * num_simdgroups + simdgroup_tid];
                l2 = static_cast<const threadgroup float*>(threadgroup_buffer)[10 * num_simdgroups + simdgroup_tid];
                l3 = static_cast<const threadgroup float*>(threadgroup_buffer)[11 * num_simdgroups + simdgroup_tid];
                l4 = static_cast<const threadgroup float*>(threadgroup_buffer)[12 * num_simdgroups + simdgroup_tid];
                l5 = static_cast<const threadgroup float*>(threadgroup_buffer)[13 * num_simdgroups + simdgroup_tid];
                l6 = static_cast<const threadgroup float*>(threadgroup_buffer)[14 * num_simdgroups + simdgroup_tid];
                l7 = static_cast<const threadgroup float*>(threadgroup_buffer)[15 * num_simdgroups + simdgroup_tid];
            }

            l0 = metal::simd_sum(l0 * metal::fast::exp(simdgroup_m0 - threadgroup_m0));
            l1 = metal::simd_sum(l1 * metal::fast::exp(simdgroup_m1 - threadgroup_m1));
            l2 = metal::simd_sum(l2 * metal::fast::exp(simdgroup_m2 - threadgroup_m2));
            l3 = metal::simd_sum(l3 * metal::fast::exp(simdgroup_m3 - threadgroup_m3));
            l4 = metal::simd_sum(l4 * metal::fast::exp(simdgroup_m4 - threadgroup_m4));
            l5 = metal::simd_sum(l5 * metal::fast::exp(simdgroup_m5 - threadgroup_m5));
            l6 = metal::simd_sum(l6 * metal::fast::exp(simdgroup_m6 - threadgroup_m6));
            l7 = metal::simd_sum(l7 * metal::fast::exp(simdgroup_m7 - threadgroup_m7));
        }

        uint num_threads = num_simdgroups * simdgroup_size;
        do {
            const uint num_smem_threads = (num_threads / 2) & -simdgroup_size;
            const uint num_half_threads = num_threads - num_smem_threads;

            metal::threadgroup_barrier(metal::mem_flags::mem_threadgroup);
            const uint smem_tid = tid.x - num_half_threads;
            if (smem_tid < num_smem_threads) {
                static_cast<threadgroup float2*>(threadgroup_buffer)[num_smem_threads * 0 + smem_tid] = out0;
                static_cast<threadgroup float2*>(threadgroup_buffer)[num_smem_threads * 1 + smem_tid] = out1;
                static_cast<threadgroup float2*>(threadgroup_buffer)[num_smem_threads * 2 + smem_tid] = out2;
                static_cast<threadgroup float2*>(threadgroup_buffer)[num_smem_threads * 3 + smem_tid] = out3;
                static_cast<threadgroup float2*>(threadgroup_buffer)[num_smem_threads * 4 + smem_tid] = out4;
                static_cast<threadgroup float2*>(threadgroup_buffer)[num_smem_threads * 5 + smem_tid] = out5;
                static_cast<threadgroup float2*>(threadgroup_buffer)[num_smem_threads * 6 + smem_tid] = out6;
                static_cast<threadgroup float2*>(threadgroup_buffer)[num_smem_threads * 7 + smem_tid] = out7;
            }
            metal::threadgroup_barrier(metal::mem_flags::mem_threadgroup);
            if (tid.x < num_smem_threads) {
                out0 += static_cast<const threadgroup float2*>(threadgroup_buffer)[num_smem_threads * 0 + tid.x];
                out1 += static_cast<const threadgroup float2*>(threadgroup_buffer)[num_smem_threads * 1 + tid.x];
                out2 += static_cast<const threadgroup float2*>(threadgroup_buffer)[num_smem_threads * 2 + tid.x];
                out3 += static_cast<const threadgroup float2*>(threadgroup_buffer)[num_smem_threads * 3 + tid.x];
                out4 += static_cast<const threadgroup float2*>(threadgroup_buffer)[num_smem_threads * 4 + tid.x];
                out5 += static_cast<const threadgroup float2*>(threadgroup_buffer)[num_smem_threads * 5 + tid.x];
                out6 += static_cast<const threadgroup float2*>(threadgroup_buffer)[num_smem_threads * 6 + tid.x];
                out7 += static_cast<const threadgroup float2*>(threadgroup_buffer)[num_smem_threads * 7 + tid.x];
            }

            num_threads = num_half_threads;
        } while (num_threads > simdgroup_size);
    }
    if (simdgroup_idx == 0) {
        reinterpret_cast<device float2*>(output + 0 * head_dim)[simdgroup_tid] = out0 / l0;
        reinterpret_cast<device float2*>(output + 1 * head_dim)[simdgroup_tid] = out1 / l1;
        reinterpret_cast<device float2*>(output + 2 * head_dim)[simdgroup_tid] = out2 / l2;
        reinterpret_cast<device float2*>(output + 3 * head_dim)[simdgroup_tid] = out3 / l3;
        reinterpret_cast<device float2*>(output + 4 * head_dim)[simdgroup_tid] = out4 / l4;
        reinterpret_cast<device float2*>(output + 5 * head_dim)[simdgroup_tid] = out5 / l5;
        reinterpret_cast<device float2*>(output + 6 * head_dim)[simdgroup_tid] = out6 / l6;
        reinterpret_cast<device float2*>(output + 7 * head_dim)[simdgroup_tid] = out7 / l7;
    }
}


================================================
FILE: gpt_oss/metal/source/tokenizer.c
================================================
#include <assert.h>
#include <stdatomic.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <errno.h>
#include <sys/mman.h>

#include <gpt-oss.h>

#include "internal/log.h"
#include "internal/model.h"


enum gptoss_status GPTOSS_ABI gptoss_tokenizer_get_special_token_id(
    gptoss_tokenizer_t tokenizer,
    enum gptoss_special_token token_type,
    uint32_t* token_id_out)
{
    uint32_t token_id = UINT32_MAX;
    if (token_type != gptoss_special_token_invalid && token_type < gptoss_special_token_max)
    {
        token_id = tokenizer->special_token_id[(uint32_t) token_type - 1];
    }
    if (token_id == UINT32_MAX) {
        return gptoss_status_invalid_argument;
    }

    *token_id_out = token_id;
    return gptoss_status_success;
}

enum gptoss_status GPTOSS_ABI gptoss_tokenizer_get_num_text_tokens(
    gptoss_tokenizer_t tokenizer,
    uint32_t* num_text_tokens_out)
{
    *num_text_tokens_out = tokenizer->num_text_tokens;
    return gptoss_status_success;
}

enum gptoss_status GPTOSS_ABI gptoss_tokenizer_get_num_special_tokens(
    gptoss_tokenizer_t tokenizer,
    uint32_t* num_special_tokens_out)
{
    *num_special_tokens_out = tokenizer->num_special_tokens;
    return gptoss_status_success;
}

enum gptoss_status GPTOSS_ABI gptoss_tokenizer_get_num_tokens(
    gptoss_tokenizer_t tokenizer,
    uint32_t* num_tokens_out)
{
    *num_tokens_out = tokenizer->num_text_tokens + tokenizer->num_special_tokens;
    return gptoss_status_success;
}

enum gptoss_status GPTOSS_ABI gptoss_tokenizer_decode(
    gptoss_tokenizer_t tokenizer,
    uint32_t token_id,
    const void** token_ptr_out,
    size_t* token_size_out)
{
    if (token_id >= tokenizer->num_text_tokens) {
        return gptoss_status_invalid_argument;
    }

    const char* token_ptr = (const char*) tokenizer->tokens_ptr;
    for (uint32_t t = 0; t < token_id; t++) {
        // Reading unaligned uint16_t
        uint16_t token_length;
        memcpy(&token_length, token_ptr, sizeof(token_length));

        token_ptr += (size_t) token_length + sizeof(uint16_t);
    }

    *token_ptr_out = (const void*) (token_ptr + sizeof(uint16_t));
    *token_size_out = (size_t) *token_ptr;
    return gptoss_status_success;
}

enum gptoss_status GPTOSS_ABI gptoss_tokenizer_retain(
    gptoss_tokenizer_t tokenizer)
{
    atomic_fetch_add_explicit(&tokenizer->ref_count, 1, memory_order_relaxed);
    return gptoss_status_success;
}

enum gptoss_status GPTOSS_ABI gptoss_tokenizer_release(
    gptoss_tokenizer_t tokenizer)
{
    if (tokenizer != NULL) {
        if (atomic_fetch_sub_explicit(&tokenizer->ref_count, 1, memory_order_acquire) == 1) {
            if (tokenizer->mapping_ptr != NULL && tokenizer->mapping_size != 0) {
                if (munmap(tokenizer->mapping_ptr, tokenizer->mapping_size) != 0) {
                    GPTOSS_LOG_WARNING("munmap for tokenizer mapping failed with error %d", errno);
                }
            }

            memset(tokenizer, 0, sizeof(struct gptoss_tokenizer));
            free(tokenizer);
        }
    }
    return gptoss_status_success;
}


================================================
FILE: gpt_oss/metal/source/topk.metal
================================================
#include <metal_compute>
#include <metal_integer>
#include <metal_math>
#include <metal_simdgroup>

#include <internal/kernel-args.h>

#pragma METAL fp math_mode(safe)
#pragma METAL fp contract(off)


[[max_total_threads_per_threadgroup(32)]]
kernel void gptoss_f32_topk_softmax_e128_k4(
    constant gptoss_topk_args& args [[ buffer(0) ]],
    const device float4* input [[ buffer(1) ]],
    device gptoss_expert_prediction* output [[ buffer(2) ]],
    const device gptoss_control* control [[ buffer(3) ]],
    uint gid [[threadgroup_position_in_grid]],
    uint tid [[thread_position_in_threadgroup]])
{
    const uint num_experts = 128;
    const uint num_active_experts = 4;
    if (control->abort != 0) {
        return;
    }

    input += gid * (num_experts / 4);
    output += gid * num_active_experts;

    uint4 idx = tid * 4 + (uint4) {0, 1, 2, 3};
    float4 val = input[tid];

    const float topval0 = metal::simd_max(metal::max3(metal::max(val.x, val.y), val.z, val.w));
    uint idx0 = 0xFFFFFFFFu;
    if (val.w == topval0) {
        idx0 = idx.w;
    }
    if (val.z == topval0) {
        idx0 = idx.z;
    }
    if (val.y == topval0) {
        idx0 = idx.y;
    }
    if (val.x == topval0) {
        idx0 = idx.x;
    }
    const uint topidx0 = metal::simd_min(idx0);
    const bool4 is_topidx0 = idx == topidx0;
    val = metal::select(val, -INFINITY, is_topidx0);
    idx = metal::select(idx, 0xFFFFFFFFu, is_topidx0);

    const float topval1 = metal::simd_max(metal::max3(metal::max(val.x, val.y), val.z, val.w));
    uint idx1 = 0xFFFFFFFFu;
    if (val.w == topval1) {
        idx1 = idx.w;
    }
    if (val.z == topval1) {
        idx1 = idx.z;
    }
    if (val.y == topval1) {
        idx1 = idx.y;
    }
    if (val.x == topval1) {
        idx1 = idx.x;
    }
    const uint topidx1 = metal::simd_min(idx1);
    const bool4 is_topidx1 = idx == topidx1;
    val = metal::select(val, -INFINITY, is_topidx1);
    idx = metal::select(idx, 0xFFFFFFFFu, is_topidx1);

    const float topval2 = metal::simd_max(metal::max3(metal::max(val.x, val.y), val.z, val.w));
    uint idx2 = 0xFFFFFFFFu;
    if (val.w == topval2) {
        idx2 = idx.w;
    }
    if (val.z == topval2) {
        idx2 = idx.z;
    }
    if (val.y == topval2) {
        idx2 = idx.y;
    }
    if (val.x == topval2) {
        idx2 = idx.x;
    }
    const uint topidx2 = metal::simd_min(idx2);
    const bool4 is_topidx2 = idx == topidx2;
    val = metal::select(val, -INFINITY, is_topidx2);
    idx = metal::select(idx, 0xFFFFFFFFu, is_topidx2);

    const float topval3 = metal::simd_max(metal::max3(metal::max(val.x, val.y), val.z, val.w));
    uint idx3 = 0xFFFFFFFFu;
    if (val.w == topval3) {
        idx3 = idx.w;
    }
    if (val.z == topval3) {
        idx3 = idx.z;
    }
    if (val.y == topval3) {
        idx3 = idx.y;
    }
    if (val.x == topval3) {
        idx3 = idx.x;
    }
    const uint topidx3 = metal::simd_min(idx3);

    if (metal::simd_is_first()) {
        const float topexp0 = 1.0f;
        const float topexp1 = metal::precise::exp(topval1 - topval0);
        const float topexp2 = metal::precise::exp(topval2 - topval0);
        const float topexp3 = metal::precise::exp(topval3 - topval0);

        const float sum = (topexp0 + topexp1) + (topexp2 + topexp3);
        const float scale = 1.0 / sum;

        output[0] = (gptoss_expert_prediction) {
            .expert_id = topidx0,
            .score = topexp0 * scale,
        };
        output[1] = (gptoss_expert_prediction) {
            .expert_id = topidx1,
            .score = topexp1 * scale,
        };
        output[2] = (gptoss_expert_prediction) {
            .expert_id = topidx2,
            .score = topexp2 * scale,
        };
        output[3] = (gptoss_expert_prediction) {
            .expert_id = topidx3,
            .score = topexp3 * scale,
        };
    }
}

[[max_total_threads_per_threadgroup(32)]]
kernel void gptoss_f32_topk_softmax_e32_k4(
    constant gptoss_topk_args& args [[ buffer(0) ]],
    const device float* input [[ buffer(1) ]],
    device gptoss_expert_prediction* output [[ buffer(2) ]],
    const device gptoss_control* control [[ buffer(3) ]],
    uint gid [[threadgroup_position_in_grid]],
    uint tid [[thread_position_in_threadgroup]])
{
    const uint num_experts = 32;
    const uint num_active_experts = 4;
    if (control->abort != 0) {
        return;
    }

    input += gid * num_experts;
    output += gid * num_active_experts;

    float val = input[tid];
    uint idx = tid;

    const float topval0 = metal::simd_max(val);
    const uint topidx0 = metal::simd_min(val == topval0 ? idx : 0xFFFFFFFFu);
    if (idx == topidx0) {
        val = -INFINITY;
        idx = 0xFFFFFFFFu;
    }

    const float topval1 = metal::simd_max(val);
    const uint topidx1 = metal::simd_min(val == topval1 ? idx : 0xFFFFFFFFu);
    if (idx == topidx1) {
        val = -INFINITY;
        idx = 0xFFFFFFFFu;
    }

    const float topval2 = metal::simd_max(val);
    const uint topidx2 = metal::simd_min(val == topval2 ? idx : 0xFFFFFFFFu);
    if (idx == topidx2) {
        val = -INFINITY;
        idx = 0xFFFFFFFFu;
    }

    const float topval3 = metal::simd_max(val);
    const uint topidx3 = metal::simd_min(val == topval3 ? idx : 0xFFFFFFFFu);

    if (metal::simd_is_first()) {
        const float topexp0 = 1.0f;
        const float topexp1 = metal::precise::exp(topval1 - topval0);
        const float topexp2 = metal::precise::exp(topval2 - topval0);
        const float topexp3 = metal::precise::exp(topval3 - topval0);

        const float sum = (topexp0 + topexp1) + (topexp2 + topexp3);
        const float scale = 1.0 / sum;

        output[0] = (gptoss_expert_prediction) {
            .expert_id = topidx0,
            .score = topexp0 * scale,
        };
        output[1] = (gptoss_expert_prediction) {
            .expert_id = topidx1,
            .score = topexp1 * scale,
        };
        output[2] = (gptoss_expert_prediction) {
            .expert_id = topidx2,
            .score = topexp2 * scale,
        };
        output[3] = (gptoss_expert_prediction) {
            .expert_id = topidx3,
            .score = topexp3 * scale,
        };
    }
}


================================================
FILE: gpt_oss/metal/test/bf16-f32-embeddings.cc
================================================
#include <gtest/gtest.h>

#include <cstddef>

#include "embeddings-kernel-tester.hpp"


using gptoss::EmbeddingsKernelTester;

constexpr std::size_t kThreadgroupSize = 64;


TEST(BF16_F32_EMBEDDINGS, single_token_single_tile) {
    EmbeddingsKernelTester()
        .num_channels(kThreadgroupSize)
        .threadgroup_size(kThreadgroupSize)
        .TestBF16_F32();
}

TEST(BF16_F32_EMBEDDINGS, single_token_multi_tile) {
    EmbeddingsKernelTester()
        .num_channels(kThreadgroupSize * 4 + 16)
        .threadgroup_size(kThreadgroupSize)
        .TestBF16_F32();
}

TEST(BF16_F32_EMBEDDINGS, multiple_tokens) {
    EmbeddingsKernelTester()
        .num_channels(kThreadgroupSize * 4 + 16)
        .num_tokens(3)
        .threadgroup_size(kThreadgroupSize)
        .TestBF16_F32();
}


================================================
FILE: gpt_oss/metal/test/embeddings-kernel-tester.hpp
================================================
#pragma once

#include <gtest/gtest.h>

#include <cstddef>
#include <cstdint>

#include <internal/datatype.hpp>
#include <internal/metal.hpp>
#include <internal/metal-kernels.h>


namespace gptoss {

class EmbeddingsKernelTester {
public:
    EmbeddingsKernelTester() { }

    EmbeddingsKernelTester(const EmbeddingsKernelTester&) = delete;
    EmbeddingsKernelTester(EmbeddingsKernelTester&&) = delete;
    EmbeddingsKernelTester& operator=(const EmbeddingsKernelTester&) = delete;
    EmbeddingsKernelTester& operator=(EmbeddingsKernelTester&&) = delete;

    [[nodiscard]]
    EmbeddingsKernelTester& num_channels(std::uint32_t num_channels) {
        num_channels_ = num_channels;
        return *this;
    }

    std::uint32_t num_channels() const {
        return num_channels_;
    }

    [[nodiscard]]
    EmbeddingsKernelTester& num_tokens(std::uint32_t num_tokens) {
        num_tokens_ = num_tokens;
        return *this;
    }

    std::uint32_t num_tokens() const {
        return num_tokens_;
    }

    std::uint32_t vocabulary_size() const {
        return num_tokens() + 1;
    }

    [[nodiscard]]
    EmbeddingsKernelTester& threadgroup_size(std::size_t threadgroup_size) {
        threadgroup_size_ = threadgroup_size;
        return *this;
    }

    std::size_t threadgroup_size() const {
        return threadgroup_size_;
    }

    void Validate() const {
        ASSERT_NE(num_channels(), 0);
        ASSERT_NE(num_tokens(), 0);
        ASSERT_NE(threadgroup_size(), 0);
        ASSERT_EQ(threadgroup_size() % 32, 0);
    }

    void TestBF16_F32() const {
        Validate();

        metal::CommandBuffer command_buffer{command_queue_};
        metal::Buffer token_buffer{device_, sizeof(std::uint32_t)};
        metal::Buffer weight_buffer{device_, vocabulary_size() * num_channels() * sizeof(gptoss_bfloat16)};
        metal::Buffer output_buffer{device_, num_channels() * sizeof(float)};
        metal::Buffer control_buffer{device_, sizeof(gptoss_control)};
        std::memset(control_buffer.ptr(), 0, sizeof(gptoss_control));

        std::uint32_t* token_ptr = static_cast<std::uint32_t*>(token_buffer.ptr());
        for (std::uint32_t t = 0; t < num_tokens(); t++) {
            token_ptr[t] = t + 1;
        }

        Check(gptoss_metal_command_buffer_encode_launch_bf16_f32_embeddings(
                command_buffer.handle(),
                bf16_f32_embeddings_fn.handle(),
                threadgroup_size(),
                token_buffer.handle(),
                /*token_offset=*/0,
                weight_buffer.handle(),
                /*weight_offset=*/0,
                output_buffer.handle(),
                /*output_offset=*/0,
                control_buffer.handle(),
                /*control_offset=*/0,
                num_tokens(),
                num_channels()),
            "gptoss_metal_command_buffer_encode_launch_bf16_f32_embeddings");

        command_buffer.commit();
        command_buffer.wait_completion();

        const gptoss_bfloat16* weight_ptr = static_cast<const gptoss_bfloat16*>(weight_buffer.ptr());
        const float* output_ptr = static_cast<const float*>(output_buffer.ptr());
        for (std::uint32_t t = 0; t < num_tokens(); t++) {
            const std::uint32_t token = token_ptr[t];
            for (std::uint32_t i = 0; i < num_channels(); i++) {
                const gptoss_bfloat16 input_val = weight_ptr[token * num_channels() + i];
                const float ref_output = upcast<float>(input_val);
                const float output = output_ptr[t * num_channels() + i];
                ASSERT_EQ(output, ref_output)
                    << "at token " << t << ", position " << i << " / " << num_channels() << ", input " << std::uint32_t(input_val.bits);
            }
        }
    }

private:
    metal::Device device_{};
    metal::CommandQueue command_queue_{device_};
    metal::Library library_{device_};
    metal::Function bf16_f32_embeddings_fn{library_, "gptoss_bf16_f32_embeddings"};
    std::uint32_t num_tokens_{1};
    std::uint32_t num_channels_{1};
    std::size_t threadgroup_size_{32};
};

}  // namespace gptoss


================================================
FILE: gpt_oss/metal/test/f32-bf16w-matmul.cc
================================================
#include <gtest/gtest.h>

#include <cstddef>
#include <cstdint>

#include "matmul-kernel-tester.hpp"


using gptoss::MatMulKernelTester;

constexpr size_t kSimdgroupSize = 32;  // fixed in the kernel

TEST(F32_BF16W_MATMUL, single_simdgroup_single_iteration) {
    MatMulKernelTester()
        .num_rows(1)
        .num_cols(kSimdgroupSize * 4)
        .threadgroup_size(kSimdgroupSize)
        .TestF32_BF16W();
}

TEST(F32_BF16W_MATMUL, single_simdgroup_multiple_iteration) {
    MatMulKernelTester()
        .num_rows(1)
        .num_cols((2 * kSimdgroupSize + 1) * 4)
        .threadgroup_size(kSimdgroupSize)
        .TestF32_BF16W();
}

TEST(F32_BF16W_MATMUL, single_threadgroup) {
    constexpr std::size_t threadgroup_size = 2 * kSimdgroupSize;

    MatMulKernelTester()
        .num_rows(threadgroup_size / kSimdgroupSize)
        .num_cols((2 * kSimdgroupSize + 1) * 4)
        .threadgroup_size(threadgroup_size)
        .TestF32_BF16W();
}

TEST(F32_BF16W_MATMUL, multiple_threadgroups) {
    constexpr std::size_t threadgroup_size = 2 * kSimdgroupSize;
    constexpr std::uint32_t num_threadgroups = 3;

    MatMulKernelTester()
        .num_rows(num_threadgroups * threadgroup_size / kSimdgroupSize)
        .num_cols((2 * kSimdgroupSize + 1) * 4)
        .threadgroup_size(threadgroup_size)
        .TestF32_BF16W();
}

TEST(F32_BF16W_MATMUL, multiple_tokens) {
    constexpr std::size_t threadgroup_size = 2 * kSimdgroupSize;
    constexpr std::uint32_t num_threadgroups = 3;

    MatMulKernelTester()
        .num_rows(num_threadgroups * threadgroup_size / kSimdgroupSize)
        .num_cols((2 * kSimdgroupSize + 1) * 4)
        .num_tokens(2)
        .threadgroup_size(threadgroup_size)
        .TestF32_BF16W();
}

TEST(F32_BF16W_DENSE_MATMUL_QKV, seq_len_1024) {
    MatMulKernelTester()
        .num_tokens(1024)
        .num_rows(5120)
        .num_cols(2880)
        .TestF32_BF16W(
            MatMulKernelTester::MatMulKernelType::PREFILL_QKV_OPTIMIZED);
}

TEST(F32_BF16W_DENSE_MATMUL_ATTN_OUTPUT, seq_len_1024) {
    MatMulKernelTester()
        .num_tokens(1024)
        .num_rows(2880)
        .num_cols(4096)
        .TestF32_BF16W(
            MatMulKernelTester::MatMulKernelType::PREFILL_ATTN_OUTPUT_OPTIMIZED);
}

TEST(F32_BF16W_DENSE_MATMUL_MLP_GATE, seq_len_1024) {
    MatMulKernelTester()
        .num_tokens(1024)
        .num_rows(128)
        .num_cols(2880)
        .TestF32_BF16W(
            MatMulKernelTester::MatMulKernelType::PREFILL_MLP_GATE_OPTIMIZED);
}

================================================
FILE: gpt_oss/metal/test/f32-bf16w-rmsnorm.cc
================================================
#include <gtest/gtest.h>

#include <cstdint>

#include "rmsnorm-kernel-tester.hpp"


using gptoss::RMSNormKernelTester;

constexpr std::uint32_t kThreadgroupSize = 1024;  // fixed in the kernel
constexpr std::uint32_t kVectorSize = 4;  // fixed in the kernel

TEST(F32_BF16W_RMSNORM, single_iteration) {
    RMSNormKernelTester()
        .num_channels(kThreadgroupSize)
        .TestF32_BF16W();
}

TEST(F32_BF16W_RMSNORM, multiple_iterations) {
    RMSNormKernelTester()
        .num_channels(kThreadgroupSize * 2)
        .TestF32_BF16W();
}

TEST(F32_BF16W_RMSNORM, partial_iteration) {
    RMSNormKernelTester()
        .num_channels(kThreadgroupSize * 2 + kVectorSize)
        .TestF32_BF16W();
}

TEST(F32_BF16W_RMSNORM, multiple_tokens) {
    RMSNormKernelTester()
        .num_tokens(3)
        .num_channels(kThreadgroupSize * 2 + kVectorSize)
        .TestF32_BF16W();
}


================================================
FILE: gpt_oss/metal/test/f32-random.cc
================================================
#include <gtest/gtest.h>

#include <cmath>

#include <internal/metal.hpp>
#include <internal/metal-kernels.h>
#include <internal/rng.hpp>

using gptoss::Check;
using namespace gptoss::metal;


constexpr uint64_t kSeed = UINT64_C(1019827666124465388);
constexpr uint64_t kOffset = UINT64_C(12345678901234567890);
constexpr float kMin = -1.0f;
constexpr float kMax = +1.5f;
constexpr float kScale = (kMax - kMin) * 0.5f;
constexpr float kBias = (kMin + kMax) * 0.5f;
constexpr size_t kThreadgroupSize = 128;

TEST(F32_FILL_RANDOM, single_threadgroup_single_iteration) {
    constexpr size_t num_bytes = kThreadgroupSize * 16;
    constexpr size_t num_elements = num_bytes / sizeof(uint32_t);

    Device device;
    CommandQueue command_queue{device};
    CommandBuffer command_buffer{command_queue};
    Library library{device};
    Function f32_fill_random_fn{library, "gptoss_f32_fill_random"};
    Buffer buffer{device, num_elements * sizeof(float)};

    Check(gptoss_metal_command_buffer_encode_launch_f32_fill_random(
            command_buffer.handle(),
            f32_fill_random_fn.handle(),
            /*threadgroup_size=*/kThreadgroupSize,
            /*max_threadgroups=*/1,
            /*output_buffer=*/buffer.handle(),
            /*output_offset=*/0,
            num_elements, kSeed, kOffset, kMin, kMax),
        "gptoss_metal_command_buffer_encode_launch_f32_fill_random");

    command_buffer.commit();
    command_buffer.wait_completion();

    const float* output_ptr = static_cast<const float*>(buffer.ptr());
    for (size_t i = 0; i < num_elements; i++) {
        const uint32_t ref_word = gptoss::rng::squares32(kOffset + i, kSeed);
        const float ref_float = static_cast<int32_t>(ref_word) * 0x1.0p-31f;
        const float ref_value = std::fma(ref_float, kScale, kBias);
        ASSERT_EQ(output_ptr[i], ref_value)
            << "at position " << i << " / " << num_elements;
    }
}

TEST(F32_FILL_RANDOM, single_threadgroup_multiple_iterations) {
    constexpr size_t num_iterations = 3;
    constexpr size_t num_bytes = num_iterations * kThreadgroupSize * 16;
    constexpr size_t num_elements = num_bytes / sizeof(uint32_t);

    Device device;
    CommandQueue command_queue{device};
    CommandBuffer command_buffer{command_queue};
    Library library{device};
    Function f32_fill_random_fn{library, "gptoss_f32_fill_random"};
    Buffer buffer{device, num_elements * sizeof(float)};

    Check(gptoss_metal_command_buffer_encode_launch_f32_fill_random(
            command_buffer.handle(),
            f32_fill_random_fn.handle(),
            /*threadgroup_size=*/kThreadgroupSize,
            /*max_threadgroups=*/1,
            /*output_buffer=*/buffer.handle(),
            /*output_offset=*/0,
            num_elements, kSeed, kOffset, kMin, kMax),
        "gptoss_metal_command_buffer_encode_launch_f32_fill_random");

    command_buffer.commit();
    command_buffer.wait_completion();

    const float* output_ptr = static_cast<const float*>(buffer.ptr());
    for (size_t i = 0; i < num_elements; i++) {
        const uint32_t ref_word = gptoss::rng::squares32(kOffset + i, kSeed);
        const float ref_float = static_cast<int32_t>(ref_word) * 0x1.0p-31f;
        const float ref_value = std::fma(ref_float, kScale, kBias);
        ASSERT_EQ(output_ptr[i], ref_value)
            << "at position " << i << " / " << num_elements;
    }
}

TEST(F32_FILL_RANDOM, multiple_threadgroups_multiple_iterations) {
    constexpr size_t num_iterations = 3;
    constexpr size_t num_threadgroups = 2;
    constexpr size_t num_bytes = num_iterations * num_threadgroups * kThreadgroupSize * 16;
    constexpr size_t num_elements = num_bytes / sizeof(uint32_t);

    Device device;
    CommandQueue command_queue{device};
    CommandBuffer command_buffer{command_queue};
    Library library{device};
    Function f32_fill_random_fn{library, "gptoss_f32_fill_random"};
    Buffer buffer{device, num_elements * sizeof(float)};

    Check(gptoss_metal_command_buffer_encode_launch_f32_fill_random(
            command_buffer.handle(),
            f32_fill_random_fn.handle(),
            /*threadgroup_size=*/kThreadgroupSize,
            /*max_threadgroups=*/num_threadgroups,
            /*output_buffer=*/buffer.handle(),
            /*output_offset=*/0,
            num_elements, kSeed, kOffset, kMin, kMax),
        "gptoss_metal_command_buffer_encode_launch_f32_fill_random");

    command_buffer.commit();
    command_buffer.wait_completion();

    const float* output_ptr = static_cast<const float*>(buffer.ptr());
    for (size_t i = 0; i < num_elements; i++) {
        const uint32_t ref_word = gptoss::rng::squares32(kOffset + i, kSeed);
        const float ref_float = static_cast<int32_t>(ref_word) * 0x1.0p-31f;
        const float ref_value = std::fma(ref_float, kScale, kBias);
        ASSERT_EQ(output_ptr[i], ref_value)
            << "at position " << i << " / " << num_elements;
    }
}

TEST(F32_FILL_RANDOM, excessive_threadgroups) {
    constexpr size_t num_bytes = kThreadgroupSize * 16;
    constexpr size_t num_elements = num_bytes / sizeof(uint32_t);

    Device device;
    CommandQueue command_queue{device};
    CommandBuffer command_buffer{command_queue};
    Library library{device};
    Function f32_fill_random_fn{library, "gptoss_f32_fill_random"};
    Buffer buffer{device, num_elements * sizeof(float)};

    Check(gptoss_metal_command_buffer_encode_launch_f32_fill_random(
            command_buffer.handle(),
            f32_fill_random_fn.handle(),
            /*threadgroup_size=*/kThreadgroupSize,
            /*max_threadgroups=*/2,
            /*output_buffer=*/buffer.handle(),
            /*output_offset=*/0,
            num_elements, kSeed, kOffset, kMin, kMax),
        "gptoss_metal_command_buffer_encode_launch_f32_fill_random");

    command_buffer.commit();
    command_buffer.wait_completion();

    const float* output_ptr = static_cast<const float*>(buffer.ptr());
    for (size_t i = 0; i < num_elements; i++) {
        const uint32_t ref_word = gptoss::rng::squares32(kOffset + i, kSeed);
        const float ref_float = static_cast<int32_t>(ref_word) * 0x1.0p-31f;
        const float ref_value = std::fma(ref_float, kScale, kBias);
        ASSERT_EQ(output_ptr[i], ref_value)
            << "at position " << i << " / " << num_elements;
    }
}

TEST(F32_FILL_RANDOM, nonuniform_range) {
    constexpr size_t num_iterations = 3;
    constexpr size_t num_threadgroups = 2;
    constexpr size_t num_bytes = (num_iterations * num_threadgroups + 1) * kThreadgroupSize * 16;
    constexpr size_t num_elements = num_bytes / sizeof(uint32_t);

    Device device;
    CommandQueue command_queue{device};
    CommandBuffer command_buffer{command_queue};
    Library library{device};
    Function f32_fill_random_fn{library, "gptoss_f32_fill_random"};
    Buffer buffer{device, num_elements * sizeof(float)};

    Check(gptoss_metal_command_buffer_encode_launch_f32_fill_random(
            command_buffer.handle(),
            f32_fill_random_fn.handle(),
            /*threadgroup_size=*/kThreadgroupSize,
            /*max_threadgroups=*/num_threadgroups,
            /*output_buffer=*/buffer.handle(),
            /*output_offset=*/0,
            num_elements, kSeed, kOffset, kMin, kMax),
        "gptoss_metal_command_buffer_encode_launch_f32_fill_random");

    command_buffer.commit();
    command_buffer.wait_completion();

    const float* output_ptr = static_cast<const float*>(buffer.ptr());
    for (size_t i = 0; i < num_elements; i++) {
        const uint32_t ref_word = gptoss::rng::squares32(kOffset + i, kSeed);
        const float ref_float = static_cast<int32_t>(ref_word) * 0x1.0p-31f;
        const float ref_value = std::fma(ref_float, kScale, kBias);
        ASSERT_EQ(output_ptr[i], ref_value)
            << "at position " << i << " / " << num_elements;
    }
}

TEST(F32_FILL_RANDOM, partial_range) {
    constexpr size_t num_iterations = 3;
    constexpr size_t num_threadgroups = 2;
    constexpr size_t num_bytes = (num_iterations * num_threadgroups * kThreadgroupSize + 1) * 16;
    constexpr size_t num_elements = num_bytes / sizeof(uint32_t);

    Device device;
    CommandQueue command_queue{device};
    CommandBuffer command_buffer{command_queue};
    Library library{device};
    Function f32_fill_random_fn{library, "gptoss_f32_fill_random"};
    Buffer buffer{device, num_elements * sizeof(float)};

    Check(gptoss_metal_command_buffer_encode_launch_f32_fill_random(
            command_buffer.handle(),
            f32_fill_random_fn.handle(),
            /*threadgroup_size=*/kThreadgroupSize,
            /*max_threadgroups=*/num_threadgroups,
            /*output_buffer=*/buffer.handle(),
            /*output_offset=*/0,
            num_elements, kSeed, kOffset, kMin, kMax),
        "gptoss_metal_command_buffer_encode_launch_f32_fill_random");

    command_buffer.commit();
    command_buffer.wait_completion();

    const float* output_ptr = static_cast<const float*>(buffer.ptr());
    for (size_t i = 0; i < num_elements; i++) {
        const uint32_t ref_word = gptoss::rng::squares32(kOffset + i, kSeed);
        const float ref_float = static_cast<int32_t>(ref_word) * 0x1.0p-31f;
        const float ref_value = std::fma(ref_float, kScale, kBias);
        ASSERT_EQ(output_ptr[i], ref_value)
            << "at position " << i << " / " << num_elements;
    }
}


================================================
FILE: gpt_oss/metal/test/f32-rope.cc
================================================
#include <gtest/gtest.h>

#include <cstddef>
#include <cstdint>

#include "rope-kernel-tester.hpp"


using gptoss::RoPEKernelTester;

constexpr float kFrequencyBase = 50000.0f;
constexpr std::uint32_t kHeadDim = 64;  // fixed in the kernel
constexpr std::uint32_t kTokenOffset = 7;


TEST(F32_ROPE, single_simdgroup) {
    RoPEKernelTester()
        .head_dim(kHeadDim)
        .num_q_heads(1)
        .num_kv_heads(0)
        .token_offset(kTokenOffset)
        .frequency_base(kFrequencyBase)
        .threadgroup_size(kHeadDim / 2)
        .TestF32();
}

TEST(F32_ROPE, single_threadgroup) {
    constexpr std::size_t threadgroup_size = 64;
    constexpr std::uint32_t num_heads = threadgroup_size / (kHeadDim / 2);

    RoPEKernelTester()
        .head_dim(kHeadDim)
        .num_q_heads(num_heads)
        .num_kv_heads(0)
        .token_offset(kTokenOffset)
        .frequency_base(kFrequencyBase)
        .threadgroup_size(threadgroup_size)
        .TestF32();
}

TEST(F32_ROPE, multiple_threadgroups) {
    constexpr std::uint32_t num_threadgroups = 3;
    constexpr std::size_t threadgroup_size = 64;
    constexpr std::uint32_t num_heads = num_threadgroups * (threadgroup_size / (kHeadDim / 2));

    RoPEKernelTester()
        .head_dim(kHeadDim)
        .num_q_heads(num_heads)
        .num_kv_heads(0)
        .token_offset(kTokenOffset)
        .frequency_base(kFrequencyBase)
        .threadgroup_size(threadgroup_size)
        .TestF32();
}

TEST(F32_ROPE, multiple_tokens) {
    constexpr std::uint32_t num_tokens = 2;
    constexpr std::uint32_t num_threadgroups = 3;
    constexpr std::size_t threadgroup_size = 64;
    constexpr std::uint32_t num_heads = num_threadgroups * (threadgroup_size / (kHeadDim / 2));

    RoPEKernelTester()
        .head_dim(kHeadDim)
        .num_tokens(2)
        .num_q_heads(num_heads)
        .num_kv_heads(0)
        .token_offset(kTokenOffset)
        .frequency_base(kFrequencyBase)
        .threadgroup_size(threadgroup_size)
        .TestF32();
}


================================================
FILE: gpt_oss/metal/test/fill-random-kernel-tester.hpp
================================================
#pragma once

#include <gtest/gtest.h>

#include <cstddef>
#include <cstdint>

#include <internal/datatype.hpp>
#include <internal/metal.hpp>
#include <internal/metal-kernels.h>
#include <internal/rng.hpp>


namespace gptoss {

class FillRandomKernelTester {
public:
    FillRandomKernelTester() { }

    FillRandomKernelTester(const FillRandomKernelTester&) = delete;
    FillRandomKernelTester(FillRandomKernelTester&&) = delete;
    FillRandomKernelTester& operator=(const FillRandomKernelTester&) = delete;
    FillRandomKernelTester& operator=(FillRandomKernelTester&&) = delete;

    [[nodiscard]]
    FillRandomKernelTester& num_elements(std::uint32_t num_elements) {
        num_elements_ = num_elements;
        return *this;
    }

    std::uint32_t num_elements() const {
        return num_elements_;
    }

    [[nodiscard]]
    FillRandomKernelTester& threadgroup_size(std::size_t threadgroup_size) {
        threadgroup_size_ = threadgroup_size;
        return *this;
    }

    std::size_t threadgroup_size() const {
        return threadgroup_size_;
    }

    [[nodiscard]]
    FillRandomKernelTester& max_threadgroups(std::size_t max_threadgroups) {
        max_threadgroups_ = max_threadgroups;
        return *this;
    }

    std::size_t max_threadgroups() const {
        return max_threadgroups_;
    }

    void Validate() const {
        ASSERT_NE(num_elements(), 0);
        ASSERT_NE(threadgroup_size(), 0);
        ASSERT_NE(max_threadgroups(), 0);
    }

    void TestU32() const {
        Validate();

        metal::Buffer output_buffer{device_, num_elements() * sizeof(std::uint32_t)};

        metal::CommandBuffer command_buffer{command_queue_};
        command_buffer.encode_launch_u32_fill_random(
            u32_fill_random_fn_,
            threadgroup_size(),
            max_threadgroups(),
            output_buffer,
            /*output_offset=*/0,
            num_elements(), kSeed, kOffset);

        command_buffer.commit();
        command_buffer.wait_completion();

        const std::uint32_t* output_ptr = static_cast<const std::uint32_t*>(output_buffer.ptr());
        for (std::size_t i = 0; i < num_elements(); i++) {
            const std::uint32_t ref_value = gptoss::rng::squares32(kOffset + i, kSeed);
            ASSERT_EQ(output_ptr[i], ref_value)
                << "at position " << i << " / " << num_elements();
        }
    }

private:
    static constexpr uint64_t kSeed{UINT64_C(1019827666124465388)};
    static constexpr uint64_t kOffset{UINT64_C(12345678901234567890)};

    metal::Device device_{};
    metal::CommandQueue command_queue_{device_};
    metal::Library library_{device_};
    metal::Function f32_fill_random_fn_{library_, "gptoss_f32_fill_random"};
    metal::Function bf16_fill_random_fn_{library_, "gptoss_bf16_fill_random"};
    metal::Function u32_fill_random_fn_{library_, "gptoss_u32_fill_random"};
    std::uint32_t num_elements_{1};
    std::size_t threadgroup_size_{32};
    std::size_t max_threadgroups_{1};
};

}  // namespace gptoss


================================================
FILE: gpt_oss/metal/test/matmul-kernel-tester.hpp
================================================
#pragma once

#include <gtest/gtest.h>

#include <cmath>
#include <cstddef>
#include <cstdint>

#include <internal/datatype.hpp>
#include <internal/metal.hpp>
#include <internal/metal-kernels.h>

namespace gptoss {

template <typename T>
::testing::AssertionResult
IsNearAbsRel(const char* a_expr, const char* b_expr, const char* abs_expr,
             const char* rel_expr, T a, T b, T abs_tol, T rel_tol = 1.0) {

    using std::abs;
    if (!std::isfinite(a) || !std::isfinite(b)) {
        return ::testing::AssertionFailure()
               << "Non-finite value(s): " << a_expr << "=" << a << ", " << b_expr
               << "=" << b;
        // At least one of abs_tol and rel_tol must be provided
    }
    const T diff = abs(a - b);
    const T rel = rel_tol * std::max(abs(a), abs(b));
    const T thr = std::max(abs_tol, rel);

    if (diff <= thr)
        return ::testing::AssertionSuccess();

    return ::testing::AssertionFailure()
           << a_expr << " vs " << b_expr << " differ by " << diff
           << " > max(abs_tol=" << abs_tol << ", rel_tol*max(|a|,|b|)=" << rel
           << ") with " << abs_expr << "=" << abs_tol << ", " << rel_expr << "="
           << rel_tol << ". \n"
           << a_expr << "=" << a << ". \n"
           << b_expr << "=" << b;
}

#define ASSERT_NEAR_ABS_REL(a, b, abs_tol, rel_tol) \
    ASSERT_PRED_FORMAT4(IsNearAbsRel<double>, a, b, abs_tol, rel_tol)

class MatMulKernelTester {
public:
    MatMulKernelTester() { }

    MatMulKernelTester(const MatMulKernelTester&) = delete;
    MatMulKernelTester(MatMulKernelTester&&) = delete;
    MatMulKernelTester& operator=(const MatMulKernelTester&) = delete;
    MatMulKernelTester& operator=(MatMulKernelTester&&) = delete;

    [[nodiscard]]
    MatMulKernelTester& num_rows(std::uint32_t num_rows) {
        num_rows_ = num_rows;
        return *this;
    }

    std::uint32_t num_rows() const {
        return num_rows_;
    }

    [[nodiscard]]
    MatMulKernelTester& num_cols(std::uint32_t num_cols) {
        num_cols_ = num_cols;
        return *this;
    }

    std::uint32_t num_cols() const {
        return num_cols_;
    }

    [[nodiscard]]
    MatMulKernelTester& num_tokens(std::uint32_t num_tokens) {
        num_tokens_ = num_tokens;
        return *this;
    }

    std::uint32_t num_tokens() const {
        return num_tokens_;
    }

    [[nodiscard]]
    MatMulKernelTester& threadgroup_size(std::size_t threadgroup_size) {
        threadgroup_size_ = threadgroup_size;
        return *this;
    }

    std::size_t threadgroup_size() const {
        return threadgroup_size_;
    }

    void Validate(std::uint32_t vec_size) const {
        ASSERT_NE(num_rows(), 0);
        ASSERT_NE(num_cols(), 0);
        ASSERT_EQ(num_cols() % vec_size, 0);
        ASSERT_NE(num_tokens(), 0);
        ASSERT_NE(threadgroup_size(), 0);
    }

    enum class MatMulKernelType {
        DECODE_OPTIMIZED,
        PREFILL_QKV_OPTIMIZED,
        PREFILL_ATTN_OUTPUT_OPTIMIZED,
        PREFILL_MLP_GATE_OPTIMIZED,
    };

    void TestF32_BF16W(MatMulKernelType kernel_type = MatMulKernelType::DECODE_OPTIMIZED) const {
        Validate(/*vec_size=*/4);

        metal::CommandBuffer command_buffer_initialize{command_queue_};
        metal::Buffer input_buffer{device_, num_tokens() * num_cols() * sizeof(float)};
        metal::Buffer weight_buffer{device_, num_rows() * num_cols() * sizeof(gptoss_bfloat16)};
        metal::Buffer bias_buffer{device_, num_rows() * sizeof(gptoss_bfloat16)};
        metal::Buffer output_buffer{device_, num_tokens() * num_rows() * sizeof(float)};
        metal::Buffer output_buffer_copy{device_, num_tokens() * num_rows() * sizeof(float)};
        // KV cache buffer for PREFILL_QKV_OPTIMIZED: assume head_dim=64, num_kv_heads=8
        const std::uint32_t kHeadDim = 64;
        const std::uint32_t kNumKvHeads = 8;
        metal::Buffer kv_cache_buffer{device_, static_cast<std::size_t>(kNumKvHeads) * num_tokens() * 2 * kHeadDim * sizeof(float)};
        metal::Buffer control_buffer{device_, sizeof(gptoss_control)};
        std::memset(control_buffer.ptr(), 0, sizeof(gptoss_control));

        command_buffer_initialize.encode_launch_f32_fill_random(
            f32_fill_random_fn_,
            /*threadgroup_size=*/0,
            /*max_threadgroups=*/kFillRandomMaxThreadgroups,
            /*output_buffer=*/input_buffer,
            /*output_offset=*/0,
            num_tokens() * num_cols(), kSeed, /*offset=*/0, /*min=*/-1.0f, /*max=*/1.0);

        command_buffer_initialize.encode_launch_bf16_fill_random(
            bf16_fill_random_fn_,
            /*threadgroup_size=*/0,
            /*max_threadgroups=*/kFillRandomMaxThreadgroups,
            /*output_buffer=*/weight_buffer,
            /*output_offset=*/0,
            num_rows() * num_cols(), kSeed + 1, /*offset=*/0, /*min=*/-1.0f, /*max=*/1.0);

        command_buffer_initialize.encode_launch_bf16_fill_random(
            bf16_fill_random_fn_,
            /*threadgroup_size=*/0,
            /*max_threadgroups=*/kFillRandomMaxThreadgroups,
            /*output_buffer=*/bias_buffer,
            /*output_offset=*/0,
            num_rows(), kSeed + 2, /*offset=*/0, /*min=*/-1.0f, /*max=*/1.0);

        // Fill output buffer with random values to test matmul with add.
        command_buffer_initialize.encode_launch_f32_fill_random(
            f32_fill_random_fn_,
            /*threadgroup_size=*/0,
            /*max_threadgroups=*/kFillRandomMaxThreadgroups,
            /*output_buffer=*/output_buffer,
            /*output_offset=*/0, num_tokens() * num_rows(), kSeed + 3,
            /*offset=*/0,
            /*min=*/-1.0f, /*max=*/1.0);
        command_buffer_initialize.commit();
        command_buffer_initialize.wait_completion();
        if (kernel_type ==
            MatMulKernelType::PREFILL_ATTN_OUTPUT_OPTIMIZED) {
            // Copy output buffer to output buffer copy to use when calculating reference.
            const uint64_t bytes =
                uint64_t(num_tokens()) * uint64_t(num_rows()) * sizeof(float);

            void* src = output_buffer.ptr();
            void* dst = output_buffer_copy.ptr();
            assert(src && dst && "Buffers must be CPU-mappable for memcpy");

            std::memcpy(reinterpret_cast<std::byte*>(dst),
                        reinterpret_cast<const std::byte*>(src), bytes);
        }

        metal::CommandBuffer command_buffer_compute{command_queue_};
        switch (kernel_type) {
        case MatMulKernelType::DECODE_OPTIMIZED:
            Check(gptoss_metal_command_buffer_encode_launch_f32_bf16w_matmul(
                      command_buffer_compute.handle(), f32_bf16w_matmul_fn_.handle(),
                      /*threadgroup_size=*/threadgroup_size(), input_buffer.handle(),
                      /*input_offset=*/0, weight_buffer.handle(),
                      /*weight_offset=*/0, bias_buffer.handle(),
                      /*bias_offset=*/0, output_buffer.handle(),
                      /*output_offset=*/0, control_buffer.handle(),
                      /*control_offset=*/0, num_tokens(), num_cols(), num_rows()),
                  "gptoss_metal_command_buffer_encode_launch_f32_bf16w_matmul");
            break;
        case MatMulKernelType::PREFILL_QKV_OPTIMIZED:
            Check(
                gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_qkv(
                    command_buffer_compute.handle(),
                    f32_bf16w_dense_matmul_qkv_fn_.handle(), input_buffer.handle(),
                    /*input_offset=*/0, weight_buffer.handle(),
                    /*weight_offset=*/0, bias_buffer.handle(),
                    /*bias_offset=*/0, output_buffer.handle(),
                    /*output_offset=*/0, kv_cache_buffer.handle(),
                    /*kv_offset=*/0, control_buffer.handle(),
                    /*control_offset=*/0, num_tokens(), num_cols(), num_rows(),
                    /*max_tokens=*/num_tokens(), /*token_offset=*/0),
                "gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_qkv");
            break;
        case MatMulKernelType::PREFILL_ATTN_OUTPUT_OPTIMIZED:
            Check(
                gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_attn_output(
                    command_buffer_compute.handle(),
                    f32_bf16w_dense_matmul_attn_output_fn_.handle(),
                    input_buffer.handle(),
                    /*input_offset=*/0, weight_buffer.handle(),
                    /*weight_offset=*/0, bias_buffer.handle(),
                    /*bias_offset=*/0, output_buffer.handle(),
                    /*output_offset=*/0, control_buffer.handle(),
                    /*control_offset=*/0, num_tokens(), num_cols(), num_rows()),
                "gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_attn_output");
            break;
        case MatMulKernelType::PREFILL_MLP_GATE_OPTIMIZED:
            Check(
                gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_mlp_gate(
                    command_buffer_compute.handle(),
                    f32_bf16w_dense_matmul_mlp_gate_fn_.handle(),
                    input_buffer.handle(),
                    /*input_offset=*/0, weight_buffer.handle(),
                    /*weight_offset=*/0, bias_buffer.handle(),
                    /*bias_offset=*/0, output_buffer.handle(),
                    /*output_offset=*/0, control_buffer.handle(),
                    /*control_offset=*/0, num_tokens(), num_cols(), num_rows()),
                "gptoss_metal_command_buffer_encode_launch_f32_bf16w_dense_matmul_mlp_gate");
            break;
        }
        command_buffer_compute.commit();
        command_buffer_compute.wait_completion();
        const float* input_ptr = static_cast<const float*>(input_buffer.ptr());
        const gptoss_bfloat16* weight_ptr = static_cast<const gptoss_bfloat16*>(weight_buffer.ptr());
        const gptoss_bfloat16* bias_ptr = static_cast<const gptoss_bfloat16*>(bias_buffer.ptr());
        const float* output_ptr = static_cast<const float*>(output_buffer.ptr());
        const float* kv_ptr = static_cast<const float*>(kv_cache_buffer.ptr());
        const float* output_ptr_copy = static_cast<const float*>(output_buffer_copy.ptr());
        for (size_t t = 0; t < num_tokens(); t++) {
            for (size_t r = 0; r < num_rows(); r++) {
                double ref_sum = upcast<double>(bias_ptr[r]);
                for (size_t c = 0; c < num_cols(); c++) {
                    const double ref_weight = upcast<double>(weight_ptr[r * num_cols() + c]);
                    const double input_value = upcast<double>(input_ptr[t * num_cols() + c]);
                    ref_sum = std::fma(input_value, ref_weight, ref_sum);
                }

                if (kernel_type ==
                    MatMulKernelType::PREFILL_ATTN_OUTPUT_OPTIMIZED) {
                    ref_sum += upcast<double>(output_ptr_copy[t * num_rows() + r]);
                }
                if (kernel_type == MatMulKernelType::PREFILL_QKV_OPTIMIZED) {
                    // In this optimized path, V rows are written to the kv cache at index 1.
                    // Assume num_q_heads=64, num_kv_heads=8, head_dim=64 and QKV packed as [Q][K][V].
                    const std::size_t v_rows_start = (64 + 8) * 64; // rows offset where V begins
                    if (r >= v_rows_start) {
                        const std::size_t v_row_index = r - v_rows_start;
                        const std::size_t kv_head = v_row_index / kHeadDim;
                        const std::size_t d = v_row_index % kHeadDim;
                        const std::size_t kv_base = ((kv_head * num_tokens() + t) * 2 + 1) * kHeadDim;
                        ASSERT_NEAR_ABS_REL(upcast<double>(kv_ptr[kv_base + d]), ref_sum, 2.0e-4, 1.0e-4)
                            << "token " << t << ", v-row " << r;
                        continue;
                    }
                }
                ASSERT_NEAR_ABS_REL(upcast<double>(output_ptr[t * num_rows() + r]),
                                    ref_sum, 2.0e-4, 1.0e-4)
                    << "token " << t;
            }
        }
    }

private:
    static constexpr std::uint64_t kSeed{UINT64_C(1019827666124465388)};
    static constexpr std::size_t kFillRandomMaxThreadgroups = 10;
    static constexpr float fp4e2m1_to_fp32[16] = {
        +0.0f, +0.5f, +1.0f, +1.5f, +2.0f, +3.0f, +4.0f, +6.0f,
        -0.0f, -0.5f, -1.0f, -1.5f, -2.0f, -3.0f, -4.0f, -6.0f,
    };

    metal::Device device_{};
    metal::CommandQueue command_queue_{device_};
    metal::Library library_{device_};
    metal::Function f32_fill_random_fn_{library_, "gptoss_f32_fill_random"};
    metal::Function bf16_fill_random_fn_{library_, "gptoss_bf16_fill_random"};
    metal::Function f32_bf16w_matmul_fn_{library_, "gptoss_f32_bf16w_matmul"};
    metal::Function f32_bf16w_dense_matmul_qkv_fn_{library_, "gptoss_f32_bf16w_dense_matmul_qkv"};
    metal::Function f32_bf16w_dense_matmul_attn_output_fn_{library_, "gptoss_f32_bf16w_dense_matmul_attn_output"};
    metal::Function f32_bf16w_dense_matmul_mlp_gate_fn_{library_, "gptoss_f32_bf16w_dense_matmul_mlp_gate"};
    std::uint32_t num_tokens_{1};
    std::uint32_t num_rows_{1};
    std::uint32_t num_cols_{32};
    std::size_t threadgroup_size_{32};
};

}  // namespace gptoss


================================================
FILE: gpt_oss/metal/test/mf4-f32-convert.cc
================================================
#include <gtest/gtest.h>

#include <cmath>
#include <ios>

#include <internal/metal.hpp>
#include <internal/metal-kernels.h>

using gptoss::Check;
using namespace gptoss::metal;

constexpr size_t kThreadgroupSize = 32;


static float fp4e2m1_to_fp32[16] = {
    +0.0f, +0.5f, +1.0f, +1.5f, +2.0f, +3.0f, +4.0f, +6.0f,
    -0.0f, -0.5f, -1.0f, -1.5f, -2.0f, -3.0f, -4.0f, -6.0f,
};

TEST(MF4_F32_CONVERT, single_threadgroup_single_iteration) {
    constexpr size_t num_blocks = kThreadgroupSize;
    constexpr size_t num_elements = num_blocks * 32;
    constexpr size_t num_bytes = num_elements / 2;

    Device device;
    CommandQueue command_queue{device};
    CommandBuffer command_buffer{command_queue};
    Library library{device};
    Function mf4_f32_convert_fn{library, "gptoss_mf4_f32_convert"};
    Buffer block_buffer{device, num_bytes};
    Buffer scale_buffer{device, num_blocks * sizeof(uint8_t)};
    Buffer output_buffer{device, num_elements * sizeof(float)};

    uint8_t* block_ptr = static_cast<uint8_t*>(block_buffer.ptr());
    std::memset(block_ptr, 0, num_bytes);
    for (size_t b = 0; b < num_blocks; b++) {
        for (size_t i = 0; i < 32; i++) {
            const uint8_t nibble = (i + b) & 0x0F;
            const uint8_t byte = nibble << ((i % 2) * 4);
            block_ptr[b * 16 + i / 2] |= byte;
        }
    }

    uint8_t* scale_ptr = static_cast<uint8_t*>(scale_buffer.ptr());
    for (size_t b = 0; b < num_blocks; b++) {
        scale_ptr[b] = 127 - b;
    }

    Check(gptoss_metal_command_buffer_encode_launch_mf4_f32_convert(
            command_buffer.handle(),
            mf4_f32_convert_fn.handle(),
            /*threadgroup_size=*/kThreadgroupSize,
            /*max_threadgroups=*/1,
            block_buffer.handle(),
            scale_buffer.handle(),
            output_buffer.handle(),
            num_elements),
        "gptoss_metal_command_buffer_encode_launch_mf4_f32_convert");

    command_buffer.commit();
    command_buffer.wait_completion();

    const float* output_ptr = static_cast<const float*>(output_buffer.ptr());
    for (size_t b = 0; b < num_blocks; b++) {
        for (size_t i = 0; i < 32; i++) {
            const uint8_t byte = block_ptr[b * 16 + i / 2];
            const uint8_t nibble = (byte >> ((i % 2) * 4)) & 0x0F;
            const float ref_scale = std::ldexp(1.0f, static_cast<int>(scale_ptr[b]) - 127);
            const float ref_value = fp4e2m1_to_fp32[nibble] * ref_scale;
            ASSERT_EQ(output_ptr[b * 32 + i], ref_value)
                << "at position " << i << " / 32"
                << ", block " << b << " / " << num_blocks
                << ", FP4e2m1 value " << std::hex << uint32_t(nibble);
        }
    }
}

TEST(MF4_F32_CONVERT, multiple_threadgroups_multiple_iterations) {
    constexpr size_t num_threadgroups = 2;
    constexpr size_t num_blocks = num_threadgroups * (kThreadgroupSize + 1);
    constexpr size_t num_elements = num_blocks * 32;
    constexpr size_t num_bytes = num_elements / 2;

    Device device;
    CommandQueue command_queue{device};
    CommandBuffer command_buffer{command_queue};
    Library library{device};
    Function mf4_f32_convert_fn{library, "gptoss_mf4_f32_convert"};
    Buffer block_buffer{device, num_bytes};
    Buffer scale_buffer{device, num_blocks * sizeof(uint8_t)};
    Buffer output_buffer{device, num_elements * sizeof(float)};

    uint8_t* block_ptr = static_cast<uint8_t*>(block_buffer.ptr());
    std::memset(block_ptr, 0, num_bytes);
    for (size_t b = 0; b < num_blocks; b++) {
        for (size_t i = 0; i < 32; i++) {
            const uint8_t nibble = (i + b) & 0x0F;
            const uint8_t byte = nibble << ((i % 2) * 4);
            block_ptr[b * 16 + i / 2] |= byte;
        }
    }

    uint8_t* scale_ptr = static_cast<uint8_t*>(scale_buffer.ptr());
    for (size_t b = 0; b < num_blocks; b++) {
        scale_ptr[b] = 200 - b;
    }

    Check(gptoss_metal_command_buffer_encode_launch_mf4_f32_convert(
            command_buffer.handle(),
            mf4_f32_convert_fn.handle(),
            /*threadgroup_size=*/kThreadgroupSize,
            /*max_threadgroups=*/num_threadgroups,
            block_buffer.handle(),
            scale_buffer.handle(),
            output_buffer.handle(),
            num_elements),
        "gptoss_metal_command_buffer_encode_launch_mf4_f32_convert");

    command_buffer.commit();
    command_buffer.wait_completion();

    const float* output_ptr = static_cast<const float*>(output_buffer.ptr());
    for (size_t b = 0; b < num_blocks; b++) {
        for (size_t i = 0; i < 32; i++) {
            const uint8_t byte = block_ptr[b * 16 + i / 2];
            const uint8_t nibble = (byte >> ((i % 2) * 4)) & 0x0F;
            const float ref_scale = std::ldexp(1.0f, static_cast<int>(scale_ptr[b]) - 127);
            const float ref_value = fp4e2m1_to_fp32[nibble] * ref_scale;
            ASSERT_EQ(output_ptr[b * 32 + i], ref_value)
                << "at position " << i << " / 32"
                << ", block " << b << " / " << num_blocks
                << ", FP4e2m1 value " << std::hex << uint32_t(nibble);
        }
    }
}


================================================
FILE: gpt_oss/metal/test/rmsnorm-kernel-tester.hpp
================================================
#pragma once

#include <gtest/gtest.h>

#include <cmath>
#include <cstddef>
#include <cstdint>

#include <internal/datatype.hpp>
#include <internal/metal.hpp>
#include <internal/metal-kernels.h>


namespace gptoss {

class RMSNormKernelTester {
public:
    RMSNormKernelTester() { }

    RMSNormKernelTester(const RMSNormKernelTester&) = delete;
    RMSNormKernelTester(RMSNormKernelTester&&) = delete;
    RMSNormKernelTester& operator=(const RMSNormKernelTester&) = delete;
    RMSNormKernelTester& operator=(RMSNormKernelTester&&) = delete;

    [[nodiscard]]
    RMSNormKernelTester& num_channels(std::uint32_t num_channels) {
        num_channels_ = num_channels;
        return *this;
    }

    std::uint32_t num_channels() const {
        return num_channels_;
    }

    [[nodiscard]]
    RMSNormKernelTester& num_tokens(std::uint32_t num_tokens) {
        num_tokens_ = num_tokens;
        return *this;
    }

    std::uint32_t num_tokens() const {
        return num_tokens_;
    }

    [[nodiscard]]
    RMSNormKernelTester& epsilon(float epsilon) {
        epsilon_ = epsilon;
        return *this;
    }

    float epsilon() const {
        return epsilon_;
    }

    void Validate() const {
        ASSERT_NE(num_channels(), 0);
        ASSERT_NE(num_tokens(), 0);
        ASSERT_GE(epsilon(), 0.0f);
    }

    void TestF32_BF16W() const {
        Validate();

        metal::Buffer input_buffer{device_, num_tokens() * num_channels() * sizeof(float)};
        metal::Buffer weight_buffer{device_, num_channels() * sizeof(gptoss_bfloat16)};
        metal::Buffer output_buffer{device_, num_tokens() * num_channels() * sizeof(float)};
        metal::Buffer control_buffer{device_, sizeof(gptoss_control)};
        std::memset(control_buffer.ptr(), 0, sizeof(gptoss_control));

        metal::CommandBuffer command_buffer{command_queue_};

        command_buffer.encode_launch_f32_fill_random(
            f32_fill_random_fn_,
            /*threadgroup_size=*/0,
            /*max_threadgroups=*/kFillRandomMaxThreadgroups,
            /*output_buffer=*/input_buffer, /*output_offset=*/0,
            num_channels(), kSeed, /*offset=*/0, /*min=*/-1.0f, /*max=*/1.0);

        command_buffer.encode_launch_bf16_fill_random(
            bf16_fill_random_fn_,
            /*threadgroup_size=*/0,
            /*max_threadgroups=*/kFillRandomMaxThreadgroups,
            /*output_buffer=*/weight_buffer, /*output_offset=*/0,
            num_channels(), kSeed + 1, /*offset=*/0, /*min=*/-1.0f, /*max=*/1.0);

        Check(gptoss_metal_command_buffer_encode_launch_f32_bf16w_rmsnorm(
                command_buffer.handle(),
                f32_bf16w_rmsnorm_fn_.handle(),
                input_buffer.handle(),
                /*input_offset=*/0,
                weight_buffer.handle(),
                /*weight_offset=*/0,
                output_buffer.handle(),
                /*output_offset=*/0,
                control_buffer.handle(),
                /*control_offset=*/0,
                num_tokens(),
                num_channels(),
                epsilon()),
            "gptoss_metal_command_buffer_encode_launch_f32_bf16w_rmsnorm");

        command_buffer.commit();
        command_buffer.wait_completion();

        const float* input_ptr = static_cast<const float*>(input_buffer.ptr());
        const gptoss_bfloat16* weight_ptr = static_cast<const gptoss_bfloat16*>(weight_buffer.ptr());
        const float* output_ptr = static_cast<const float*>(output_buffer.ptr());
        for (std::uint32_t t = 0; t < num_tokens(); t++) {
            double sumsq = 0.0;
            for (std::uint32_t c = 0; c < num_channels(); c++) {
                const double val = static_cast<double>(input_ptr[t * num_channels() + c]);
                sumsq = std::fma(val, val, sumsq);
            }
            const double avgsq = sumsq / static_cast<double>(num_channels());
            const double scale = 1.0 / std::sqrt(avgsq + epsilon());
            for (std::uint32_t c = 0; c < num_channels(); c++) {
                const double input_val = upcast<double>(input_ptr[t * num_channels() + c]);
                const double weight_val = upcast<double>(weight_ptr[c]);
                const double ref_output = scale * input_val * weight_val;
                const double output = upcast<double>(output_ptr[t * num_channels() + c]);
                ASSERT_NEAR(output, ref_output, 1.0e-5 * std::abs(ref_output))
                    << "at channel " << c << " / " << num_channels() << ", token " << t << " / " << num_tokens()
                    << ", input " << input_val << ", weight " << weight_val << ", scale " << scale;
            }
        }
    }

private:
    static constexpr std::uint64_t kSeed{UINT64_C(1019827666124465388)};
    static constexpr std::size_t kFillRandomMaxThreadgroups = 10;

    metal::Device device_{};
    metal::CommandQueue command_queue_{device_};
    metal::Library library_{device_};
    metal::Function f32_fill_random_fn_{library_, "gptoss_f32_fill_random"};
    metal::Function bf16_fill_random_fn_{library_, "gptoss_bf16_fill_random"};
    metal::Function f32_bf16w_rmsnorm_fn_{library_, "gptoss_f32_bf16w_rmsnorm"};
    std::uint32_t num_tokens_{1};
    std::uint32_t num_channels_{1};
    float epsilon_{1.0e-5f};
};

}  // namespace gptoss


================================================
FILE: gpt_oss/metal/test/rope-kernel-tester.hpp
================================================
#pragma once

#include <gtest/gtest.h>

#include <cmath>
#include <cstddef>
#include <cstdint>

#include <internal/datatype.hpp>
#include <internal/metal.hpp>
#include <internal/metal-kernels.h>


namespace gptoss {

class RoPEKernelTester {
public:
    RoPEKernelTester() { }

    RoPEKernelTester(const RoPEKernelTester&) = delete;
    RoPEKernelTester(RoPEKernelTester&&) = delete;
    RoPEKernelTester& operator=(const RoPEKernelTester&) = delete;
    RoPEKernelTester& operator=(RoPEKernelTester&&) = delete;

    [[nodiscard]]
    RoPEKernelTester& threadgroup_size(std::size_t threadgroup_size) {
        threadgroup_size_ = threadgroup_size;
        return *this;
    }

    std::size_t threadgroup_size() const {
        return threadgroup_size_;
    }

    [[nodiscard]]
    RoPEKernelTester& head_dim(std::uint32_t head_dim) {
        head_dim_ = head_dim;
        return *this;
    }

    std::uint32_t head_dim() const {
        return head_dim_;
    }

    [[nodiscard]]
    RoPEKernelTester& num_q_heads(std::uint32_t num_q_heads) {
        num_q_heads_ = num_q_heads;
        return *this;
    }

    std::uint32_t num_q_heads() const {
        return num_q_heads_;
    }

    [[nodiscard]]
    RoPEKernelTester& num_kv_heads(std::uint32_t num_kv_heads) {
        num_kv_heads_ = num_kv_heads;
        return *this;
    }

    std::uint32_t num_kv_heads() const {
        return num_kv_heads_;
    }

    std::uint32_t num_qk_heads() const {
        return num_q_heads() + num_kv_heads();
    }

    std::uint32_t num_qkv_heads() const {
        return num_q_heads() + 2 * num_kv_heads();
    }

    [[nodiscard]]
    RoPEKernelTester& num_tokens(std::uint32_t num_tokens) {
        num_tokens_ = num_tokens;
        return *this;
    }

    std::uint32_t num_tokens() const {
        return num_tokens_;
    }

    [[nodiscard]]
    RoPEKernelTester& token_offset(std::uint32_t token_offset) {
        token_offset_ = token_offset;
        return *this;
    }

    std::uint32_t token_offset() const {
        return token_offset_;
    }

    [[nodiscard]]
    RoPEKernelTester& frequency_base(float frequency_base) {
        frequency_base_ = frequency_base;
        return *this;
    }

    float frequency_base() const {
        return frequency_base_;
    }

    void Validate() const {
        ASSERT_NE(head_dim(), 0);
        ASSERT_EQ(head_dim() % 2, 0);
        ASSERT_NE(num_q_heads(), 0);
        ASSERT_NE(num_tokens(), 0);
    }

    void TestF32() const {
        Validate();

        metal::Buffer activations_buffer{device_, (num_tokens() * num_qkv_heads() + num_qk_heads()) * head_dim() * sizeof(float)};
        metal::Buffer ref_activations_buffer{device_, (num_tokens() * num_qkv_heads() + num_qk_heads()) * head_dim() * sizeof(float)};
        // KV cache buffer layout: [num_kv_heads][max_tokens][2 (K,V)][head_dim]
        const std::uint32_t max_tokens = num_tokens();
        const std::uint32_t kv_heads_for_alloc = std::max<std::uint32_t>(1, num_kv_heads());
        const std::size_t kv_bytes = static_cast<std::size_t>(kv_heads_for_alloc) * max_tokens * 2 * head_dim() * sizeof(float);
        metal::Buffer kv_cache_buffer{device_, kv_bytes};
        metal::Buffer control_buffer{device_, sizeof(gptoss_control)};
        std::memset(control_buffer.ptr(), 0, sizeof(gptoss_control));

        metal::CommandBuffer command_buffer{command_queue_};

        command_buffer.encode_launch_f32_fill_random(
            f32_fill_random_fn_,
            /*threadgroup_size=*/0,
            /*max_threadgroups=*/kFillRandomMaxThreadgroups,
            /*output_buffer=*/activations_buffer,
            /*output_offset=*/0,
            (num_tokens() * num_qkv_heads() + num_qk_heads()) * head_dim(),
            kSeed, /*offset=*/0, /*min=*/-1.0f, /*max=*/1.0);

        command_buffer.encode_launch_f32_fill_random(
            f32_fill_random_fn_,
            /*threadgroup_size=*/0,
            /*max_threadgroups=*/kFillRandomMaxThreadgroups,
            /*output_buffer=*/ref_activations_buffer,
            /*output_offset=*/0,
            (num_tokens() * num_qkv_heads() + num_qk_heads()) * head_dim(),
            kSeed, /*offset=*/0, /*min=*/-1.0f, /*max=*/1.0);

        Check(gptoss_metal_command_buffer_encode_launch_f32_rope(
                command_buffer.handle(),
                f32_rope_fn_.handle(),
                threadgroup_size(),
                activations_buffer.handle(),
                /*activations_offset=*/0,
                kv_cache_buffer.handle(),
                /*kv_offset=*/0,
                control_buffer.handle(),
                /*control_offset=*/0,
                frequency_base(),
                /*interpolation_scale=*/1.0f,
                /*yarn_offset=*/0.0f,
                /*yarn_scale=*/1.0f,
                /*yarn_multiplier=*/1.0f,
                /*num_tokens=*/num_tokens(),
                /*num_q_heads=*/num_q_heads(),
                /*num_kv_heads=*/num_kv_heads(),
                head_dim(),
                /*max_tokens=*/max_tokens,
                /*token_offset=*/token_offset()),
            "gptoss_metal_command_buffer_encode_launch_f32_rope");

        command_buffer.commit();
        command_buffer.wait_completion();

        const float* ref_activations_ptr = static_cast<const float*>(ref_activations_buffer.ptr());
        const float* activations_ptr = static_cast<const float*>(activations_buffer.ptr());
        const float* kv_ptr = static_cast<const float*>(kv_cache_buffer.ptr());
        for (std::uint32_t t = 0; t < num_tokens(); t++) {
            // Validate rotated Q written in-place in activations
            for (std::uint32_t h = 0; h < num_q_heads(); h++) {
                for (std::uint32_t d = 0; d < head_dim(); d += 2) {
                    const double inv_freq = 1.0 /
                        std::pow(static_cast<double>(frequency_base()), static_cast<double>(d) / static_cast<double>(head_dim()));
                    const double phi = static_cast<double>(t + token_offset()) * inv_freq;
                    const double cos_phi = std::cos(phi);
                    const double sin_phi = std::sin(phi);
                    const double real = static_cast<double>(ref_activations_ptr[(t * num_qkv_heads() + h) * head_dim() + d]);
                    const double imag = static_cast<double>(ref_activations_ptr[(t * num_qkv_heads() + h) * head_dim() + d + 1]);
                    const double ref_real = real * cos_phi - imag * sin_phi;
                    const double ref_imag = real * sin_phi + imag * cos_phi;
                    ASSERT_NEAR(
                            static_cast<double>(activations_ptr[(t * num_qkv_heads() + h) * head_dim() + d]),
                            ref_real,
                            std::abs(ref_real) * 1.0e-4)
                        << "at token " << t << " / " << num_tokens();
                    ASSERT_NEAR(
                            static_cast<double>(activations_ptr[(t * num_qkv_heads() + h) * head_dim() + d + 1]),
                            ref_imag,
                            std::abs(ref_imag) * 1.0e-4)
                        << "at token " << t << " / " << num_tokens();

                }
            }
        }
    }

private:
    static constexpr uint64_t kSeed{UINT64_C(1019827666124465388)};
    static constexpr std::size_t kFillRandomMaxThreadgroups = 10;

    metal::Device device_{};
    metal::CommandQueue command_queue_{device_};
    metal::Library library_{device_};
    metal::Function f32_fill_random_fn_{library_, "gptoss_f32_fill_random"};
    metal::Function f32_rope_fn_{library_, "gptoss_f32_rope"};
    std::size_t threadgroup_size_{32};
    std::uint32_t head_dim_{64};
    std::uint32_t num_q_heads_{1};
    std::uint32_t num_kv_heads_{1};
    std::uint32_t num_tokens_{1};
    std::uint32_t token_offset_{0};
    float frequency_base_{50000.0f};
};

}  // namespace gptoss


================================================
FILE: gpt_oss/metal/test/u32-random.cc
================================================
#include <gtest/gtest.h>

#include <cstddef>
#include <cstdint>

#include "fill-random-kernel-tester.hpp"


using gptoss::FillRandomKernelTester;

constexpr std::size_t kThreadgroupSize = 128;

TEST(U32_FILL_RANDOM, single_threadgroup_single_iteration) {
    FillRandomKernelTester()
        .num_elements(kThreadgroupSize)
        .threadgroup_size(kThreadgroupSize)
        .max_threadgroups(1)
        .TestU32();
}

TEST(U32_FILL_RANDOM, single_threadgroup_multiple_iterations) {
    constexpr std::size_t num_iterations = 3;

    FillRandomKernelTester()
        .num_elements(num_iterations * kThreadgroupSize)
        .threadgroup_size(kThreadgroupSize)
        .max_threadgroups(1)
        .TestU32();
}

TEST(U32_FILL_RANDOM, multiple_threadgroups_multiple_iterations) {
    constexpr std::size_t num_iterations = 3;
    constexpr std::size_t num_threadgroups = 2;

    FillRandomKernelTester()
        .num_elements(num_iterations * num_threadgroups * kThreadgroupSize)
        .threadgroup_size(kThreadgroupSize)
        .max_threadgroups(num_threadgroups)
        .TestU32();
}

TEST(U32_FILL_RANDOM, excessive_threadgroups) {
    FillRandomKernelTester()
        .num_elements(kThreadgroupSize)
        .threadgroup_size(kThreadgroupSize)
        .max_threadgroups(2)
        .TestU32();
}

TEST(U32_FILL_RANDOM, nonuniform_range) {
    constexpr std::size_t num_iterations = 3;
    constexpr std::size_t num_threadgroups = 2;

    FillRandomKernelTester()
        .num_elements((num_iterations * num_threadgroups + 1) * kThreadgroupSize)
        .threadgroup_size(kThreadgroupSize)
        .max_threadgroups(num_threadgroups)
        .TestU32();
}

TEST(U32_FILL_RANDOM, partial_range) {
    constexpr std::size_t num_iterations = 3;
    constexpr std::size_t num_threadgroups = 2;

    FillRandomKernelTester()
        .num_elements(num_iterations * num_threadgroups * kThreadgroupSize + 1)
        .threadgroup_size(kThreadgroupSize)
        .max_threadgroups(num_threadgroups)
        .TestU32();
}


================================================
FILE: gpt_oss/responses_api/__init__.py
================================================


================================================
FILE: gpt_oss/responses_api/api_server.py
================================================
import os
import datetime
import uuid
from typing import Callable, Literal, Optional, Union

from fastapi import FastAPI, Request
from fastapi.exception_handlers import request_validation_exception_handler
from fastapi.exceptions import RequestValidationError
from fastapi.responses import StreamingResponse
from openai_harmony import (
    Author,
    Conversation,
    DeveloperContent,
    HarmonyEncoding,
    Message,
    ReasoningEffort,
    Role,
    StreamableParser,
    StreamState,
    SystemContent,
    ToolDescription,
)

from gpt_oss.tools.python_docker.docker_tool import PythonTool
from gpt_oss.tools.simple_browser import SimpleBrowserTool
from gpt_oss.tools.simple_browser.backend import YouComBackend, ExaBackend

from .events import (
    ResponseCodeInterpreterCallCodeDelta,
    ResponseCodeInterpreterCallCodeDone,
    ResponseCodeInterpreterCallCompleted,
    ResponseCodeInterpreterCallInProgress,
    ResponseCodeInterpreterCallInterpreting,
    ResponseCompletedEvent,
    ResponseContentPartAdded,
    ResponseContentPartDone,
    ResponseCreatedEvent,
    ResponseEvent,
    ResponseInProgressEvent,
    ResponseOutputItemAdded,
    ResponseOutputItemDone,
    ResponseOutputTextAnnotationAdded,
    ResponseOutputTextDelta,
    ResponseOutputTextDone,
    ResponseReasoningTextDelta,
    ResponseReasoningTextDone,
    ResponseWebSearchCallCompleted,
    ResponseWebSearchCallInProgress,
    ResponseWebSearchCallSearching,
)
from .types import (
    CodeInterpreterCallItem,
    CodeInterpreterOutputImage,
    CodeInterpreterOutputLogs,
    Error,
    FunctionCallItem,
    Item,
    ReasoningItem,
    ReasoningTextContentItem,
    ResponseObject,
    ResponsesRequest,
    TextContentItem,
    UrlCitation,
    Usage,
    WebSearchActionFind,
    WebSearchActionOpenPage,
    WebSearchActionSearch,
    WebSearchCallItem,
)

DEFAULT_TEMPERATURE = 0.0


def get_reasoning_effort(
    effort: Union[Literal["low", "medium", "high"], ReasoningEffort]
) -> ReasoningEffort:
    if isinstance(effort, ReasoningEffort):
        return effort
    if effort == "low":
        return ReasoningEffort.LOW
    if effort == "medium":
        return ReasoningEffort.MEDIUM
    if effort == "high":
        return ReasoningEffort.HIGH
    raise ValueError(f"Invalid reasoning effort: {effort}")


def is_not_builtin_tool(
    recipient: str, treat_functions_python_as_builtin: bool = False
) -> bool:
    if treat_functions_python_as_builtin and recipient == "functions.python":
        return False
    return (
        not recipient.startswith("browser.")
        and recipient != "python"
        and recipient != "assistant"
    )


def create_api_server(
    infer_next_token: Callable[[list[int], float], int], encoding: HarmonyEncoding
) -> FastAPI:
    app = FastAPI()

    @app.exception_handler(RequestValidationError)
    async def log_validation_error(request: Request, exc: RequestValidationError):
        try:
            body_bytes = await request.body()
            print(
                "Invalid request body received:"
                f" {body_bytes.decode('utf-8', errors='replace')}"
            )
        except Exception as body_exc:
            print(f"Failed to read invalid request body: {body_exc}")
        return await request_validation_exception_handler(request, exc)
    responses_store: dict[str, tuple[ResponsesRequest, ResponseObject]] = {}

    def generate_response(
        input_tokens: list[int],
        output_tokens: list[int],
        request_body: ResponsesRequest,
        debug_mode: bool = False,
        function_call_ids: Optional[list[tuple[str, str]]] = None,
        response_id: Optional[str] = None,
        previous_response_id: Optional[str] = None,
        browser_tool: Optional[SimpleBrowserTool] = None,
        browser_call_ids: Optional[list[str]] = None,
        python_tool: Optional[PythonTool] = None,
        python_call_ids: Optional[list[str]] = None,
        python_call_outputs: Optional[
            dict[str, list[CodeInterpreterOutputLogs | CodeInterpreterOutputImage]]
        ] = None,
        reasoning_ids: Optional[list[str]] = None,
        message_ids: Optional[list[str]] = None,
        treat_functions_python_as_builtin: bool = False,
    ) -> ResponseObject:
        output = []
        error = None
        if len(output_tokens) > 0:
            if debug_mode:
                try:
                    entries = encoding.parse_messages_from_completion_tokens(
                        output_tokens, Role.ASSISTANT
                    )
                except Exception as e:
                    print(f"Error parsing tokens: {e}")
                    error = Error(
                        code="invalid_function_call",
                        message=f"{e}",
                    )
                    entries = []
            else:
                entries = encoding.parse_messages_from_completion_tokens(
                    output_tokens, Role.ASSISTANT
                )

            fc_index = 0
            browser_tool_index = 0
            python_tool_index = 0
            reasoning_ids_iter = iter(reasoning_ids or [])
            message_ids_iter = iter(message_ids or [])

            for entry in entries:
                entry_dict = entry.to_dict()
                recipient = entry_dict.get("recipient", "")
                if len(recipient) > 0 and is_not_builtin_tool(
                    recipient, treat_functions_python_as_builtin
                ):
                    call = entry_dict["content"][0]
                    arguments = call["text"]
                    name = recipient

                    if name.startswith("functions."):
                        name = name[len("functions.") :]
                    if function_call_ids and fc_index < len(function_call_ids):
                        fc_id, call_id = function_call_ids[fc_index]
                    else:
                        fc_id, call_id = (
                            f"fc_{uuid.uuid4().hex}",
                            f"call_{uuid.uuid4().hex}",
                        )
                    fc_index += 1
                    output.append(
                        FunctionCallItem(
                            type="function_call",
                            name=name,
                            arguments=arguments,
                            id=fc_id,
                            call_id=call_id,
                        )
                    )
                elif (
                    len(recipient) > 0
                    and recipient.startswith("browser.")
                    and browser_tool is not None
                ):
                    # Mirror event-based creation of WebSearchCallItems when the browser tool is invoked
                    name = recipient
                    call = entry_dict["content"][0]
                    arguments = call["text"]
                    function_name = name[len("browser.") :]

                    # Reconstruct a Message for argument parsing
                    tool_msg = (
                        Message.from_role_and_content(Role.ASSISTANT, arguments)
                        .with_recipient(name)
                        .with_channel("analysis")
                    )

                    action = None
                    try:
                        parsed_args = browser_tool.process_arguments(tool_msg)
                        if function_name == "search":
                            action = WebSearchActionSearch(
                                type="search",
                                query=parsed_args["query"],
                            )
                        elif function_name == "open":
                            action = WebSearchActionOpenPage(
                                type="open_page",
                                url=parsed_args["url"],
                            )
                        elif function_name == "find":
                            action = WebSearchActionFind(
                                type="find",
                                pattern=parsed_args["pattern"],
                                url=parsed_args["url"],
                            )
                    except Exception as e:
                        print(f"Error processing browser tool arguments: {e}")
                        action = None

                    if action is not None:
                        if browser_call_ids and browser_tool_index < len(
                            browser_call_ids
                        ):
                            web_search_call_id = browser_call_ids[browser_tool_index]
                        else:
                            web_search_call_id = f"ws_{uuid.uuid4().hex}"
                        browser_tool_index += 1
                        output.append(
                            WebSearchCallItem(
                                type="web_search_call",
                                id=web_search_call_id,
                                action=action,
                            )
                        )
                elif (
                    len(recipient) > 0
                    and (
                        recipient.startswith("python")
                        or (
                            treat_functions_python_as_builtin
                            and recipient == "functions.python"
                        )
                    )
                    and python_tool is not None
                ):
                    if python_call_ids and python_tool_index < len(python_call_ids):
                        code_call_id = python_call_ids[python_tool_index]
                    else:
                        code_call_id = f"ci_{uuid.uuid4().hex}"
                    python_tool_index += 1
                    code_snippet = None
                    if entry_dict.get("content"):
                        code_snippet = entry_dict["content"][0].get("text")
                    outputs = (
                        (python_call_outputs or {}).get(code_call_id)
                        if python_call_outputs
                        else None
                    )
                    output.append(
                        CodeInterpreterCallItem(
                            type="code_interpreter_call",
                            id=code_call_id,
                            status="completed",
                            code=code_snippet,
                            outputs=outputs,
                        )
                    )
                elif entry_dict["channel"] == "final":
                    content = []
                    for content_entry in entry_dict["content"]:
                        if browser_tool:
                            text_content, annotation_entries, _has_partial_citations = (
                                browser_tool.normalize_citations(content_entry["text"])
                            )
                            annotations = [UrlCitation(**a) for a in annotation_entries]
                        else:
                            text_content = content_entry["text"]
                            annotations = []

                        content.append(
                            TextContentItem(
                                type="output_text",
                                text=text_content,
                                annotations=annotations,
                            )
                        )

                    message_id = next(message_ids_iter, None)
                    output.append(
                        Item(
                            id=message_id,
                            type="message",
                            role="assistant",
                            content=content,
                            status="completed",
                        )
                    )
                elif entry_dict["channel"] == "analysis":
                    if entry_dict.get("recipient"):
                        continue
                    author_dict = entry_dict.get("author") or {}
                    if author_dict.get("role") and author_dict.get("role") != "assistant":
                        continue
                    summary = []
                    content = [
                        ReasoningTextContentItem(
                            type="reasoning_text",
                            text=entry["text"],
                        )
                        for entry in entry_dict["content"]
                    ]
                    reasoning_id = next(reasoning_ids_iter, None)
                    if reasoning_id is None:
                        reasoning_id = f"rs_{uuid.uuid4().hex}"
                    output.append(
                        ReasoningItem(
                            id=reasoning_id,
                            type="reasoning",
                            summary=summary,
                            content=content,
                        )
                    )
        else:
            output = []

        usage = (
            Usage(
                input_tokens=len(input_tokens),
                output_tokens=len(output_tokens),
                total_tokens=len(input_tokens) + len(output_tokens),
            )
            if len(output_tokens) > 0
            else None
        )

        try:
            debug_str = encoding.decode_utf8(input_tokens + output_tokens)
        except Exception:
            debug_str = input_tokens + output_tokens
        try:
            debug_input_str = encoding.decode_utf8(input_tokens)
        except Exception:
            debug_input_str = input_tokens
        try:
            debug_output_str = encoding.decode_utf8(output_tokens)
        except Exception:
            debug_output_str = output_tokens

        metadata = (
            {
                "__debug": debug_str,
                "__debug_input": debug_input_str,
                "__debug_output": debug_output_str,
            }
            if debug_mode
            else {}
        )

        return ResponseObject(
            created_at=int(datetime.datetime.now().timestamp()),
            status="completed",
            output=output,
            text={"format": {"type": "text"}},
            usage=usage,
            max_output_tokens=request_body.max_output_tokens,
            error=error,
            metadata=metadata,
            id=response_id,
            previous_response_id=previous_response_id,
        )

    class StreamResponsesEvents:
        BROWSER_RESERVED_FUNCTIONS = {"browser.search", "browser.open", "browser.find"}
        initial_tokens: list[int]
        tokens: list[int]
        output_tokens: list[int]
        output_text: str
        request_body: ResponsesRequest
        request: Request
        sequence_number: int

        def __init__(
            self,
            initial_tokens,
            request_body: ResponsesRequest,
            as_sse: bool = False,
            request: Optional[Request] = None,
            response_id: Optional[str] = None,
            store_callback: Optional[
                Callable[[str, ResponsesRequest, ResponseObject], None]
            ] = None,
            browser_tool: Optional[SimpleBrowserTool] = None,
            python_tool: Optional[PythonTool] = None,
            functions_python_as_builtin: bool = False,
        ):
            self.initial_tokens = initial_tokens
            self.tokens = initial_tokens.copy()
            self.output_tokens = []
            self.output_text = ""
            self.request_body = request_body
            self.parser = StreamableParser(encoding, role=Role.ASSISTANT)
            self.as_sse = as_sse
            self.debug_mode = request_body.metadata.get(
                "__debug", False
            )  # we use this for demo purposes
            # Set temperature for this stream, fallback to DEFAULT_TEMPERATURE if not set
            self.temperature = (
                request_body.temperature
                if request_body.temperature is not None
                else DEFAULT_TEMPERATURE
            )
            self.request = request
            self.sequence_number = 0
            self.function_call_ids: list[tuple[str, str]] = []
            self.response_id = response_id
            self.store_callback = store_callback
            self.new_request = True
            self.browser_tool = browser_tool
            self.use_browser_tool = browser_tool is not None
            self.browser_call_ids: list[str] = []
            self.python_tool = python_tool
            self.use_code_interpreter = python_tool is not None
            self.python_call_ids: list[str] = []
            self.python_call_outputs: dict[
                str, list[CodeInterpreterOutputLogs | CodeInterpreterOutputImage]
            ] = {}
            self.reasoning_item_ids: list[str] = []
            self.current_reasoning_item_id: Optional[str] = None
            self.message_item_ids: list[str] = []
            self.current_message_item_id: Optional[str] = None
            self.functions_python_as_builtin = functions_python_as_builtin
            self.user_defined_function_names = {
                name
                for tool in (request_body.tools or [])
                for name in [getattr(tool, "name", None)]
                if getattr(tool, "type", None) == "function" and name
            }

        def _resolve_browser_recipient(
            self, recipient: Optional[str]
        ) -> tuple[Optional[str], bool]:
            if not self.use_browser_tool or not recipient:
                return (None, False)

            if recipient.startswith("browser."):
                return (recipient, False)

            if recipient.startswith("functions."):
                potential = recipient[len("functions.") :]
                if (
                    potential in self.BROWSER_RESERVED_FUNCTIONS
                    and potential not in self.user_defined_function_names
                ):
                    return (potential, True)

            return (None, False)

        def _ensure_message_item_id(self) -> str:
            if self.current_message_item_id is None:
                message_id = f"item_{uuid.uuid4().hex}"
                self.current_message_item_id = message_id
                self.message_item_ids.append(message_id)
            return self.current_message_item_id

        def _ensure_reasoning_item_id(self) -> str:
            if self.current_reasoning_item_id is None:
                reasoning_id = f"rs_{uuid.uuid4().hex}"
                self.current_reasoning_item_id = reasoning_id
                self.reasoning_item_ids.append(reasoning_id)
            return self.current_reasoning_item_id

        def _send_event(self, event: ResponseEvent):
            event.sequence_number = self.sequence_number
            self.sequence_number += 1
            if self.as_sse:
                return f"event: {event.type}\ndata: {event.model_dump_json(indent=None)}\n\n"
            else:
                return event

        async def run(self):
            browser_tool = self.browser_tool
            self.new_request = True
            initial_response = generate_response(
                self.initial_tokens,
                self.output_tokens,
                self.request_body,
                function_call_ids=self.function_call_ids,
                response_id=self.response_id,
                previous_response_id=self.request_body.previous_response_id,
                browser_tool=self.browser_tool,
                browser_call_ids=self.browser_call_ids,
                python_tool=self.python_tool,
                python_call_ids=self.python_call_ids,
                python_call_outputs=getattr(self, "python_call_outputs", None),
                reasoning_ids=self.reasoning_item_ids,
                message_ids=self.message_item_ids,
                treat_functions_python_as_builtin=self.functions_python_as_builtin,
            )
            initial_response.status = "in_progress"
            yield self._send_event(
                ResponseCreatedEvent(
                    type="response.created",
                    response=initial_response,
                )
            )
            yield self._send_event(
                ResponseInProgressEvent(
                    type="response.in_progress",
                    response=initial_response,
                )
            )

            current_content_index = (
                0  # for this implementation we will always have one content item only
            )
            current_output_index = -1
            sent_output_item_added = False

            # we use this if the model outputs a citation to buffer until completed
            output_delta_buffer = ""
            # we use this to track the current output text content for things like providing the right indices in citations
            current_output_text_content = ""
            current_annotations = []

            while True:
                # Check for client disconnect
                if self.request is not None and await self.request.is_disconnected():
                    print("Client disconnected, stopping token generation.")
                    break
                next_tok = infer_next_token(
                    self.tokens,
                    temperature=self.temperature,
                    new_request=self.new_request,
                )
                self.new_request = False
                self.tokens.append(next_tok)
                try:
                    self.parser.process(next_tok)
                except Exception:
                    pass

                if self.parser.state == StreamState.EXPECT_START:
                    current_output_index += 1
                    sent_output_item_added = False

                    if len(self.parser.messages) > 0:
                        previous_item = self.parser.messages[-1]
                        if previous_item.recipient is not None:
                            recipient = previous_item.recipient
                            browser_recipient, _ = self._resolve_browser_recipient(
                                recipient
                            )
                            if (
                                browser_recipient is None
                                and not (
                                    recipient == "python"
                                    or (
                                        self.functions_python_as_builtin
                                        and recipient == "functions.python"
                                    )
                                )
                            ):
                                fc_id = f"fc_{uuid.uuid4().hex}"
                                call_id = f"call_{uuid.uuid4().hex}"
                                self.function_call_ids.append((fc_id, call_id))
                                yield self._send_event(
                                    ResponseOutputItemDone(
                                        type="response.output_item.done",
                                        output_index=current_output_index,
                                        item=FunctionCallItem(
                                            type="function_call",
                                            name=(
                                                previous_item.recipient[
                                                    len("functions.") :
                                                ]
                                                if previous_item.recipient.startswith(
                                                    "functions."
                                                )
                                                else previous_item.recipient
                                            ),
                                            arguments=previous_item.content[0].text,
                                            id=fc_id,
                                            call_id=call_id,
                                        ),
                                    )
                                )
                        if (
                            previous_item.channel == "analysis"
                            and previous_item.recipient is None
                        ):
                            reasoning_id = (
                                self.current_reasoning_item_id
                                if self.current_reasoning_item_id is not None
                                else self._ensure_reasoning_item_id()
                            )
                            reasoning_text = previous_item.content[0].text
                            yield self._send_event(
                                ResponseReasoningTextDone(
                                    type="response.reasoning_text.done",
                                    output_index=current_output_index,
                                    content_index=current_content_index,
                                    item_id=reasoning_id,
                                    text=reasoning_text,
                                )
                            )
                            yield self._send_event(
                                ResponseContentPartDone(
                                    type="response.content_part.done",
                                    output_index=current_output_index,
                                    content_index=current_content_index,
                                    item_id=reasoning_id,
                                    part=ReasoningTextContentItem(
                                        type="reasoning_text",
                                        text=reasoning_text,
                                    ),
                                )
                            )
                            yield self._send_event(
                                ResponseOutputItemDone(
                                    type="response.output_item.done",
                                    output_index=current_output_index,
                                    item=ReasoningItem(
                                        id=reasoning_id,
                                        type="reasoning",
                                        summary=[],
                                        content=[
                                            ReasoningTextContentItem(
                                                type="reasoning_text",
                                                text=reasoning_text,
                                            )
                                        ],
                                    ),
                                )
                            )
                            self.current_reasoning_item_id = None
                        if previous_item.channel == "final":
                            annotations = [
                                UrlCitation(**a) for a in current_annotations
                            ]
                            if browser_tool:
                                (
                                    normalized_text,
                                    _annotations,
                                    _has_partial_citations,
                                ) = browser_tool.normalize_citations(
                                    previous_item.content[0].text
                                )
                            else:
                                normalized_text = previous_item.content[0].text
                                annotations = []
                            text_content = TextContentItem(
                                type="output_text",
                                text=normalized_text,
                                annotations=annotations,
                            )
                            message_id = (
                                self.current_message_item_id
                                if self.current_message_item_id is not None
                                else self._ensure_message_item_id()
                            )
                            yield self._send_event(
                                ResponseOutputTextDone(
                                    type="response.output_text.done",
                                    output_index=current_output_index,
                                    content_index=current_content_index,
                                    item_id=message_id,
                                    text=normalized_text,
                                )
                            )
                            yield self._send_event(
                                ResponseContentPartDone(
                                    type="response.content_part.done",
                                    output_index=current_output_index,
                                    content_index=current_content_index,
                                    item_id=message_id,
                                    part=text_content,
                                )
                            )
                            yield self._send_event(
                                ResponseOutputItemDone(
                                    type="response.output_item.done",
                                    output_index=current_output_index,
                                    item=Item(
                                        id=message_id,
                                        type="message",
                                        role="assistant",
                                        content=[text_content],
                                    ),
                                )
                            )
                            current_annotations = []
                            current_output_text_content = ""
                            self.current_message_item_id = None

                if (
                    self.parser.last_content_delta
                    and self.parser.current_channel == "final"
                    and self.parser.current_recipient is None
                ):
                    if not sent_output_item_added:
                        sent_output_item_added = True
                        message_id = self._ensure_message_item_id()
                        yield self._send_event(
                            ResponseOutputItemAdded(
                                type="response.output_item.added",
                                output_index=current_output_index,
                                item=Item(
                                    id=message_id,
                                    type="message",
                                    role="assistant",
                                    content=[],
                                ),
                            )
                        )
                        yield self._send_event(
                            ResponseContentPartAdded(
                                type="response.content_part.added",
                                output_index=current_output_index,
                                content_index=current_content_index,
                                item_id=message_id,
                                part=TextContentItem(type="output_text", text=""),
                            )
                        )

                    output_delta_buffer += self.parser.last_content_delta
                    should_send_output_text_delta = True
                    if browser_tool:
                        # we normalize on the full current text to get the right indices in citations
                        updated_output_text, annotations, has_partial_citations = (
                            browser_tool.normalize_citations(
                                current_output_text_content + output_delta_buffer
                            )
                        )
                        # remove the current text to get back the delta but now normalized
                        output_delta_buffer = updated_output_text[
                            len(current_output_text_content) :
                        ]

                        # Filter annotations to only include those whose start_index is not already present in current_annotations
                        # this is to avoid sending duplicate annotations as multiple annotations can't be in the same place
                        existing_start_indices = {
                            a["start_index"] for a in current_annotations
                        }
                        new_annotations = [
                            a
                            for a in annotations
                            if a["start_index"] not in existing_start_indices
                        ]
                        for a in new_annotations:
                            current_annotations.append(a)
                            citation = UrlCitation(**a)
                            message_id = self._ensure_message_item_id()
                            yield self._send_event(
                                ResponseOutputTextAnnotationAdded(
                                    type="response.output_text.annotation.added",
                                    output_index=current_output_index,
                                    content_index=current_content_index,
                                    item_id=message_id,
                                    annotation_index=len(current_annotations),
                                    annotation=citation,
                                )
                            )

                        if has_partial_citations:
                            should_send_output_text_delta = False

                    if should_send_output_text_delta:
                        message_id = self._ensure_message_item_id()
                        yield self._send_event(
                            ResponseOutputTextDelta(
                                type="response.output_text.delta",
                                output_index=current_output_index,
                                content_index=current_content_index,
                                item_id=message_id,
                                delta=output_delta_buffer,
                            )
                        )
                        current_output_text_content += output_delta_buffer
                        output_delta_buffer = ""

                if (
                    self.parser.last_content_delta
                    and self.parser.current_channel == "analysis"
                    and self.parser.current_recipient is None
                ):
                    if not sent_output_item_added:
                        sent_output_item_added = True
                        reasoning_id = self._ensure_reasoning_item_id()
                        yield self._send_event(
                            ResponseOutputItemAdded(
                                type="response.output_item.added",
                                output_index=current_output_index,
                                item=ReasoningItem(
                                    id=reasoning_id,
                                    type="reasoning",
                                    summary=[],
                                    content=[],
                                ),
                            )
                        )
                        yield self._send_event(
                            ResponseContentPartAdded(
                                type="response.content_part.added",
                                output_index=current_output_index,
                                content_index=current_content_index,
                                item_id=reasoning_id,
                                part=ReasoningTextContentItem(
                                    type="reasoning_text", text=""
                                ),
                            )
                        )
                    reasoning_id = self._ensure_reasoning_item_id()
                    yield self._send_event(
                        ResponseReasoningTextDelta(
                            type="response.reasoning_text.delta",
                            output_index=current_output_index,
                            content_index=current_content_index,
                            item_id=reasoning_id,
                            delta=self.parser.last_content_delta,
                        )
                    )

                try:
                    # purely for debugging purposes
                    output_token_text = encoding.decode_utf8([next_tok])
                    self.output_text += output_token_text
                    print(output_token_text, end="", flush=True)

                except RuntimeError:
                    pass

                if next_tok in encoding.stop_tokens_for_assistant_actions():
                    if len(self.parser.messages) > 0:
                        last_message = self.parser.messages[-1]
                        browser_recipient, is_browser_fallback = (
                            self._resolve_browser_recipient(last_message.recipient)
                        )
                        if browser_recipient is not None and browser_tool is not None:
                            message_for_browser = (
                                last_message
                                if not is_browser_fallback
                                else last_message.with_recipient(browser_recipient)
                            )
                            function_name = browser_recipient[len("browser.") :]
                            action = None
                            parsed_args = browser_tool.process_arguments(
                                message_for_browser
                            )
                            if function_name == "search":
                                action = WebSearchActionSearch(
                                    type="search",
                                    query=parsed_args["query"],
                                )
                            elif function_name == "open":
                                action = WebSearchActionOpenPage(
                                    type="open_page",
                                    url=(
                                        parsed_args["url"]
                                        if "url" in parsed_args
                                        else None
                                    ),
                                )
                            elif function_name == "find":
                                action = WebSearchActionFind(
                                    type="find",
                                    pattern=parsed_args["pattern"],
                                    url=(
                                        parsed_args["url"]
                                        if "url" in parsed_args
                                        else None
                                    ),
                                )

                            if action is not None:
                                web_search_call_id = f"ws_{uuid.uuid4().hex}"
                                self.browser_call_ids.append(web_search_call_id)
                                yield self._send_event(
                                    ResponseOutputItemAdded(
                                        type="response.output_item.added",
                                        output_index=current_output_index,
                                        item=WebSearchCallItem(
                                            type="web_search_call",
                                            id=web_search_call_id,
                                            action=action,
                                        ),
                                    )
                                )
                            yield self._send_event(
                                ResponseWebSearchCallInProgress(
                                    type="response.web_search_call.in_progress",
                                    output_index=current_output_index,
                                    item_id=web_search_call_id,
                                )
                            )

                            async def run_tool():
                                results = []
                                async for msg in browser_tool.process(
                                    message_for_browser
                                ):
                                    results.append(msg)
                                return results

                            yield self._send_event(
                                ResponseWebSearchCallSearching(
                                    type="response.web_search_call.searching",
                                    output_index=current_output_index,
                                    item_id=web_search_call_id,
                                )
                            )
                            result = await run_tool()

                            new_tokens = encoding.render_conversation_for_completion(
                                Conversation.from_messages(result), Role.ASSISTANT
                            )

                            print(encoding.decode_utf8(new_tokens))
                            self.output_tokens.append(next_tok)
                            self.tokens.append(
                                encoding.encode("<|end|>", allowed_special="all")[0]
                            )

                            for token in new_tokens:
                                self.parser.process(token)
                                self.output_tokens.append(token)
                                self.tokens.append(token)

                            yield self._send_event(
                                ResponseWebSearchCallCompleted(
                                    type="response.web_search_call.completed",
                                    output_index=current_output_index,
                                    item_id=web_search_call_id,
                                )
                            )
                            yield self._send_event(
                                ResponseOutputItemDone(
                                    type="response.output_item.done",
                                    output_index=current_output_index,
                                    item=WebSearchCallItem(
                                        type="web_search_call",
                                        id=web_search_call_id,
                                        action=action,
                                    ),
                                )
                            )

                            current_output_index += 1
                            self.new_request = True

                            continue

                        elif (
                            self.use_code_interpreter
                            and last_message.recipient is not None
                            and (
                                last_message.recipient.startswith("python")
                                or (
                                    self.functions_python_as_builtin
                                    and last_message.recipient == "functions.python"
                                )
                            )
                        ):
                            code_call_id = f"ci_{uuid.uuid4().hex}"
                            code_snippet = None
                            if (
                                last_message.content
                                and len(last_message.content) > 0
                                and getattr(last_message.content[0], "text", None)
                            ):
                                text_value = last_message.content[0].text or ""
                                code_snippet = text_value if text_value.strip() else None

                            self.python_call_ids.append(code_call_id)
                            yield self._send_event(
                                ResponseOutputItemAdded(
                                    type="response.output_item.added",
                                    output_index=current_output_index,
                                    item=CodeInterpreterCallItem(
                                        type="code_interpreter_call",
                                        id=code_call_id,
                                        status="in_progress",
                                        code=code_snippet,
                                    ),
                                )
                            )
                            yield self._send_event(
                                ResponseCodeInterpreterCallInProgress(
                                    type="response.code_interpreter_call.in_progress",
                                    output_index=current_output_index,
                                    item_id=code_call_id,
                                )
                            )
                            if code_snippet:
                                yield self._send_event(
                                    ResponseCodeInterpreterCallCodeDelta(
                                        type="response.code_interpreter_call_code.delta",
                                        output_index=current_output_index,
                                        item_id=code_call_id,
                                        delta=code_snippet,
                                    )
                                )
                                yield self._send_event(
                                    ResponseCodeInterpreterCallCodeDone(
                                        type="response.code_interpreter_call_code.done",
                                        output_index=current_output_index,
                                        item_id=code_call_id,
                                        code=code_snippet,
                                    )
                                )
                            yield self._send_event(
                                ResponseCodeInterpreterCallInterpreting(
                                    type="response.code_interpreter_call.interpreting",
                                    output_index=current_output_index,
                                    item_id=code_call_id,
                                )
                            )

                            async def run_python_tool():
                                results = []
                                async for msg in self.python_tool.process(last_message):
                                    results.append(msg)
                                return results

                            result = await run_python_tool()

                            print(result)

                            code_outputs: list[
                                CodeInterpreterOutputLogs | CodeInterpreterOutputImage
                            ] = []
                            for message in result:
                                for content in getattr(message, "content", []):
                                    text_value = getattr(content, "text", None)
                                    if text_value:
                                        code_outputs.append(
                                            CodeInterpreterOutputLogs(
                                                type="logs",
                                                logs=text_value,
                                            )
                                        )

                            self.python_call_outputs[code_call_id] = code_outputs

                            new_tokens = encoding.render_conversation_for_completion(
                                Conversation.from_messages(result), Role.ASSISTANT
                            )

                            print(encoding.decode_utf8(new_tokens))
                            self.output_tokens.append(next_tok)
                            self.tokens.append(
                                encoding.encode("<|end|>", allowed_special="all")[0]
                            )

                            for token in new_tokens:
                                self.parser.process(token)
                                self.output_tokens.append(token)
                                self.tokens.append(token)

                            yield self._send_event(
                                ResponseCodeInterpreterCallCompleted(
                                    type="response.code_interpreter_call.completed",
                                    output_index=current_output_index,
                                    item_id=code_call_id,
                                )
                            )
                            yield self._send_event(
                                ResponseOutputItemDone(
                                    type="response.output_item.done",
                                    output_index=current_output_index,
                                    item=CodeInterpreterCallItem(
                                        type="code_interpreter_call",
                                        id=code_call_id,
                                        status="completed",
                                        code=code_snippet,
                                        outputs=code_outputs or None,
                                    ),
                                )
                            )

                            current_output_index += 1
                            self.new_request = True

                            continue

                        else:
                            break
                    else:
                        raise ValueError("No messages to process")
                if len(self.output_tokens) >= self.request_body.max_output_tokens:
                    break

                # Adding in the end if we know we are not done
                self.output_tokens.append(next_tok)

            if self.request is None or not await self.request.is_disconnected():
                response = generate_response(
                    self.initial_tokens,
                    self.output_tokens,
                    self.request_body,
                    debug_mode=self.debug_mode,
                    function_call_ids=self.function_call_ids,
                    response_id=self.response_id,
                    previous_response_id=self.request_body.previous_response_id,
                    browser_tool=self.browser_tool,
                    browser_call_ids=self.browser_call_ids,
                    python_tool=self.python_tool,
                    python_call_ids=self.python_call_ids,
                    python_call_outputs=self.python_call_outputs,
                    reasoning_ids=self.reasoning_item_ids,
                    message_ids=self.message_item_ids,
                    treat_functions_python_as_builtin=self.functions_python_as_builtin,
                )
                if self.store_callback and self.request_body.store:
                    self.store_callback(self.response_id, self.request_body, response)
                yield self._send_event(
                    ResponseCompletedEvent(
                        type="response.completed",
                        response=response,
                    )
                )

    @app.post("/v1/responses", response_model=ResponseObject)
    async def generate(body: ResponsesRequest, request: Request):
        print("request received")
        print(body.reasoning)

        use_browser_tool = any(
            getattr(tool, "type", None) in ("browser_search", "web_search")
            for tool in (body.tools or [])
        )
        use_code_interpreter = any(
            getattr(tool, "type", None) == "code_interpreter"
            for tool in (body.tools or [])
        )

        if use_browser_tool:
            tool_backend = os.getenv("BROWSER_BACKEND", "exa")
            if tool_backend == "youcom":
                backend = YouComBackend(source="web")
            elif tool_backend == "exa":
                backend = ExaBackend(source="web")
            else:
                raise ValueError(f"Invalid tool backend: {tool_backend}")
            browser_tool = SimpleBrowserTool(backend=backend)
        else:
            browser_tool = None

        if use_code_interpreter:
            python_tool = PythonTool()
        else:
            python_tool = None

        python_function_name_conflict = any(
            getattr(tool, "type", None) == "function"
            and getattr(tool, "name", None) == "python"
            for tool in (body.tools or [])
        )
        functions_python_as_builtin = use_code_interpreter and not (
            python_function_name_conflict
        )

        if body.previous_response_id:
            prev = responses_store.get(body.previous_response_id)
            if prev:
                prev_req, prev_resp = prev

                def _ensure_list(inp):
                    if isinstance(inp, str):
                        return [
                            Item(
                                type="message",
                                role="user",
                                content=[TextContentItem(type="input_text", text=inp)],
                            )
                        ]
                    return list(inp)

                merged_input = _ensure_list(prev_req.input) + list(prev_resp.output)
                merged_input.extend(_ensure_list(body.input))

                if body.instructions is None:
                    body.instructions = prev_req.instructions
                body.input = merged_input

        system_message_content = SystemContent.new().with_conversation_start_date(
            datetime.datetime.now().strftime("%Y-%m-%d")
        )

        if body.reasoning is not None:
            try:

                reasoning_effort = get_reasoning_effort(body.reasoning.effort)
            except ValueError as e:
                from fastapi import HTTPException
                print(e)

                raise HTTPException(status_code=422, detail=str(e))
            system_message_content = system_message_content.with_reasoning_effort(
                reasoning_effort
            )

        if use_browser_tool:
            system_message_content = system_message_content.with_tools(
                browser_tool.tool_config
            )
        if use_code_interpreter:
            system_message_content = system_message_content.with_tools(
                python_tool.tool_config
            )

        system_message = Message.from_role_and_content(
            Role.SYSTEM, system_message_content
        )
        messages = [system_message]

        if body.instructions or body.tools:
            developer_message_content = DeveloperContent.new().with_instructions(
                body.instructions
            )

            tools = []
            for tool in body.tools:
                if tool.type == "function":
                    tools.append(
                        ToolDescription.new(
                            tool.name,
                            tool.description,
                            tool.parameters,
                        )
                    )

            if tools:
                developer_message_content = (
                    developer_message_content.with_function_tools(tools)
                )

            developer_message = Message.from_role_and_content(
                Role.DEVELOPER, developer_message_content
            )

            messages.append(developer_message)

        if isinstance(body.input, str):
            user_message = Message.from_role_and_content(Role.USER, body.input)
            messages.append(user_message)
        else:
            is_last_message_function_call_output = (
                len(body.input) > 0 and body.input[-1].type == "function_call_output"
            )
            function_call_map = {}
            # Find the index of the last assistant message
            last_assistant_idx = -1
            for idx, item in enumerate(body.input):
                if item.type == "message" and item.role == Role.ASSISTANT:
                    last_assistant_idx = idx

            for idx, item in enumerate(body.input):
                if item.type == "message":
                    # TODO: add system prompt handling
                    if isinstance(item.content, str):
                        messages.append(
                            Message.from_role_and_content(item.role, item.content)
                        )
                    else:
                        for content_item in item.content:
                            messages.append(
                                Message.from_role_and_content(
                                    item.role, content_item.text
                                )
                            )
                    # add final channel to the last assistant message if it's from the assistant
                    if item.role == Role.ASSISTANT:
                        messages[-1] = messages[-1].with_channel("final")
                elif item.type == "reasoning":
                    # Only include reasoning if it is after the last assistant message and we are handling a function call at the moment
                    if (
                        idx > last_assistant_idx
                        and is_last_message_function_call_output
                    ):
                        for content_item in item.content:
                            messages.append(
                                Message.from_role_and_content(
                                    Role.ASSISTANT, content_item.text
                                ).with_channel("analysis")
                            )
                elif item.type == "function_call":
                    function_call_map[item.call_id] = item
                    messages.append(
                        Message.from_role_and_content(Role.ASSISTANT, item.arguments)
                        .with_recipient(f"functions.{item.name}")
                        .with_channel("commentary")
                    )
                elif item.type == "function_call_output":
                    function_call = function_call_map.get(item.call_id, None)
                    if not function_call:
                        raise ValueError(f"Function call {item.call_id} not found")

                    messages.append(
                        Message.from_author_and_content(
                            Author.new(Role.TOOL, f"functions.{function_call.name}"),
                            item.output,
                        )
                        .with_recipient("assistant")
                        .with_channel("commentary")
                    )

        conversation = Conversation.from_messages(messages)

        initial_tokens = encoding.render_conversation_for_completion(
            conversation, Role.ASSISTANT
        )
        print(encoding.decode_utf8(initial_tokens))
        response_id = f"resp_{uuid.uuid4().hex}"

        def store_callback(rid: str, req: ResponsesRequest, resp: ResponseObject):
            responses_store[rid] = (req, resp)

        event_stream = StreamResponsesEvents(
            initial_tokens,
            body,
            as_sse=body.stream,
            request=request,
            response_id=response_id,
            store_callback=store_callback,
            browser_tool=browser_tool,
            python_tool=python_tool,
            functions_python_as_builtin=functions_python_as_builtin,
        )

        if body.stream:
            return StreamingResponse(event_stream.run(), media_type="text/event-stream")
        else:
            last_event = None
            async for event in event_stream.run():
                last_event = event

            return last_event.response

    return app


================================================
FILE: gpt_oss/responses_api/events.py
================================================
# torchrun --nproc-per-node=4 responses_api.py
from typing import Literal, Optional, Union

from pydantic import BaseModel

from .types import (
    CodeInterpreterCallItem,
    CodeInterpreterOutputImage,
    CodeInterpreterOutputLogs,
    FunctionCallItem,
    Item,
    ReasoningItem,
    ReasoningTextContentItem,
    ResponseObject,
    TextContentItem,
    UrlCitation,
    WebSearchCallItem,
)


class ResponseEvent(BaseModel):
    sequence_number: Optional[int] = 1


class ResponseCreatedEvent(ResponseEvent):
    type: Literal["response.created"]
    response: ResponseObject


class ResponseCompletedEvent(ResponseEvent):
    type: Literal["response.completed"]
    response: ResponseObject


class ResponseOutputTextDelta(ResponseEvent):
    type: Literal["response.output_text.delta"] = "response.output_text.delta"
    item_id: str = "item_1234"
    output_index: int = 0
    content_index: int = 0
    delta: str = ""
    logprobs: list = []


class ResponseReasoningSummaryTextDelta(ResponseEvent):
    type: Literal["response.reasoning_summary_text.delta"] = (
        "response.reasoning_summary_text.delta"
    )
    item_id: str = "item_1234"
    output_index: int = 0
    content_index: int = 0
    delta: str = ""


class ResponseReasoningTextDelta(ResponseEvent):
    type: Literal["response.reasoning_text.delta"] = "response.reasoning_text.delta"
    item_id: str = "item_1234"
    output_index: int = 0
    content_index: int = 0
    delta: str = ""


class ResponseReasoningTextDone(ResponseEvent):
    type: Literal["response.reasoning_text.done"] = "response.reasoning_text.done"
    item_id: str = "item_1234"
    output_index: int = 0
    content_index: int = 0
    text: str = ""


class ResponseOutputItemAdded(ResponseEvent):
    type: Literal["response.output_item.added"] = "response.output_item.added"
    output_index: int = 0
    item: Union[
        Item,
        ReasoningItem,
        FunctionCallItem,
        WebSearchCallItem,
        CodeInterpreterCallItem,
    ]


class ResponseOutputItemDone(ResponseEvent):
    type: Literal["response.output_item.done"] = "response.output_item.done"
    output_index: int = 0
    item: Union[
        Item,
        ReasoningItem,
        FunctionCallItem,
        WebSearchCallItem,
        CodeInterpreterCallItem,
    ]


class ResponseInProgressEvent(ResponseEvent):
    type: Literal["response.in_progress"]
    response: ResponseObject


class ResponseContentPartAdded(ResponseEvent):
    type: Literal["response.content_part.added"] = "response.content_part.added"
    item_id: str = "item_1234"
    output_index: int = 0
    content_index: int = 0
    part: Union[TextContentItem, ReasoningTextContentItem]


class ResponseOutputTextDone(ResponseEvent):
    type: Literal["response.output_text.done"] = "response.output_text.done"
    item_id: str = "item_1234"
    output_index: int = 0
    content_index: int = 0
    text: str = ""
    logprobs: list = []


class ResponseContentPartDone(ResponseEvent):
    type: Literal["response.content_part.done"] = "response.content_part.done"
    item_id: str = "item_1234"
    output_index: int = 0
    content_index: int = 0
    part: Union[TextContentItem, ReasoningTextContentItem]


class ResponseOutputTextAnnotationAdded(ResponseEvent):
    type: Literal["response.output_text.annotation.added"] = (
        "response.output_text.annotation.added"
    )
    item_id: str = "item_1234"
    output_index: int = 0
    content_index: int = 0
    annotation_index: int = 0
    annotation: UrlCitation


class ResponseWebSearchCallInProgress(ResponseEvent):
    type: Literal["response.web_search_call.in_progress"] = (
        "response.web_search_call.in_progress"
    )
    output_index: int = 0
    item_id: str = "item_1234"


class ResponseWebSearchCallSearching(ResponseEvent):
    type: Literal["response.web_search_call.searching"] = (
        "response.web_search_call.searching"
    )
    output_index: int = 0
    item_id: str = "item_1234"


class ResponseWebSearchCallCompleted(ResponseEvent):
    type: Literal["response.web_search_call.completed"] = (
        "response.web_search_call.completed"
    )
    output_index: int = 0
    item_id: str = "item_1234"


class ResponseCodeInterpreterCallInProgress(ResponseEvent):
    type: Literal["response.code_interpreter_call.in_progress"] = (
        "response.code_interpreter_call.in_progress"
    )
    output_index: int = 0
    item_id: str = "item_1234"


class ResponseCodeInterpreterCallInterpreting(ResponseEvent):
    type: Literal["response.code_interpreter_call.interpreting"] = (
        "response.code_interpreter_call.interpreting"
    )
    output_index: int = 0
    item_id: str = "item_1234"


class ResponseCodeInterpreterCallCodeDelta(ResponseEvent):
    type: Literal["response.code_interpreter_call_code.delta"] = (
        "response.code_interpreter_call_code.delta"
    )
    output_index: int = 0
    item_id: str = "item_1234"
    delta: str = ""
    code_output: Optional[
        Union[CodeInterpreterOutputLogs, CodeInterpreterOutputImage]
    ] = None


class ResponseCodeInterpreterCallCodeDone(ResponseEvent):
    type: Literal["response.code_interpreter_call_code.done"] = (
        "response.code_interpreter_call_code.done"
    )
    output_index: int = 0
    item_id: str = "item_1234"
    code: str = ""
    outputs: Optional[
        list[Union[CodeInterpreterOutputLogs, CodeInterpreterOutputImage]]
    ] = None


class ResponseCodeInterpreterCallCompleted(ResponseEvent):
    type: Literal["response.code_interpreter_call.completed"] = (
        "response.code_interpreter_call.completed"
    )
    output_index: int = 0
    item_id: str = "item_1234"


================================================
FILE: gpt_oss/responses_api/inference/__init__.py
================================================


================================================
FILE: gpt_oss/responses_api/inference/metal.py
================================================
"""Metal backend for :mod:`gpt_oss.responses_api`."""

from typing import Callable

from gpt_oss.metal import Context, Model


# Tunables
MAX_OUTPUT_TOKENS = 100


def setup_model(checkpoint: str) -> Callable[[list[int], float], int]:
    """Load the Metal model and return an inference function."""

    model = Model(checkpoint)
    context = Context(model)

    seed = 0
    output_tokens = []

    def infer_next_token(
        tokens: list[int], temperature: float = 0.0, new_request: bool = False
    ) -> int:
        """Infer next token using incremental LCP caching when possible."""
        nonlocal output_tokens

        if new_request:
            output_tokens = []

        if len(output_tokens) == 0:
            # Context handles LCP caching internally; if `tokens` matches the
            # tokens in the KV cache, the KV cache is reused after reset+append.
            context.reset()
            for t in tokens:
                context.append(t)

            output_tokens = context.sample(max_output_tokens=MAX_OUTPUT_TOKENS,
                                           temperature=temperature,
                                           seed=seed)

        return int(output_tokens.pop(0))

    return infer_next_token


================================================
FILE: gpt_oss/responses_api/inference/ollama.py
================================================
"""
NOTE: this is a stitched together implementation that uses Ollama for inference. It's primarily used
for testing and development. It does not leverage any prompt caching or other optimizations and
can therefore be slow between turns.
"""

import json
import threading
import time
from typing import Callable, Optional

import requests
from openai_harmony import HarmonyEncodingName, load_harmony_encoding

EOS_TOKEN = 200002  # only used on hard timeout

# Tunables
POLL_INTERVAL_S = 0.01  # 10ms between buffer checks
CALL_MAX_WAIT_S = 0.250  # max time to block inside a single infer call
NO_TOKEN_TIMEOUT_S = 15.0  # overall inactivity timeout before emitting EOS
FIRST_BYTE_TIMEOUT_S = 30.0  # time to wait for first token before EOS

# Shared state
_token_buffer: list[int] = []
_buffer_lock = threading.Lock()
_stream_thread: Optional[threading.Thread] = None
_stream_done = threading.Event()
_stream_error: Optional[Exception] = None
_last_progress_ts: float = 0.0  # updated whenever we enqueue or dequeue tokens
_previous_request_tokens: list[int] = []


def lcp(cache: list[int], inp: list[int]) -> list[int]:
    i = 0
    max_len = min(len(cache), len(inp))
    while i < max_len and cache[i] == inp[i]:
        i += 1
    return cache[:i]


def _now():
    return time.monotonic()


def _touch_progress():
    global _last_progress_ts
    _last_progress_ts = _now()


def _reset_stream_state():
    global _token_buffer, _stream_thread, _stream_error
    with _buffer_lock:
        _token_buffer = []
    _stream_done.clear()
    _stream_thread = None
    _stream_error = None
    _touch_progress()


def setup_model(checkpoint: str) -> Callable[[list[int], float, bool], int]:
    encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)
    model_name = checkpoint

    def _start_stream(token_ids: list[int], temperature: float):
        prompt_text = encoding.decode(token_ids)

        def run():
            nonlocal prompt_text, temperature
            global _stream_error
            global _previous_request_tokens

            accum_text = ""
            last_len = 0  # number of tokens already emitted

            try:
                url = "http://localhost:11434/api/generate"

                payload = {
                    "model": model_name,
                    "prompt": prompt_text,
                    "stream": True,
                    "options": {"temperature": temperature},
                    "raw": True,
                }

                with requests.post(url, json=payload, stream=True, timeout=60) as resp:
                    resp.raise_for_status()
                    for line in resp.iter_lines(decode_unicode=True):
                        if not line:
                            continue
                        obj = json.loads(line)

                        if isinstance(obj.get("response"), str):
                            accum_text += obj["response"]
                            toks = encoding.encode(accum_text, allowed_special="all")
                            if len(toks) > last_len:
                                new_toks = toks[last_len:]
                                with _buffer_lock:
                                    _token_buffer.extend(new_toks)
                                last_len = len(toks)
                                _touch_progress()

                        if obj.get("done", False):
                            _token_buffer.append(EOS_TOKEN)
                            last_len = len(toks)
                            _touch_progress()
                            break

                _stream_done.set()

            except Exception as e:
                _stream_error = e
                _stream_done.set()

        t = threading.Thread(target=run, name="ollama-stream", daemon=True)
        t.start()
        return t

    def infer_next_token(
        tokens: list[int], temperature: float = 0.0, new_request: bool = False
    ) -> int:
        """
        - Starts a new Ollama stream on new_request.
        - Forwards tokens as they arrive.
        - Only emits EOS_TOKEN if we exceed an inactivity timeout.
        """
        global _stream_thread

        if new_request:
            _reset_stream_state()
            _stream_thread = _start_stream(token_ids=tokens, temperature=temperature)
            # Wait for first byte within FIRST_BYTE_TIMEOUT_S (without emitting EOS early)
            start = _now()
            while _now() - start < FIRST_BYTE_TIMEOUT_S:
                with _buffer_lock:
                    if _token_buffer:
                        tok = _token_buffer.pop(0)
                        _touch_progress()
                        return tok
                if _stream_error is not None:
                    raise RuntimeError(f"Ollama stream error: {_stream_error!r}")
                # If Ollama finished instantly with no output, continue loop until timeout
                time.sleep(POLL_INTERVAL_S)
            # Hard first-byte timeout -> emit EOS so the server can stop this request
            return EOS_TOKEN

        if _stream_error is not None:
            raise RuntimeError(f"Ollama stream error: {_stream_error!r}")

        # Normal path: wait up to CALL_MAX_WAIT_S for a token to arrive
        wait_start = _now()
        while _now() - wait_start < CALL_MAX_WAIT_S:
            with _buffer_lock:
                if _token_buffer:
                    tok = _token_buffer.pop(0)
                    _touch_progress()
                    return tok
            # No token yet; if we've been idle too long overall, end with EOS
            if _now() - _last_progress_ts > NO_TOKEN_TIMEOUT_S:
                return EOS_TOKEN
            time.sleep(POLL_INTERVAL_S)

        # Still no token in this call slice. Do NOT send EOS unless we've timed out.
        if _now() - _last_progress_ts > NO_TOKEN_TIMEOUT_S:
            return EOS_TOKEN

        # Tell caller to call us again; block minimally by returning *nothing new*.
        # We must return an int; safest is to wait a tiny bit longer for a token.
        # If still none, keep returning only after short waits. Avoid EOS here.
        # One more short wait to reduce hot-looping:
        time.sleep(POLL_INTERVAL_S)
        with _buffer_lock:
            if _token_buffer:
                tok = _token_buffer.pop(0)
                _touch_progress()
                return tok

        # As a last resort for this call slice, return EOS only on true inactivity timeout.
        if _now() - _last_progress_ts > NO_TOKEN_TIMEOUT_S:
            return EOS_TOKEN

        # If we reach here, we still haven't got a token—ask the caller to call again soon.
        # Return a harmless token that the server will replace/ignore if your interface supports it.
        # If your interface does NOT allow a sentinel, keep the short-blocking behavior above.
        return (
            EOS_TOKEN if False else 0
        )  # replace `0` with a PAD/NOOP token your server ignores

    return infer_next_token


================================================
FILE: gpt_oss/responses_api/inference/stub.py
================================================
import time
from typing import Callable

fake_tokens = [
    200005,
    35644,
    200008,
    23483,
    316,
    1199,
    1114,
    717,
    170154,
    13,
    200007,
    200006,
    173781,
    200005,
    35644,
    316,
    28,
    44580,
    775,
    170154,
    464,
    91,
    542,
    141043,
    91,
    29,
    4108,
    200008,
    10848,
    7693,
    7534,
    28499,
    18826,
    18583,
    200012,
]
fake_tokens = [
    200005,
    35644,
    200008,
    1844,
    31064,
    25,
    392,
    4827,
    382,
    220,
    17,
    659,
    220,
    17,
    16842,
    12295,
    81645,
    13,
    51441,
    6052,
    13,
    200007,
    200006,
    173781,
    200005,
    17196,
    200008,
    17,
    659,
    220,
    17,
    314,
    220,
    19,
    13,
    9552,
    238,
    242,
    200002,
]
# fake_tokens = [200005, 35644, 200008, 976, 1825, 31064, 25, 392, 25216, 29400, 290, 11122, 306, 52768, 2117, 16842, 1416, 1309, 316, 2281, 198, 68, 290, 2208, 11122, 13, 1416, 679, 261, 1114, 717, 170154, 484, 44390, 261, 5100, 1621, 26, 581, 1757, 2005, 198, 75, 480, 483, 5100, 392, 137956, 2117, 11, 13180, 4050, 7801, 4733, 290, 11122, 5377, 484, 290, 1114, 7377, 13, 1416, 1309, 260, 198, 78, 1199, 290, 1114, 4584, 364, 58369, 2421, 717, 170154, 483, 5100, 392, 137956, 2117, 11, 13180, 4050, 200007, 200006, 173781, 200005, 12606, 815, 260, 198, 78, 28, 117673, 3490]
# fake_tokens = [
#     198,
#     200005,
#     35644,
#     200008,
#     23483,
#     316,
#     1199,
#     1114,
#     717,
#     170154,
#     13,
#     200007,
#     200006,
#     173781,
#     200005,
#     12606,
#     815,
#     316,
#     32455,
#     106847,
#     316,
#     28,
#     44580,
#     775,
#     170154,
#     464,
#     91,
#     542,
#     141043,
#     91,
#     29,
#     4108,
#     200008,
#     10848,
#     7693,
#     7534,
#     28499,
#     18826,
#     18583,
#     200012,
#     198,
# ]

token_queue = fake_tokens.copy()


def stub_infer_next_token(
    tokens: list[int], temperature: float = 0.0, new_request: bool = False
) -> int:
    global token_queue
    next_tok = token_queue.pop(0)
    if len(token_queue) == 0:
        token_queue = fake_tokens.copy()
    time.sleep(0.1)
    return next_tok


def setup_model(_checkpoint: str) -> Callable[[list[int], float], int]:
    return stub_infer_next_token


================================================
FILE: gpt_oss/responses_api/inference/transformers.py
================================================
"""
NOTE: this is not the most efficient way to use transformers. It's a simple implementation that infers
one token at a time to mimic the behavior of the Triton implementation.
"""

import os
from typing import Callable, List

# Transformers imports
from transformers import AutoModelForCausalLM, PreTrainedModel
import torch


DEFAULT_TEMPERATURE = 0.0
TP = os.environ.get("TP", 2)

def load_model(checkpoint: str):
    """
    Serve the model directly with the Auto API.
    """

    model = AutoModelForCausalLM.from_pretrained(
        checkpoint,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )

    return model


def get_infer_next_token(model: PreTrainedModel):
    """
    Return a callable with the same shape as the original triton implementation:
      infer_next_token(tokens: List[int], temperature: float, new_request: bool) -> int

    Implementation detail:
      - We issue a single-token generation with using model.generate
      - generate handles sampling (temperature=0 => greedy, otherwise, sampling).
    """

    def infer_next_token(
        tokens: List[int],
        temperature: float = DEFAULT_TEMPERATURE,
        new_request: bool = False, # kept for interface compatibility; unused here
    ) -> int:
        tokens = torch.tensor([tokens], dtype=torch.int64, device=model.device)
        output = model.generate(tokens, max_new_tokens=1, do_sample=temperature != 0, temperature=temperature)
        return output[0, -1].tolist()

    return infer_next_token


def setup_model(checkpoint: str) -> Callable[[List[int], float, bool], int]:
    model = load_model(checkpoint)
    infer_next_token = get_infer_next_token(model)
    return infer_next_token


================================================
FILE: gpt_oss/responses_api/inference/triton.py
================================================
import datetime
import os
from typing import Callable

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
import torch.distributed as dist

from gpt_oss.triton.model import Cache, ModelConfig, Transformer

DEFAULT_TEMPERATURE = 0.0
CONTEXT = 16_384
CONCURRENT_SESSIONS = 1

rank = int(
    os.environ.get("RANK", 0)
)  # set this env var to another value to run on other GPUs


def load_model(checkpoint: str):
    print(f"[{rank}] loading model...")

    torch.cuda.set_device(rank)
    torch.set_grad_enabled(False)
    device = torch.device(f"cuda:{rank}")

    # Load model
    model = Transformer.from_checkpoint(checkpoint, device=device)

    print(f"[{rank}] loaded")
    return model, device


def get_infer_next_token(model, device):
    caches = [
        Cache(CONCURRENT_SESSIONS, CONTEXT, model.config.num_key_value_heads)
        for _ in range(len(model.block))
    ]
    # offsets = torch.zeros(CONCURRENT_SESSIONS, dtype=torch.int32, device=device) # TBD
    input_token = torch.zeros(
        1, dtype=torch.int32, device=device
    )  # add concurrent sessions support
    tokens_so_far = []

    model.prefill(torch.zeros(1, 4, dtype=torch.int32, device=device), caches)
    graph = torch.cuda.CUDAGraph()
    with torch.cuda.graph(graph):
        logits = model(input_token[None, :], caches=caches)[0]

    def lcp(cache: list[int], inp: list[int]) -> list[int]:
        i = 0
        max_len = min(len(cache), len(inp))
        while i < max_len and cache[i] == inp[i]:
            i += 1
        return cache[:i]

    def sample_next_token(
        logits: torch.Tensor, temperature: float = DEFAULT_TEMPERATURE
    ) -> int:
        """Executed only on rank 0."""
        if temperature == 0.0:
            return torch.argmax(logits[-1, :], dim=-1).item()
        probs = torch.softmax(logits * (1.0 / temperature), dim=-1)
        return torch.multinomial(probs[-1, :], num_samples=1).item()

    @torch.inference_mode()
    def infer_next_token(
        tokens: list[int],
        temperature: float = DEFAULT_TEMPERATURE,
        new_request: bool = False,
    ) -> int:
        nonlocal tokens_so_far
        tokens_so_far = lcp(tokens_so_far, tokens)
        for cache in caches:
            cache.truncate(len(tokens_so_far))
        all_tokens = tokens  # for pdb
        tokens = tokens[len(tokens_so_far) :]

        if len(tokens) > 1:
            model.prefill(
                torch.as_tensor(tokens[:-1], dtype=torch.int32, device=device)[None, :],
                caches,
            )

        if len(tokens) == 0:
            breakpoint()

        input_token[-1] = tokens[-1]
        graph.replay()

        # decide next token on rank‑0
        next_tok = sample_next_token(logits, temperature=temperature)

        return next_tok

    return infer_next_token


def setup_model(checkpoint: str) -> Callable[[list[int], float], int]:
    model, device = load_model(checkpoint)
    infer_next_token = get_infer_next_token(model, device)
    return infer_next_token


================================================
FILE: gpt_oss/responses_api/inference/vllm.py
================================================
"""
NOTE: this is not the most efficient way to use vLLM. It's a simple implementation that infers 
one token at a time to mimic the behavior of the Triton implementation. 
"""

import os
from typing import Callable, List, Optional

# vLLM imports
from vllm import LLM, SamplingParams
from vllm.inputs import TokensPrompt

DEFAULT_TEMPERATURE = 0.0
TP = os.environ.get("TP", 2)

def load_model(checkpoint: str):
    """
    Create the vLLM engine. We enable prefix caching so repeated prefixes
    across calls can reuse KV cache for faster prefill.
    """

    llm = LLM(
        model=checkpoint,
        tensor_parallel_size=TP,          # set >1 if you want TP across GPUs
        enable_prefix_caching=True,      # reuse KV for shared prefixes
        disable_log_stats=True,        # uncomment to quiet logs
    )

    return llm


def get_infer_next_token(llm: LLM):
    """
    Return a callable with the same shape as your original:
      infer_next_token(tokens: List[int], temperature: float, new_request: bool) -> int

    Implementation detail:
      - We issue a single-token generation with TokensPrompt(prompt_token_ids=tokens).
      - vLLM handles sampling (temperature=0 => greedy).
      - With enable_prefix_caching=True, the shared prefix prefill can be reused
        across calls that share the same prefix.
    """

    # Maintain compatibility with your previous closure signature.
    def infer_next_token(
        tokens: List[int],
        temperature: float = DEFAULT_TEMPERATURE,
        new_request: bool = False,  # kept for interface compatibility; unused here
    ) -> int:
        if not tokens:
            raise ValueError("tokens must contain at least one input token id")

        sampling = SamplingParams(
            temperature=float(temperature),
            max_tokens=1,            # we only want the next token
            n=1,                     # single continuation
            # You can expose/enable more controls here (top_p, top_k, etc.)
        )

        # Provide token IDs directly (no re-tokenization).
        outputs = llm.generate(
            TokensPrompt(prompt_token_ids=tokens),
            sampling_params=sampling,
        )

        if not outputs or not outputs[0].outputs:
            raise RuntimeError("vLLM returned empty outputs")

        gen = outputs[0].outputs[0]
        if not gen.token_ids:
            # If the model immediately finished (e.g., EOS), decide how you'd like
            # to signal that. Here we raise; you could also return an EOS id.
            raise RuntimeError("No next token was generated (possibly EOS).")

        next_tok = int(gen.token_ids[0])
        return next_tok

    return infer_next_token


def setup_model(checkpoint: str) -> Callable[[List[int], float, bool], int]:
    llm = load_model(checkpoint)
    infer_next_token = get_infer_next_token(llm)
    return infer_next_token


================================================
FILE: gpt_oss/responses_api/serve.py
================================================
# torchrun --nproc-per-node=4 serve.py

import argparse

import uvicorn
from openai_harmony import (
    HarmonyEncodingName,
    load_harmony_encoding,
)

from .api_server import create_api_server

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Responses API server")
    parser.add_argument(
        "--checkpoint",
        metavar="FILE",
        type=str,
        help="Path to the SafeTensors checkpoint",
        default="~/model",
        required=False,
    )
    parser.add_argument(
        "--port",
        metavar="PORT",
        type=int,
        default=8000,
        help="Port to run the server on",
    )
    parser.add_argument(
        "--inference-backend",
        metavar="BACKEND",
        type=str,
        help="Inference backend to use",
        # default to metal on macOS, triton on other platforms
        default="metal" if __import__("platform").system() == "Darwin" else "triton",
    )
    args = parser.parse_args()

    if args.inference_backend == "triton":
        from .inference.triton import setup_model
    elif args.inference_backend == "stub":
        from .inference.stub import setup_model
    elif args.inference_backend == "metal":
        from .inference.metal import setup_model
    elif args.inference_backend == "ollama":
        from .inference.ollama import setup_model
    elif args.inference_backend == "vllm":
        from .inference.vllm import setup_model
    elif args.inference_backend == "transformers":
        from .inference.transformers import setup_model
    else:
        raise ValueError(f"Invalid inference backend: {args.inference_backend}")

    encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)

    infer_next_token = setup_model(args.checkpoint)
    uvicorn.run(create_api_server(infer_next_token, encoding), port=args.port)


================================================
FILE: gpt_oss/responses_api/types.py
================================================
from typing import Any, Dict, Literal, Optional, Union

from openai_harmony import ReasoningEffort
from pydantic import BaseModel, ConfigDict

MODEL_IDENTIFIER = "gpt-oss-120b"
DEFAULT_TEMPERATURE = 0.0
REASONING_EFFORT = ReasoningEffort.LOW
DEFAULT_MAX_OUTPUT_TOKENS = 131072


class UrlCitation(BaseModel):
    type: Literal["url_citation"]
    end_index: int
    start_index: int
    url: str
    title: str


class TextContentItem(BaseModel):
    type: Union[Literal["text"], Literal["input_text"], Literal["output_text"]]
    text: str
    status: Optional[str] = "completed"
    annotations: Optional[list[UrlCitation]] = None


class SummaryTextContentItem(BaseModel):
    # using summary for compatibility with the existing API
    type: Literal["summary_text"]
    text: str


class ReasoningTextContentItem(BaseModel):
    type: Literal["reasoning_text"]
    text: str


class ReasoningItem(BaseModel):
    id: str = "rs_1234"
    type: Literal["reasoning"]
    summary: list[SummaryTextContentItem]
    content: Optional[list[ReasoningTextContentItem]] = []


class Item(BaseModel):
    id: Optional[str] = None
    type: Optional[Literal["message"]] = "message"
    role: Literal["user", "assistant", "system"]
    content: Union[list[TextContentItem], str]
    status: Union[Literal["in_progress", "completed", "incomplete"], None] = None


class FunctionCallItem(BaseModel):
    type: Literal["function_call"]
    name: str
    arguments: str
    status: Literal["in_progress", "completed", "incomplete"] = "completed"
    id: str = "fc_1234"
    call_id: str = "call_1234"


class FunctionCallOutputItem(BaseModel):
    type: Literal["function_call_output"]
    call_id: str = "call_1234"
    output: str


class WebSearchActionSearch(BaseModel):
    type: Literal["search"]
    query: Optional[str] = None


class WebSearchActionOpenPage(BaseModel):
    type: Literal["open_page"]
    url: Optional[str] = None


class WebSearchActionFind(BaseModel):
    type: Literal["find"]
    pattern: Optional[str] = None
    url: Optional[str] = None


class WebSearchCallItem(BaseModel):
    type: Literal["web_search_call"]
    id: str = "ws_1234"
    status: Literal["in_progress", "completed", "incomplete"] = "completed"
    action: Union[WebSearchActionSearch, WebSearchActionOpenPage, WebSearchActionFind]


class CodeInterpreterOutputLogs(BaseModel):
    type: Literal["logs"]
    logs: str


class CodeInterpreterOutputImage(BaseModel):
    type: Literal["image"]
    url: str


class CodeInterpreterCallItem(BaseModel):
    type: Literal["code_interpreter_call"]
    id: str = "ci_1234"
    status: Literal[
        "in_progress",
        "completed",
        "incomplete",
        "interpreting",
        "failed",
    ] = "completed"
    code: Optional[str] = None
    container_id: Optional[str] = None
    outputs: Optional[
        list[Union[CodeInterpreterOutputLogs, CodeInterpreterOutputImage]]
    ] = None


class Error(BaseModel):
    code: str
    message: str


class IncompleteDetails(BaseModel):
    reason: str


class Usage(BaseModel):
    input_tokens: int
    output_tokens: int
    total_tokens: int


class FunctionToolDefinition(BaseModel):
    type: Literal["function"]
    name: str
    parameters: dict  # this should be typed stricter if you add strict mode
    strict: bool = False  # change this if you support strict mode
    description: Optional[str] = ""


class BrowserToolConfig(BaseModel):
    model_config = ConfigDict(extra='allow')
    type: Literal["browser_search"] | Literal["web_search"]


class CodeInterpreterToolConfig(BaseModel):
    type: Literal["code_interpreter"]


class ReasoningConfig(BaseModel):
    effort: Literal["low", "medium", "high"] = REASONING_EFFORT


class ResponsesRequest(BaseModel):
    instructions: Optional[str] = None
    max_output_tokens: Optional[int] = DEFAULT_MAX_OUTPUT_TOKENS
    input: Union[
        str,
        list[
            Union[
                Item,
                ReasoningItem,
                FunctionCallItem,
                FunctionCallOutputItem,
                WebSearchCallItem,
                CodeInterpreterCallItem,
            ]
        ],
    ]
    model: Optional[str] = MODEL_IDENTIFIER
    stream: Optional[bool] = False
    tools: Optional[
        list[
            Union[FunctionToolDefinition, BrowserToolConfig, CodeInterpreterToolConfig]
        ]
    ] = []
    reasoning: Optional[ReasoningConfig] = ReasoningConfig()
    metadata: Optional[Dict[str, Any]] = {}
    tool_choice: Optional[Literal["auto", "none"]] = "auto"
    parallel_tool_calls: Optional[bool] = False
    store: Optional[bool] = False
    previous_response_id: Optional[str] = None
    temperature: Optional[float] = DEFAULT_TEMPERATURE
    include: Optional[list[str]] = None


class ResponseObject(BaseModel):
    output: list[
        Union[
            Item,
            ReasoningItem,
            FunctionCallItem,
            FunctionCallOutputItem,
            WebSearchCallItem,
            CodeInterpreterCallItem,
        ]
    ]
    created_at: int
    usage: Optional[Usage] = None
    status: Literal["completed", "failed", "incomplete", "in_progress"] = "in_progress"
    background: None = None
    error: Optional[Error] = None
    incomplete_details: Optional[IncompleteDetails] = None
    instructions: Optional[str] = None
    max_output_tokens: Optional[int] = None
    max_tool_calls: Optional[int] = None
    metadata: Optional[Dict[str, Any]] = {}
    model: Optional[str] = MODEL_IDENTIFIER
    parallel_tool_calls: Optional[bool] = False
    previous_response_id: Optional[str] = None
    id: Optional[str] = "resp_1234"
    object: Optional[str] = "response"
    text: Optional[Dict[str, Any]] = None
    tool_choice: Optional[str] = "auto"
    top_p: Optional[int] = 1


================================================
FILE: gpt_oss/responses_api/utils.py
================================================
import time

fake_tokens = [
    200005,
    35644,
    200008,
    23483,
    316,
    1199,
    1114,
    717,
    170154,
    13,
    200007,
    200006,
    173781,
    200005,
    35644,
    316,
    28,
    44580,
    775,
    170154,
    464,
    91,
    542,
    141043,
    91,
    29,
    4108,
    200008,
    10848,
    7693,
    7534,
    28499,
    18826,
    18583,
    200012,
]
fake_tokens = [
    200005,
    35644,
    200008,
    1844,
    31064,
    25,
    392,
    4827,
    382,
    220,
    17,
    659,
    220,
    17,
    16842,
    12295,
    81645,
    13,
    51441,
    6052,
    13,
    200007,
    200006,
    173781,
    200005,
    17196,
    200008,
    17,
    659,
    220,
    17,
    314,
    220,
    19,
    13,
    9552,
    238,
    242,
    200002,
]
# fake_tokens = [200005, 35644, 200008, 976, 1825, 31064, 25, 392, 25216, 29400, 290, 11122, 306, 52768, 2117, 16842, 1416, 1309, 316, 2281, 198, 68, 290, 2208, 11122, 13, 1416, 679, 261, 1114, 717, 170154, 484, 44390, 261, 5100, 1621, 26, 581, 1757, 2005, 198, 75, 480, 483, 5100, 392, 137956, 2117, 11, 13180, 4050, 7801, 4733, 290, 11122, 5377, 484, 290, 1114, 7377, 13, 1416, 1309, 260, 198, 78, 1199, 290, 1114, 4584, 364, 58369, 2421, 717, 170154, 483, 5100, 392, 137956, 2117, 11, 13180, 4050, 200007, 200006, 173781, 200005, 12606, 815, 260, 198, 78, 28, 117673, 3490]
# fake_tokens = [
#     198,
#     200005,
#     35644,
#     200008,
#     23483,
#     316,
#     1199,
#     1114,
#     717,
#     170154,
#     13,
#     200007,
#     200006,
#     173781,
#     200005,
#     12606,
#     815,
#     316,
#     32455,
#     106847,
#     316,
#     28,
#     44580,
#     775,
#     170154,
#     464,
#     91,
#     542,
#     141043,
#     91,
#     29,
#     4108,
#     200008,
#     10848,
#     7693,
#     7534,
#     28499,
#     18826,
#     18583,
#     200012,
#     198,
# ]

token_queue = fake_tokens.copy()


def stub_infer_next_token(tokens: list[int], temperature: float = 0.0) -> int:
    global token_queue
    next_tok = token_queue.pop(0)
    if len(token_queue) == 0:
        token_queue = fake_tokens.copy()
    time.sleep(0.1)
    return next_tok


================================================
FILE: gpt_oss/tokenizer.py
================================================
import tiktoken

def get_tokenizer():
    o200k_base = tiktoken.get_encoding("o200k_base")
    tokenizer = tiktoken.Encoding(
        name="o200k_harmony",
        pat_str=o200k_base._pat_str,
        mergeable_ranks=o200k_base._mergeable_ranks,
        special_tokens={
            **o200k_base._special_tokens,
            "<|startoftext|>": 199998,
            "<|endoftext|>": 199999,
            "<|reserved_200000|>": 200000,
            "<|reserved_200001|>": 200001,
            "<|return|>": 200002,
            "<|constrain|>": 200003,
            "<|reserved_200004|>": 200004,
            "<|channel|>": 200005,
            "<|start|>": 200006,
            "<|end|>": 200007,
            "<|message|>": 200008,
            "<|reserved_200009|>": 200009,
            "<|reserved_200010|>": 200010,
            "<|reserved_200011|>": 200011,
            "<|call|>": 200012,
        } | {
            f"<|reserved_{i}|>": i for i in range(200013, 201088)
        },
    )
    return tokenizer


================================================
FILE: gpt_oss/tools/__init__.py
================================================


================================================
FILE: gpt_oss/tools/apply_patch.md
================================================
When requested to perform coding-related tasks, you MUST adhere to the following criteria when executing the task:

- Use `apply_patch` to edit files.
- If completing the user's task requires writing or modifying files:
  - Your code and final answer should follow these _CODING GUIDELINES_:
    - Avoid unneeded complexity in your solution. Minimize program size.
    - Keep changes consistent with the style of the existing codebase. Changes should be minimal and focused on the task.
    - NEVER add copyright or license headers unless specifically requested.
- Never implement function stubs. Provide complete working implementations.

§ `apply_patch` Specification

Your patch language is a stripped‑down, file‑oriented diff format designed to be easy to parse and safe to apply. You can think of it as a high‑level envelope:

*** Begin Patch
[ one or more file sections ]
*** End Patch

Within that envelope, you get a sequence of file operations.
You MUST include a header to specify the action you are taking.
Each operation starts with one of three headers:

*** Add File: <path> - create a new file. Every following line is a + line (the initial contents).
*** Delete File: <path> - remove an existing file. Nothing follows.
*** Update File: <path> - patch an existing file in place (optionally with a rename).

May be immediately followed by *** Move to: <new path> if you want to rename the file.
Then one or more “hunks”, each introduced by @@ (optionally followed by a hunk header).
Within a hunk each line starts with:

- for inserted text,

* for removed text, or
  space ( ) for context.
  At the end of a truncated hunk you can emit *** End of File.

Patch := Begin { FileOp } End
Begin := "*** Begin Patch" NEWLINE
End := "*** End Patch" NEWLINE
FileOp := AddFile | DeleteFile | UpdateFile
AddFile := "*** Add File: " path NEWLINE { "+" line NEWLINE }
DeleteFile := "*** Delete File: " path NEWLINE
UpdateFile := "*** Update File: " path NEWLINE [ MoveTo ] { Hunk }
MoveTo := "*** Move to: " newPath NEWLINE
Hunk := "@@" [ header ] NEWLINE { HunkLine } [ "*** End of File" NEWLINE ]
HunkLine := (" " | "-" | "+") text NEWLINE

A full patch can combine several operations:

*** Begin Patch
*** Add File: hello.txt
+Hello world
*** Update File: src/app.py
*** Move to: src/main.py
@@ def greet():
-print("Hi")
+print("Hello, world!")
*** Delete File: obsolete.txt
*** End Patch

It is important to remember:

- You must include a header with your intended action (Add/Delete/Update)
- You must prefix new lines with `+` even when creating a new file


================================================
FILE: gpt_oss/tools/apply_patch.py
================================================
#!/usr/bin/env python3

"""
A self-contained **pure-Python 3.9+** utility for applying human-readable
“pseudo-diff” patch files to a collection of text files.

Source: https://cookbook.openai.com/examples/gpt4-1_prompting_guide
"""

from __future__ import annotations

import pathlib
from dataclasses import dataclass, field
from enum import Enum
from typing import (
    Callable,
    Dict,
    List,
    Optional,
    Tuple,
    Union,
)


# --------------------------------------------------------------------------- #
#  Domain objects
# --------------------------------------------------------------------------- #
class ActionType(str, Enum):
    ADD = "add"
    DELETE = "delete"
    UPDATE = "update"


@dataclass
class FileChange:
    type: ActionType
    old_content: Optional[str] = None
    new_content: Optional[str] = None
    move_path: Optional[str] = None


@dataclass
class Commit:
    changes: Dict[str, FileChange] = field(default_factory=dict)


# --------------------------------------------------------------------------- #
#  Exceptions
# --------------------------------------------------------------------------- #
class DiffError(ValueError):
    """Any problem detected while parsing or applying a patch."""


# --------------------------------------------------------------------------- #
#  Helper dataclasses used while parsing patches
# --------------------------------------------------------------------------- #
@dataclass
class Chunk:
    orig_index: int = -1
    del_lines: List[str] = field(default_factory=list)
    ins_lines: List[str] = field(default_factory=list)


@dataclass
class PatchAction:
    type: ActionType
    new_file: Optional[str] = None
    chunks: List[Chunk] = field(default_factory=list)
    move_path: Optional[str] = None


@dataclass
class Patch:
    actions: Dict[str, PatchAction] = field(default_factory=dict)


# --------------------------------------------------------------------------- #
#  Patch text parser
# --------------------------------------------------------------------------- #
@dataclass
class Parser:
    current_files: Dict[str, str]
    lines: List[str]
    index: int = 0
    patch: Patch = field(default_factory=Patch)
    fuzz: int = 0

    # ------------- low-level helpers -------------------------------------- #
    def _cur_line(self) -> str:
        if self.index >= len(self.lines):
            raise DiffError("Unexpected end of input while parsing patch")
        return self.lines[self.index]

    @staticmethod
    def _norm(line: str) -> str:
        """Strip CR so comparisons work for both LF and CRLF input."""
        return line.rstrip("\r")

    # ------------- scanning convenience ----------------------------------- #
    def is_done(self, prefixes: Optional[Tuple[str, ...]] = None) -> bool:
        if self.index >= len(self.lines):
            return True
        if (
            prefixes
            and len(prefixes) > 0
            and self._norm(self._cur_line()).startswith(prefixes)
        ):
            return True
        return False

    def startswith(self, prefix: Union[str, Tuple[str, ...]]) -> bool:
        return self._norm(self._cur_line()).startswith(prefix)

    def read_str(self, prefix: str) -> str:
        """
        Consume the current line if it starts with *prefix* and return the text
        **after** the prefix.  Raises if prefix is empty.
        """
        if prefix == "":
            raise ValueError("read_str() requires a non-empty prefix")
        if self._norm(self._cur_line()).startswith(prefix):
            text = self._cur_line()[len(prefix) :]
            self.index += 1
            return text
        return ""

    def read_line(self) -> str:
        """Return the current raw line and advance."""
        line = self._cur_line()
        self.index += 1
        return line

    # ------------- public entry point -------------------------------------- #
    def parse(self) -> None:
        while not self.is_done(("*** End Patch",)):
            # ---------- UPDATE ---------- #
            path = self.read_str("*** Update File: ")
            if path:
                if path in self.patch.actions:
                    raise DiffError(f"Duplicate update for file: {path}")
                move_to = self.read_str("*** Move to: ")
                if path not in self.current_files:
                    raise DiffError(f"Update File Error - missing file: {path}")
                text = self.current_files[path]
                action = self._parse_update_file(text)
                action.move_path = move_to or None
                self.patch.actions[path] = action
                continue

            # ---------- DELETE ---------- #
            path = self.read_str("*** Delete File: ")
            if path:
                if path in self.patch.actions:
                    raise DiffError(f"Duplicate delete for file: {path}")
                if path not in self.current_files:
                    raise DiffError(f"Delete File Error - missing file: {path}")
                self.patch.actions[path] = PatchAction(type=ActionType.DELETE)
                continue

            # ---------- ADD ---------- #
            path = self.read_str("*** Add File: ")
            if path:
                if path in self.patch.actions:
                    raise DiffError(f"Duplicate add for file: {path}")
                if path in self.current_files:
                    raise DiffError(f"Add File Error - file already exists: {path}")
                self.patch.actions[path] = self._parse_add_file()
                continue

            raise DiffError(f"Unknown line while parsing: {self._cur_line()}")

        if not self.startswith("*** End Patch"):
            raise DiffError("Missing *** End Patch sentinel")
        self.index += 1  # consume sentinel

    # ------------- section parsers ---------------------------------------- #
    def _parse_update_file(self, text: str) -> PatchAction:
        action = PatchAction(type=ActionType.UPDATE)
        lines = text.split("\n")
        index = 0
        while not self.is_done(
            (
                "*** End Patch",
                "*** Update File:",
                "*** Delete File:",
                "*** Add File:",
                "*** End of File",
            )
        ):
            def_str = self.read_str("@@ ")
            section_str = ""
            if not def_str and self._norm(self._cur_line()) == "@@":
                section_str = self.read_line()

            if not (def_str or section_str or index == 0):
                raise DiffError(f"Invalid line in update section:\n{self._cur_line()}")

            if def_str.strip():
                found = False
                if def_str not in lines[:index]:
                    for i, s in enumerate(lines[index:], index):
                        if s == def_str:
                            index = i + 1
                            found = True
                            break
                if not found and def_str.strip() not in [
                    s.strip() for s in lines[:index]
                ]:
                    for i, s in enumerate(lines[index:], index):
                        if s.strip() == def_str.strip():
                            index = i + 1
                            self.fuzz += 1
                            found = True
                            break

            next_ctx, chunks, end_idx, eof = peek_next_section(self.lines, self.index)
            new_index, fuzz = find_context(lines, next_ctx, index, eof)
            if new_index == -1:
                ctx_txt = "\n".join(next_ctx)
                raise DiffError(
                    f"Invalid {'EOF ' if eof else ''}context at {index}:\n{ctx_txt}"
                )
            self.fuzz += fuzz
            for ch in chunks:
                ch.orig_index += new_index
                action.chunks.append(ch)
            index = new_index + len(next_ctx)
            self.index = end_idx
        return action

    def _parse_add_file(self) -> PatchAction:
        lines: List[str] = []
        while not self.is_done(
            ("*** End Patch", "*** Update File:", "*** Delete File:", "*** Add File:")
        ):
            s = self.read_line()
            if not s.startswith("+"):
                raise DiffError(f"Invalid Add File line (missing '+'): {s}")
            lines.append(s[1:])  # strip leading '+'
        return PatchAction(type=ActionType.ADD, new_file="\n".join(lines))


# --------------------------------------------------------------------------- #
#  Helper functions
# --------------------------------------------------------------------------- #
def find_context_core(
    lines: List[str], context: List[str], start: int
) -> Tuple[int, int]:
    if not context:
        return start, 0

    for i in range(start, len(lines)):
        if lines[i : i + len(context)] == context:
            return i, 0
    for i in range(start, len(lines)):
        if [s.rstrip() for s in lines[i : i + len(context)]] == [
            s.rstrip() for s in context
        ]:
            return i, 1
    for i in range(start, len(lines)):
        if [s.strip() for s in lines[i : i + len(context)]] == [
            s.strip() for s in context
        ]:
            return i, 100
    return -1, 0


def find_context(
    lines: List[str], context: List[str], start: int, eof: bool
) -> Tuple[int, int]:
    if eof:
        new_index, fuzz = find_context_core(lines, context, len(lines) - len(context))
        if new_index != -1:
            return new_index, fuzz
        new_index, fuzz = find_context_core(lines, context, start)
        return new_index, fuzz + 10_000
    return find_context_core(lines, context, start)


def peek_next_section(
    lines: List[str], index: int
) -> Tuple[List[str], List[Chunk], int, bool]:
    old: List[str] = []
    del_lines: List[str] = []
    ins_lines: List[str] = []
    chunks: List[Chunk] = []
    mode = "keep"
    orig_index = index

    while index < len(lines):
        s = lines[index]
        if s.startswith(
            (
                "@@",
                "*** End Patch",
                "*** Update File:",
                "*** Delete File:",
                "*** Add File:",
                "*** End of File",
            )
        ):
            break
        if s == "***":
            break
        if s.startswith("***"):
            raise DiffError(f"Invalid Line: {s}")
        index += 1

        last_mode = mode
        if s == "":
            s = " "
        if s[0] == "+":
            mode = "add"
        elif s[0] == "-":
            mode = "delete"
        elif s[0] == " ":
            mode = "keep"
        else:
            raise DiffError(f"Invalid Line: {s}")
        s = s[1:]

        if mode == "keep" and last_mode != mode:
            if ins_lines or del_lines:
                chunks.append(
                    Chunk(
                        orig_index=len(old) - len(del_lines),
                        del_lines=del_lines,
                        ins_lines=ins_lines,
                    )
                )
            del_lines, ins_lines = [], []

        if mode == "delete":
            del_lines.append(s)
            old.append(s)
        elif mode == "add":
            ins_lines.append(s)
        elif mode == "keep":
            old.append(s)

    if ins_lines or del_lines:
        chunks.append(
            Chunk(
                orig_index=len(old) - len(del_lines),
                del_lines=del_lines,
                ins_lines=ins_lines,
            )
        )

    if index < len(lines) and lines[index] == "*** End of File":
        index += 1
        return old, chunks, index, True

    if index == orig_index:
        raise DiffError("Nothing in this section")
    return old, chunks, index, False


# --------------------------------------------------------------------------- #
#  Patch → Commit and Commit application
# --------------------------------------------------------------------------- #
def _get_updated_file(text: str, action: PatchAction, path: str) -> str:
    if action.type is not ActionType.UPDATE:
        raise DiffError("_get_updated_file called with non-update action")
    orig_lines = text.split("\n")
    dest_lines: List[str] = []
    orig_index = 0

    for chunk in action.chunks:
        if chunk.orig_index > len(orig_lines):
            raise DiffError(
                f"{path}: chunk.orig_index {chunk.orig_index} exceeds file length"
            )
        if orig_index > chunk.orig_index:
            raise DiffError(
                f"{path}: overlapping chunks at {orig_index} > {chunk.orig_index}"
            )

        dest_lines.extend(orig_lines[orig_index : chunk.orig_index])
        orig_index = chunk.orig_index

        dest_lines.extend(chunk.ins_lines)
        orig_index += len(chunk.del_lines)

    dest_lines.extend(orig_lines[orig_index:])
    return "\n".join(dest_lines)


def patch_to_commit(patch: Patch, orig: Dict[str, str]) -> Commit:
    commit = Commit()
    for path, action in patch.actions.items():
        if action.type is ActionType.DELETE:
            commit.changes[path] = FileChange(
                type=ActionType.DELETE, old_content=orig[path]
            )
        elif action.type is ActionType.ADD:
            if action.new_file is None:
                raise DiffError("ADD action without file content")
            commit.changes[path] = FileChange(
                type=ActionType.ADD, new_content=action.new_file
            )
        elif action.type is ActionType.UPDATE:
            new_content = _get_updated_file(orig[path], action, path)
            commit.changes[path] = FileChange(
                type=ActionType.UPDATE,
                old_content=orig[path],
                new_content=new_content,
                move_path=action.move_path,
            )
    return commit


# --------------------------------------------------------------------------- #
#  User-facing helpers
# --------------------------------------------------------------------------- #
def text_to_patch(text: str, orig: Dict[str, str]) -> Tuple[Patch, int]:
    lines = text.splitlines()  # preserves blank lines, no strip()
    if (
        len(lines) < 2
        or not Parser._norm(lines[0]).startswith("*** Begin Patch")
        or Parser._norm(lines[-1]) != "*** End Patch"
    ):
        raise DiffError("Invalid patch text - missing sentinels")

    parser = Parser(current_files=orig, lines=lines, index=1)
    parser.parse()
    return parser.patch, parser.fuzz


def identify_files_needed(text: str) -> List[str]:
    lines = text.splitlines()
    return [
        line[len("*** Update File: ") :]
        for line in lines
        if line.startswith("*** Update File: ")
    ] + [
        line[len("*** Delete File: ") :]
        for line in lines
        if line.startswith("*** Delete File: ")
    ]


def identify_files_added(text: str) -> List[str]:
    lines = text.splitlines()
    return [
        line[len("*** Add File: ") :]
        for line in lines
        if line.startswith("*** Add File: ")
    ]


# --------------------------------------------------------------------------- #
#  File-system helpers
# --------------------------------------------------------------------------- #
def load_files(paths: List[str], open_fn: Callable[[str], str]) -> Dict[str, str]:
    return {path: open_fn(path) for path in paths}


def apply_commit(
    commit: Commit,
    write_fn: Callable[[str, str], None],
    remove_fn: Callable[[str], None],
) -> None:
    for path, change in commit.changes.items():
        if change.type is ActionType.DELETE:
            remove_fn(path)
        elif change.type is ActionType.ADD:
            if change.new_content is None:
                raise DiffError(f"ADD change for {path} has no content")
            write_fn(path, change.new_content)
        elif change.type is ActionType.UPDATE:
            if change.new_content is None:
                raise DiffError(f"UPDATE change for {path} has no new content")
            target = change.move_path or path
            write_fn(target, change.new_content)
            if change.move_path:
                remove_fn(path)


def open_file(path: str) -> str:
    with open(path, "rt", encoding="utf-8") as fh:
        return fh.read()


def write_file(path: str, content: str) -> None:
    target = pathlib.Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    with target.open("wt", encoding="utf-8") as fh:
        fh.write(content)


def remove_file(path: str) -> None:
    pathlib.Path(path).unlink(missing_ok=True)


def apply_patch(
    text: str,
    open_fn: Callable[[str], str] = open_file,
    write_fn: Callable[[str, str], None] = write_file, 
    remove_fn: Callable[[str], None] = remove_file,
) -> str:
    if not text.startswith("*** Begin Patch"):
        raise DiffError("Patch text must start with *** Begin Patch")
    paths = identify_files_needed(text)
    orig = load_files(paths, open_fn)
    patch, _fuzz = text_to_patch(text, orig)
    commit = patch_to_commit(patch, orig)
    apply_commit(commit, write_fn, remove_fn)
    return "Done!"


def main() -> None:
    import sys

    patch_text = sys.stdin.read()
    if not patch_text:
        print("Please pass patch text through stdin", file=sys.stderr)
        return
    try:
        result = apply_patch(patch_text)
    except DiffError as exc:
        print(exc, file=sys.stderr)
        return
    print(result)


if __name__ == "__main__":
    main()


================================================
FILE: gpt_oss/tools/python_docker/docker_tool.py
================================================
# Run this before running the tool:
# $ docker image pull python:3.11
import asyncio
import contextlib
import io
import os
import queue
import subprocess
import tarfile
import tempfile
from pathlib import Path
from typing import Any, AsyncIterator

import docker
from openai_harmony import (
    Author,
    Content,
    Message,
    Role,
    TextContent,
    ToolNamespaceConfig,
)

from ..tool import Tool

_docker_client = None

VALID_EXECUTION_BACKENDS = {
    "docker",
    "dangerously_use_uv",
    "dangerously_use_local_jupyter",
}

_default_backend = os.environ.get("PYTHON_EXECUTION_BACKEND", "docker")
if _default_backend not in VALID_EXECUTION_BACKENDS:
    _default_backend = "docker"

PYTHON_EXECUTION_BACKEND = _default_backend


def call_python_script(script: str) -> str:
    """
    Call a python script by writing it to a file in the container and executing it.
    """
    global _docker_client
    if _docker_client is None:
        _docker_client = docker.from_env()
        # pull image `python:3.11` if not present
        try:
            _docker_client.images.get("python:3.11")
        except docker.errors.ImageNotFound:
            _docker_client.images.pull("python:3.11")

    # 1. Create a temporary tar archive containing the script
    script_name = "script.py"
    tarstream = io.BytesIO()
    with tarfile.open(fileobj=tarstream, mode="w") as tar:
        script_bytes = script.encode("utf-8")
        tarinfo = tarfile.TarInfo(name=script_name)
        tarinfo.size = len(script_bytes)
        tar.addfile(tarinfo, io.BytesIO(script_bytes))
    tarstream.seek(0)

    # 2. Start the container
    container = _docker_client.containers.create(
        "python:3.11", command="sleep infinity", detach=True
    )
    try:
        container.start()
        # 3. Put the script into the container
        container.put_archive(path="/tmp", data=tarstream.read())
        # 4. Execute the script
        exec_result = container.exec_run(f"python /tmp/{script_name}")
        output = exec_result.output.decode("utf-8")
        if not output.strip():
            output = "[WARN] No output available. Use print() to output anything to stdout to receive the output"
    finally:
        container.remove(force=True)
    return output


def call_python_script_with_uv(script: str) -> str:
    """
    Call a python script by writing it to a file to a temporary directory
    and executing it with uv.
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        script_path = os.path.join(temp_dir, "script.py")
        with open(script_path, "w") as f:
            f.write(script)
        exec_result = subprocess.run(
            ["uv", "run", "--no-project", "python", script_path],
            capture_output=True)
        return (
            exec_result.stdout.decode("utf-8")
            if exec_result.returncode == 0
            else exec_result.stderr.decode("utf-8")
        )


class LocalJupyterSession:
    """Stateful helper that proxies execution through a local Jupyter kernel."""

    def __init__(
        self,
        connection_file: str | None = None,
        *,
        timeout: float = 120.0,
    ) -> None:
        try:
            from jupyter_client import BlockingKernelClient, KernelManager
        except ImportError as exc:  # pragma: no cover - optional dependency
            raise RuntimeError(
                "The dangerously_use_local_jupyter backend requires the jupyter_client package to be installed."
            ) from exc

        self._default_timeout = timeout
        self._owns_kernel = False
        self._client: BlockingKernelClient
        self._km: KernelManager | None = None

        if connection_file:
            connection_path = Path(connection_file).expanduser()
            if not connection_path.exists():
                raise FileNotFoundError(
                    f"Cannot find Jupyter connection file at '{connection_path}'."
                )
            client = BlockingKernelClient()
            client.load_connection_file(str(connection_path))
            client.start_channels()
            # Ensure the connection is ready before executing.
            client.wait_for_ready(timeout=self._default_timeout)
            self._client = client
        else:
            km = KernelManager()
            km.start_kernel()
            client = km.blocking_client()
            client.start_channels()
            client.wait_for_ready(timeout=self._default_timeout)
            self._client = client
            self._km = km
            self._owns_kernel = True

    def execute(self, code: str, *, timeout: float | None = None) -> str:
        """Execute code in the kernel, returning combined stdout/stderr output."""

        client = self._client
        effective_timeout = timeout or self._default_timeout
        msg_id = client.execute(
            code,
            store_history=True,
            allow_stdin=False,
            stop_on_error=False,
        )

        stdout_parts: list[str] = []
        stderr_parts: list[str] = []

        while True:
            try:
                msg = client.get_iopub_msg(timeout=effective_timeout)
            except queue.Empty as exc:
                raise TimeoutError("Timed out waiting for Jupyter kernel output.") from exc

            if msg.get("parent_header", {}).get("msg_id") != msg_id:
                continue

            msg_type = msg.get("msg_type")
            content = msg.get("content", {})

            if msg_type == "stream":
                text = content.get("text", "")
                if content.get("name") == "stdout":
                    stdout_parts.append(text)
                else:
                    stderr_parts.append(text)
            elif msg_type == "error":
                traceback_data = content.get("traceback")
                if traceback_data:
                    stderr_parts.append("\n".join(traceback_data))
                else:
                    ename = content.get("ename", "")
                    evalue = content.get("evalue", "")
                    stderr_parts.append(f"{ename}: {evalue}".strip())
            elif msg_type in {"execute_result", "display_data"}:
                data = content.get("data", {})
                text = data.get("text/plain")
                if text:
                    stdout_parts.append(text if text.endswith("\n") else f"{text}\n")
            elif msg_type == "status" and content.get("execution_state") == "idle":
                break

        # Drain the shell channel to capture final execution status.
        while True:
            try:
                reply = client.get_shell_msg(timeout=effective_timeout)
            except queue.Empty as exc:
                raise TimeoutError(
                    "Timed out waiting for Jupyter kernel execution reply."
                ) from exc

            if reply.get("parent_header", {}).get("msg_id") != msg_id:
                continue

            reply_content = reply.get("content", {})
            if reply_content.get("status") == "error":
                traceback_data = reply_content.get("traceback")
                if traceback_data:
                    stderr_parts.append("\n".join(traceback_data))
                else:
                    ename = reply_content.get("ename", "")
                    evalue = reply_content.get("evalue", "")
                    stderr_parts.append(f"{ename}: {evalue}".strip())
            break

        stdout = "".join(stdout_parts)
        stderr = "".join(stderr_parts)

        if stderr:
            if stdout:
                stdout = f"{stdout.rstrip()}\n{stderr}"
            else:
                stdout = stderr

        if not stdout.strip():
            stdout = (
                "[WARN] No output available. Use print() to output anything to stdout to "
                "receive the output"
            )

        return stdout

    def close(self) -> None:
        with contextlib.suppress(Exception):
            self._client.stop_channels()

        if self._owns_kernel and self._km is not None:
            with contextlib.suppress(Exception):
                self._km.shutdown_kernel(now=True)

    def __del__(self) -> None:  # pragma: no cover - best-effort cleanup
        self.close()

class PythonTool(Tool):
    def __init__(
        self,
        name: str = "python",
        *,
        execution_backend: str | None = None,
        local_jupyter_connection_file: str | None = None,
        local_jupyter_timeout: float = 60.0,
    ):
        assert name == "python"

        backend = execution_backend or PYTHON_EXECUTION_BACKEND
        if backend not in VALID_EXECUTION_BACKENDS:
            raise ValueError(
                "execution_backend must be one of: "
                + ", ".join(sorted(VALID_EXECUTION_BACKENDS))
            )

        self._execution_backend = backend
        self._local_jupyter_connection_file = (
            local_jupyter_connection_file
            or os.environ.get("PYTHON_LOCAL_JUPYTER_CONNECTION_FILE")
        )
        self._local_jupyter_timeout = local_jupyter_timeout

        self._jupyter_session: LocalJupyterSession | None = None
        self._execution_lock: asyncio.Lock | None = None

        if self._execution_backend == "dangerously_use_local_jupyter":
            self._execution_lock = asyncio.Lock()
            self._jupyter_session = LocalJupyterSession(
                connection_file=self._local_jupyter_connection_file,
                timeout=self._local_jupyter_timeout,
            )

    @classmethod
    def get_tool_name(cls) -> str:
        return "python"

    @property
    def name(self) -> str:
        return self.get_tool_name()

    @property
    def instruction(self) -> str:
        if self._execution_backend == "dangerously_use_local_jupyter":
            return """
Use this tool to execute Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).
When you send a message containing Python code to python, it will be executed in a stateful Jupyter notebook environment. python will respond with the output of the execution or time out after 120.0 seconds. Internet access for this session is UNKNOWN. Depends on the cluster.
            """.strip()

        return """
Use this tool to execute STATELESS Python code in your chain of thought. The code will not be shown to the user. This tool should be used for internal reasoning, but not for code that is intended to be visible to the user (e.g. when creating plots, tables, or files).
When you send a message containing python code to python, it will be executed in a stateless docker container, and the stdout of that process will be returned to you. You have to use print statements to access the output.

IMPORTANT: Your python environment is not shared between calls. You will have to pass your entire code each time.
        """.strip()

    @property
    def tool_config(self) -> ToolNamespaceConfig:
        return ToolNamespaceConfig(
            name=self.get_tool_name(), description=self.instruction, tools=[]
        )

    def _make_response(
        self,
        output: str,
        channel: str | None = None,
    ) -> Message:
        content = TextContent(text=output)
        return self.make_response(content=content, channel=channel)

    def make_response(
        self,
        content: Content,
        *,
        metadata: dict[str, Any] | None = None,
        author: Author | None = None,
        channel: str | None = None,
    ) -> Message:
        tool_name = self.get_tool_name()
        author = Author(role=Role.TOOL, name=f"{tool_name}")

        message = Message(
            author=author,
            content=[content],
        ).with_recipient("assistant")

        if channel:
            message = message.with_channel(channel)

        return message

    async def _process(self, message: Message) -> AsyncIterator[Message]:
        script = message.content[0].text
        channel = message.channel

        if self._execution_backend == "docker":
            output = call_python_script(script)
        elif self._execution_backend == "dangerously_use_uv":
            output = call_python_script_with_uv(script)
        elif self._execution_backend == "dangerously_use_local_jupyter":
            assert self._jupyter_session is not None
            lock = self._execution_lock
            if lock is not None:
                async with lock:
                    try:
                        output = self._jupyter_session.execute(script)
                    except TimeoutError as exc:
                        output = f"[ERROR] {exc}"
            else:
                try:
                    output = self._jupyter_session.execute(script)
                except TimeoutError as exc:
                    output = f"[ERROR] {exc}"
        else:
            raise ValueError(
                f"Invalid PYTHON_EXECUTION_BACKEND: {self._execution_backend}"
            )
        yield self._make_response(output, channel=channel)

    def close(self) -> None:
        if self._jupyter_session is not None:
            self._jupyter_session.close()

    def __del__(self) -> None:  # pragma: no cover - best-effort cleanup
        self.close()


================================================
FILE: gpt_oss/tools/simple_browser/__init__.py
================================================
from .simple_browser_tool import SimpleBrowserTool
from .backend import ExaBackend, YouComBackend

__all__ = [
    "SimpleBrowserTool",
    "ExaBackend",
    "YouComBackend",
]


================================================
FILE: gpt_oss/tools/simple_browser/backend.py
================================================
"""
Simple backend for the simple browser tool.
"""

import functools
import asyncio
import logging
import os
from abc import abstractmethod
from importlib.metadata import version
from typing import Callable, ParamSpec, TypeVar
from urllib.parse import quote

import chz
from aiohttp import ClientSession, ClientTimeout
from tenacity import (
    after_log,
    before_sleep_log,
    retry,
    retry_if_exception_type,
    stop_after_attempt,
    wait_exponential,
)

from .page_contents import (
    Extract,
    FetchResult,
    PageContents,
    get_domain,
    process_html,
)

logger = logging.getLogger(__name__)


VIEW_SOURCE_PREFIX = "view-source:"

try:
    _GPT_OSS_VERSION = version("gpt-oss")
except Exception:
    _GPT_OSS_VERSION = "0.0.8"  # fallback version


class BackendError(Exception):
    pass


P = ParamSpec("P")
R = TypeVar("R")


def with_retries(
    func: Callable[P, R],
    num_retries: int,
    max_wait_time: float,
) -> Callable[P, R]:
    if num_retries > 0:
        retry_decorator = retry(
            stop=stop_after_attempt(num_retries),
            wait=wait_exponential(
                multiplier=1,
                min=2,
                max=max_wait_time,
            ),
            before_sleep=before_sleep_log(logger, logging.INFO),
            after=after_log(logger, logging.INFO),
            retry=retry_if_exception_type(Exception),
        )
        return retry_decorator(func)
    else:
        return func


def maybe_truncate(text: str, num_chars: int = 1024) -> str:
    if len(text) > num_chars:
        text = text[: (num_chars - 3)] + "..."
    return text


@chz.chz(typecheck=True)
class Backend:
    source: str = chz.field(doc="Description of the backend source")

    @abstractmethod
    async def search(
        self,
        query: str,
        topn: int,
        session: ClientSession,
    ) -> PageContents:
        pass

    @abstractmethod
    async def fetch(self, url: str, session: ClientSession) -> PageContents:
        pass

    async def _post(self, session: ClientSession, endpoint: str, payload: dict) -> dict:
        headers = {
            "x-api-key": self._get_api_key(),
            "user-agent": f"gpt-oss/{_GPT_OSS_VERSION}",
        }
        async with session.post(f"{self.BASE_URL}{endpoint}", json=payload, headers=headers) as resp:
            if resp.status != 200:
                raise BackendError(
                    f"{self.__class__.__name__} error {resp.status}: {await resp.text()}"
                )
            return await resp.json()

    async def _get(self, session: ClientSession, endpoint: str, params: dict) -> dict:
        headers = {
            "x-api-key": self._get_api_key(),
            "user-agent": f"gpt-oss/{_GPT_OSS_VERSION}",
        }
        async with session.get(f"{self.BASE_URL}{endpoint}", params=params, headers=headers) as resp:
            if resp.status != 200:
                raise BackendError(
                    f"{self.__class__.__name__} error {resp.status}: {await resp.text()}"
                )
            return await resp.json()


@chz.chz(typecheck=True)
class ExaBackend(Backend):
    """Backend that uses the Exa Search API."""

    source: str = chz.field(doc="Description of the backend source")
    api_key: str | None = chz.field(
        doc="Exa API key. Uses EXA_API_KEY environment variable if not provided.",
        default=None,
    )

    BASE_URL: str = "https://api.exa.ai"

    def _get_api_key(self) -> str:
        key = self.api_key or os.environ.get("EXA_API_KEY")
        if not key:
            raise BackendError("Exa API key not provided")
        return key


    async def search(
        self, query: str, topn: int, session: ClientSession
    ) -> PageContents:
        data = await self._post(
            session,
            "/search",
            {"query": query, "numResults": topn, "contents": {"text": True, "summary": True}},
        )
        # make a simple HTML page to work with browser format
        titles_and_urls = [
            (result["title"], result["url"], result["summary"])
            for result in data["results"]
        ]
        html_page = f"""
<html><body>
<h1>Search Results</h1>
<ul>
{"".join([f"<li><a href='{url}'>{title}</a> {summary}</li>" for title, url, summary in titles_and_urls])}
</ul>
</body></html>
"""

        return process_html(
            html=html_page,
            url="",
            title=query,
            display_urls=True,
            session=session,
        )

    async def fetch(self, url: str, session: ClientSession) -> PageContents:
        is_view_source = url.startswith(VIEW_SOURCE_PREFIX)
        if is_view_source:
            url = url[len(VIEW_SOURCE_PREFIX) :]
        data = await self._post(
            session,
            "/contents",
            {"urls": [url], "text": { "includeHtmlTags": True }},
        )
        results = data.get("results", [])
        if not results:
            raise BackendError(f"No contents returned for {url}")
        return process_html(
            html=results[0].get("text", ""),
            url=url,
            title=results[0].get("title", ""),
            display_urls=True,
            session=session,
        )

@chz.chz(typecheck=True)
class YouComBackend(Backend):
    """Backend that uses the You.com Search API."""

    source: str = chz.field(doc="Description of the backend source")

    BASE_URL: str = "https://api.ydc-index.io"

    def _get_api_key(self) -> str:
        key = os.environ.get("YDC_API_KEY")
        if not key:
            raise BackendError("You.com API key not provided")
        return key

    
    async def search(
        self, query: str, topn: int, session: ClientSession
    ) -> PageContents:
        data = await self._get(
            session,
            "/v1/search",
            {"query": query, "count": topn},
        )
        # make a simple HTML page to work with browser format
        web_titles_and_urls, news_titles_and_urls = [], []
        if "web" in data["results"]:
            web_titles_and_urls = [
                (result["title"], result["url"], result["snippets"])
                for result in data["results"]["web"]
            ]
        if "news" in data["results"]:
            news_titles_and_urls = [
                (result["title"], result["url"], result["description"])
                for result in data["results"]["news"]
            ]
        titles_and_urls = web_titles_and_urls + news_titles_and_urls
        html_page = f"""
<html><body>
<h1>Search Results</h1>
<ul>
{"".join([f"<li><a href='{url}'>{title}</a> {summary}</li>" for title, url, summary in titles_and_urls])}
</ul>
</body></html>
"""

        return process_html(
            html=html_page,
            url="",
            title=query,
            display_urls=True,
            session=session,
        )

    async def fetch(self, url: str, session: ClientSession) -> PageContents:
        is_view_source = url.startswith(VIEW_SOURCE_PREFIX)
        if is_view_source:
            url = url[len(VIEW_SOURCE_PREFIX) :]
        data = await self._post(
            session,
            "/v1/contents",
            {"urls": [url], "livecrawl_formats": "html"},
        )
        if not data:
            raise BackendError(f"No contents returned for {url}")
        if "html" not in data[0]:
            raise BackendError(f"No HTML returned for {url}")
        return process_html(
            html=data[0].get("html", ""),
            url=url,
            title=data[0].get("title", ""),
            display_urls=True,
            session=session,
        )


================================================
FILE: gpt_oss/tools/simple_browser/page_contents.py
================================================
"""
Page contents for the simple browser tool.
"""

from __future__ import annotations

import dataclasses
import functools
import logging
import re
from urllib.parse import urljoin, urlparse

import aiohttp
import html2text
import lxml
import lxml.etree
import lxml.html
import pydantic

import tiktoken

logger = logging.getLogger(__name__)


HTML_SUP_RE = re.compile(r"<sup( [^>]*)?>([\w\-]+)</sup>")
HTML_SUB_RE = re.compile(r"<sub( [^>]*)?>([\w\-]+)</sub>")
HTML_TAGS_SEQ_RE = re.compile(r"(?<=\w)((<[^>]*>)+)(?=\w)")
WHITESPACE_ANCHOR_RE = re.compile(r"(【\@[^】]+】)(\s+)")
EMPTY_LINE_RE = re.compile(r"^\s+$", flags=re.MULTILINE)
EXTRA_NEWLINE_RE = re.compile(r"\n(\s*\n)+")


class Extract(pydantic.BaseModel):  # A search result snippet or a quotable extract
    url: str
    text: str
    title: str
    line_idx: int | None = None


class FetchResult(pydantic.BaseModel):
    url: str
    success: bool
    title: str | None = None
    error_type: str | None = None
    error_message: str | None = None
    html: str | None = None
    raw_content: bytes | None = None
    plaintext: str | None = None


class PageContents(pydantic.BaseModel):
    url: str
    text: str
    title: str
    urls: dict[str, str]
    snippets: dict[str, Extract] | None = None
    error_message: str | None = None


@dataclasses.dataclass(frozen=True)
class Tokens:
    tokens: list[int]
    tok2idx: list[int]  # Offsets = running sum of lengths.


def get_domain(url: str) -> str:
    """Extracts the domain from a URL."""
    if "http" not in url:
        # If `get_domain` is called on a domain, add a scheme so that the
        # original domain is returned instead of the empty string.
        url = "http://" + url
    return urlparse(url).netloc


def multiple_replace(text: str, replacements: dict[str, str]) -> str:
    """Performs multiple string replacements using regex pass."""
    regex = re.compile("(%s)" % "|".join(map(re.escape, replacements.keys())))
    return regex.sub(lambda mo: replacements[mo.group(1)], text)


@functools.lru_cache(maxsize=1024)
def mark_lines(text: str) -> str:
    """Adds line numbers (ex: 'L0:') to the beginning of each line in a string."""
    # Split the string by newline characters
    lines = text.split("\n")

    # Add lines numbers to each line and join into a single string
    numbered_text = "\n".join([f"L{i}: {line}" for i, line in enumerate(lines)])
    return numbered_text


@functools.cache
def _tiktoken_vocabulary_lengths(enc_name: str) -> list[int]:
    """Gets the character lengths of all tokens in the specified TikToken vocabulary."""
    encoding = tiktoken.get_encoding(enc_name)
    return [len(encoding.decode([i])) for i in range(encoding.n_vocab)]


def warmup_caches(enc_names: list[str]) -> None:
    """Warm up the cache by computing token length lists for the given TikToken encodings."""
    for _ in map(_tiktoken_vocabulary_lengths, enc_names):
        pass


def _replace_special_chars(text: str) -> str:
    """Replaces specific special characters with visually similar alternatives."""
    replacements = {
        "【": "〖",
        "】": "〗",
        "◼": "◾",
        # "━": "─",
        "\u200b": "",  # zero width space
        # Note: not replacing †
    }
    return multiple_replace(text, replacements)


def merge_whitespace(text: str) -> str:
    """Replace newlines with spaces and merge consecutive whitespace into a single space."""
    text = text.replace("\n", " ")
    text = re.sub(r"\s+", " ", text)
    return text


def arxiv_to_ar5iv(url: str) -> str:
    """Converts an arxiv.org URL to its ar5iv.org equivalent."""
    return re.sub(r"arxiv.org", r"ar5iv.org", url)


def _clean_links(root: lxml.html.HtmlElement, cur_url: str) -> dict[str, str]:
    """Processes all anchor tags in the HTML, replaces them with a custom format and returns an ID-to-URL mapping."""
    cur_domain = get_domain(cur_url)
    urls: dict[str, str] = {}
    urls_rev: dict[str, str] = {}
    for a in root.findall(".//a[@href]"):
        assert a.getparent() is not None
        link = a.attrib["href"]
        if link.startswith(("mailto:", "javascript:")):
            continue
        text = _get_text(a).replace("†", "‡")
        if not re.sub(r"【\@([^】]+)】", "", text):  # Probably an image
            continue
        if link.startswith("#"):
            replace_node_with_text(a, text)
            continue
        try:
            link = urljoin(cur_url, link)  # works with both absolute and relative links
            domain = get_domain(link)
        except Exception:
            domain = ""
        if not domain:
            logger.debug("SKIPPING LINK WITH URL %s", link)
            continue
        link = arxiv_to_ar5iv(link)
        if (link_id := urls_rev.get(link)) is None:
            link_id = f"{len(urls)}"
            urls[link_id] = link
            urls_rev[link] = link_id
        if domain == cur_domain:
            replacement = f"【{link_id}†{text}】"
        else:
            replacement = f"【{link_id}†{text}†{domain}】"
        replace_node_with_text(a, replacement)
    return urls


def _get_text(node: lxml.html.HtmlElement) -> str:
    """Extracts all text from an HTML element and merges it into a whitespace-normalized string."""
    return merge_whitespace(" ".join(node.itertext()))


def _remove_node(node: lxml.html.HtmlElement) -> None:
    """Removes a node from its parent in the lxml tree."""
    node.getparent().remove(node)


def _escape_md(text: str) -> str:
    return text


def _escape_md_section(text: str, snob: bool = False) -> str:
    return text


def html_to_text(html: str) -> str:
    """Converts an HTML string to clean plaintext."""
    html = re.sub(HTML_SUP_RE, r"^{\2}", html)
    html = re.sub(HTML_SUB_RE, r"_{\2}", html)
    # add spaces between tags such as table cells
    html = re.sub(HTML_TAGS_SEQ_RE, r" \1", html)
    # we don't need to escape markdown, so monkey-patch the logic
    orig_escape_md = html2text.utils.escape_md
    orig_escape_md_section = html2text.utils.escape_md_section
    html2text.utils.escape_md = _escape_md
    html2text.utils.escape_md_section = _escape_md_section
    h = html2text.HTML2Text()
    h.ignore_links = True
    h.ignore_images = True
    h.body_width = 0  # no wrapping
    h.ignore_tables = True
    h.unicode_snob = True
    h.ignore_emphasis = True
    result = h.handle(html).strip()
    html2text.utils.escape_md = orig_escape_md
    html2text.utils.escape_md_section = orig_escape_md_section
    return result


def _remove_math(root: lxml.html.HtmlElement) -> None:
    """Removes all <math> elements from the lxml tree."""
    for node in root.findall(".//math"):
        _remove_node(node)


def remove_unicode_smp(text: str) -> str:
    """Removes Unicode characters in the Supplemental Multilingual Plane (SMP) from `text`.

    SMP characters are not supported by lxml.html processing.
    """
    smp_pattern = re.compile(r"[\U00010000-\U0001FFFF]", re.UNICODE)
    return smp_pattern.sub("", text)


def replace_node_with_text(node: lxml.html.HtmlElement, text: str) -> None:
    """Replaces an lxml node with a text string while preserving surrounding text."""
    previous = node.getprevious()
    parent = node.getparent()
    tail = node.tail or ""
    if previous is None:
        parent.text = (parent.text or "") + text + tail
    else:
        previous.tail = (previous.tail or "") + text + tail
    parent.remove(node)


def replace_images(
    root: lxml.html.HtmlElement,
    base_url: str,
    session: aiohttp.ClientSession | None,
) -> None:
    """Finds all image tags and replaces them with numbered placeholders (includes alt/title if available)."""
    cnt = 0
    for img_tag in root.findall(".//img"):
        image_name = img_tag.get("alt", img_tag.get("title"))
        if image_name:
            replacement = f"[Image {cnt}: {image_name}]"
        else:
            replacement = f"[Image {cnt}]"
        replace_node_with_text(img_tag, replacement)
        cnt += 1


def process_html(
    html: str,
    url: str,
    title: str | None,
    session: aiohttp.ClientSession | None = None,
    display_urls: bool = False,
) -> PageContents:
    """Convert HTML into model-readable version."""
    html = remove_unicode_smp(html)
    html = _replace_special_chars(html)
    root = lxml.html.fromstring(html)

    # Parse the title.
    title_element = root.find(".//title")
    if title:
        final_title = title
    elif title_element is not None:
        final_title = title_element.text or ""
    elif url and (domain := get_domain(url)):
        final_title = domain
    else:
        final_title = ""

    urls = _clean_links(root, url)
    replace_images(
        root=root,
        base_url=url,
        session=session,
    )
    _remove_math(root)
    clean_html = lxml.etree.tostring(root, encoding="UTF-8").decode()
    text = html_to_text(clean_html)
    text = re.sub(WHITESPACE_ANCHOR_RE, lambda m: m.group(2) + m.group(1), text)
    # ^^^ move anchors to the right thru whitespace
    # This way anchors don't create extra whitespace
    text = re.sub(EMPTY_LINE_RE, "", text)
    # ^^^ Get rid of empty lines
    text = re.sub(EXTRA_NEWLINE_RE, "\n\n", text)
    # ^^^ Get rid of extra newlines

    top_parts = []
    if display_urls:
        top_parts.append(f"\nURL: {url}\n")
    # NOTE: Publication date is currently not extracted due
    # to performance costs.

    return PageContents(
        url=url,
        text="".join(top_parts) + text,
        urls=urls,
        title=final_title,
    )


================================================
FILE: gpt_oss/tools/simple_browser/simple_browser_tool.py
================================================
import contextvars
import dataclasses
import functools
import itertools
import json
import re
import textwrap
from typing import Any, AsyncIterator, Callable, ParamSpec, Sequence
from urllib.parse import quote, unquote

import pydantic
import structlog
import tiktoken
from aiohttp import ClientSession
from openai_harmony import (
    Author,
    Content,
    Message,
    Role,
    TextContent,
    ToolNamespaceConfig
)

from ..tool import Tool

# from functions import Function, from_python
from .backend import (
    VIEW_SOURCE_PREFIX,
    Backend,
    BackendError,
    maybe_truncate,
)
from .page_contents import Extract, PageContents

logger = structlog.stdlib.get_logger(component=__name__)


# TODO(zhuohan): Use the correct encoding at release
ENC_NAME = "o200k_base"
FIND_PAGE_LINK_FORMAT = "# 【{idx}†{title}】"
PARTIAL_INITIAL_LINK_PATTERN = re.compile(r"^[^【】]*】")
PARTIAL_FINAL_LINK_PATTERN = re.compile(
    r"【\d*(?:†(?P<content>[^†】]*)(?:†[^†】]*)?)?$"
)
LINK_PATTERN = re.compile(r"【\d+†(?P<content>[^†】]+)(?:†[^†】]+)?】")

CITATION_OUTPUT_PATTERN = re.compile(r"【(?P<cursor>\d+)†(?P<content>[^†】]+)(?:†[^†】]+)?】")

CallParams = ParamSpec("CallParams")


_P = ParamSpec("_P")
_live_function_name = contextvars.ContextVar[str]("_live_function_name")


class ToolUsageError(Exception):
    pass


def function_the_model_can_call(
    fn: Callable[_P, AsyncIterator[Message]],
) -> Callable[_P, AsyncIterator[Message]]:
    fn.__fn_calling_tool_fn_type__ = "function_the_model_can_call"  # type: ignore

    @functools.wraps(fn)
    async def inner(*args: _P.args, **kwargs: _P.kwargs) -> AsyncIterator[Message]:
        token = _live_function_name.set(fn.__name__)
        try:
            async for m in fn(*args, **kwargs):
                yield m
        finally:
            _live_function_name.reset(token)

    return inner


@functools.cache
def _tiktoken_vocabulary_lengths(enc_name: str) -> list[int]:
    encoding = tiktoken.get_encoding(enc_name)
    results = []
    for i in range(encoding.n_vocab):
        try:
            results.append(len(encoding.decode([i])))
        except Exception as e:
            results.append(1)
    return results


@dataclasses.dataclass(frozen=True)
class Tokens:
    tokens: list[int]
    tok2idx: list[int]  # Offsets = running sum of lengths.


@functools.cache
def max_chars_per_token(enc_name: str) -> int:
    """Typical value is 128, but let's be safe."""
    tok_lens = _tiktoken_vocabulary_lengths(enc_name)
    return max(tok_lens)


def get_tokens(text: str, enc_name: str) -> Tokens:
    encoding = tiktoken.get_encoding(enc_name)
    tokens = encoding.encode(text, disallowed_special=())
    _vocabulary_lengths = _tiktoken_vocabulary_lengths(enc_name)
    tok2idx = [0] + list(itertools.accumulate(_vocabulary_lengths[i] for i in tokens))[
        :-1
    ]
    result = Tokens(tokens=tokens, tok2idx=tok2idx)
    return result


def get_end_loc(
    loc: int,
    num_lines: int,
    total_lines: int,
    lines: list[str],
    view_tokens: int,
    encoding_name: str,
) -> int:
    if num_lines <= 0:
        # COMPUTE NUMBER OF LINES TO SHOW
        txt = join_lines(lines[loc:], add_line_numbers=True, offset=loc)
        # if the text is very short, no need to truncate at all
        # at least one char per token
        if len(txt) > view_tokens:
            # limit the amount of text we tokenize here
            upper_bound = max_chars_per_token(encoding_name)
            tok2idx = get_tokens(
                txt[: (view_tokens + 1) * upper_bound], encoding_name
            ).tok2idx
            if len(tok2idx) > view_tokens:
                end_idx = tok2idx[view_tokens]
                num_lines = txt[:end_idx].count("\n") + 1  # round up
            else:
                num_lines = total_lines
        else:
            num_lines = total_lines

    return min(loc + num_lines, total_lines)


def get_page_metadata(
    curr_page: PageContents,
) -> dict[str, str | None | dict[str, str] | list[str]]:
    """Some attributes of the current page."""
    page_metadata: dict[str, str | None | dict[str, str] | list[str]] = {
        "url": curr_page.url,
        "title": curr_page.title,
    }
    return page_metadata


def join_lines(
    lines: list[str], add_line_numbers: bool = False, offset: int = 0
) -> str:
    if add_line_numbers:
        return "\n".join([f"L{i + offset}: {line}" for i, line in enumerate(lines)])
    else:
        return "\n".join(lines)


def wrap_lines(text: str, width: int = 80) -> list[str]:
    lines = text.split("\n")
    wrapped = itertools.chain.from_iterable(
        (
            textwrap.wrap(
                line, width=width, replace_whitespace=False, drop_whitespace=False
            )
            if line
            else [""]
        )  # preserve empty lines
        for line in lines
    )
    return list(wrapped)


def strip_links(text: str) -> str:
    text = re.sub(PARTIAL_INITIAL_LINK_PATTERN, "", text)
    text = re.sub(PARTIAL_FINAL_LINK_PATTERN, lambda mo: mo.group("content"), text)
    text = re.sub(LINK_PATTERN, lambda mo: mo.group("content"), text)
    return text


def maybe_get_function_args(
    message: Message, tool_name: str = "browser"
) -> dict[str, Any] | None:
    if not message.recipient.startswith(f"{tool_name}."):
        return None

    contents = ""
    if len(message.content) == 1 and isinstance(message.content[0], TextContent):
        contents = message.content[0].text

    if not contents:
        return {}

    try:
        parsed_contents = json.loads(contents)
        if isinstance(parsed_contents, dict):
            return parsed_contents
    except json.JSONDecodeError:
        pass

    return None


async def run_find_in_page(
    pattern: str,
    page: PageContents,
    max_results: int = 50,
    num_show_lines: int = 4,
) -> PageContents:
    lines = wrap_lines(text=page.text)
    txt = join_lines(lines, add_line_numbers=False)
    without_links = strip_links(txt)
    lines = without_links.split("\n")

    result_chunks, snippets = [], []
    line_idx, match_idx = 0, 0
    while line_idx < len(lines):
        line = lines[line_idx]
        if pattern not in line.lower():
            line_idx += 1
            continue
        snippet = "\n".join(lines[line_idx : line_idx + num_show_lines])
        link_title = FIND_PAGE_LINK_FORMAT.format(
            idx=f"{match_idx}", title=f"match at L{line_idx}"
        )
        result_chunks.append(f"{link_title}\n{snippet}")
        snippets.append(
            Extract(
                url=page.url, text=snippet, title=f"#{match_idx}", line_idx=line_idx
            )
        )
        if len(result_chunks) == max_results:
            break
        match_idx += 1
        line_idx += num_show_lines

    urls = [page.url for _ in result_chunks]

    if result_chunks:
        display_text = "\n\n".join(result_chunks)
    else:
        display_text = f"No `find` results for pattern: `{pattern}`"

    result_page = PageContents(
        url=f"{page.url}/find?pattern={quote(pattern)}",
        title=f"Find results for text: `{pattern}` in `{page.title}`",
        text=display_text,
        urls={str(i): url for i, url in enumerate(urls)},
        snippets={str(i): snip for i, snip in enumerate(snippets)},
    )
    return result_page


def handle_errors(
    func: Callable[CallParams, AsyncIterator["Message"]],
) -> Callable[CallParams, AsyncIterator["Message"]]:
    @functools.wraps(func)
    async def inner(
        *args: CallParams.args, **kwargs: CallParams.kwargs
    ) -> AsyncIterator[Message]:
        tool = args[0]
        # Could be cool to type this explicitly, but mypy makes it hard
        assert isinstance(tool, SimpleBrowserTool)
        try:
            async for msg in func(*args, **kwargs):
                yield msg
        except (ToolUsageError, BackendError) as e:
            yield tool.make_error_message(e)

    return inner


class SimpleBrowserState(pydantic.BaseModel):
    # maps page url to page contents
    pages: dict[str, PageContents] = pydantic.Field(default_factory=dict)
    # a sequential list of page urls
    page_stack: list[str] = pydantic.Field(default_factory=list)

    @property
    def current_cursor(self) -> int:
        return len(self.page_stack) - 1

    def add_page(self, page: PageContents) -> None:
        self.pages[page.url] = page
        self.page_stack.append(page.url)

    def get_page(self, cursor: int = -1) -> PageContents:
        if self.current_cursor < 0:
            raise ToolUsageError("No pages to access!")
        if cursor == -1 or cursor == self.current_cursor:
            return self.pages[self.page_stack[-1]]
        try:
            page_url = self.page_stack[cursor]
        except TypeError as e:
            raise ToolUsageError(
                f"`cursor` should be an integer, not `{type(cursor).__name__}`"
            ) from e
        except IndexError as e:
            raise ToolUsageError(
                f"Cursor `{cursor}` is out of range. "
                f"Available cursor indices: [0 - {self.current_cursor}]."
            ) from e
        return self.pages[page_url]

    def get_page_by_url(self, url: str) -> PageContents | None:
        if url in self.pages:
            return self.pages[url]
        return None

    def pop_page_stack(self) -> None:
        assert self.current_cursor >= 0, "No page to pop!"
        self.page_stack.pop()


class SimpleBrowserTool(Tool):
    def __init__(
        self,
        backend: Backend,
        encoding_name: str = ENC_NAME,
        max_search_results: int = 20,
        tool_state: dict[str, Any] | None = None,
        view_tokens: int = 1024,
        name: str = "browser",
    ):
        assert name == "browser"
        self.backend = backend
        if tool_state is None:
            self.tool_state = SimpleBrowserState()
        else:
            self.tool_state = SimpleBrowserState.model_validate(tool_state)

        self.encoding_name = encoding_name
        self.max_search_results = max_search_results
        self.view_tokens = view_tokens

    def get_tool_state(self) -> dict[str, Any]:
        return {"tool_state": self.tool_state.model_dump()}

    @classmethod
    def get_tool_name(cls) -> str:
        return "browser"

    @property
    def name(self) -> str:
        return self.get_tool_name()

    @property
    def tool_config(self) -> ToolNamespaceConfig:
        config = ToolNamespaceConfig.browser()
        config.name = self.name
        config.description = """Tool for browsing.
The `cursor` appears in brackets before each browsing display: `[{cursor}]`.
Cite information from the tool using the following format:
`【{cursor}†L{line_start}(-L{line_end})?】`, for example: `【6†L9-L11】` or `【8†L3】`.
Do not quote more than 10 words directly from the tool output.
sources=""" + self.backend.source
        return config

    @property
    def instruction(self) -> str:
        return self.tool_config.description

    def _render_browsing_display(
        self,
        tether_id: int,
        result: str,
        summary: str | None = None,
    ):
        to_return = ""
        # Always show summaries.
        if summary:
            to_return += summary
        to_return += result
        to_return = f"[{tether_id}] {to_return}"
        return to_return

    def _make_response(
        self,
        page: PageContents,
        cursor: int,
        body: str,
        scrollbar: str,
    ) -> Message:
        domain = maybe_truncate(unquote(page.url))
        header = f"{page.title}"
        if domain:
            header += f" ({domain})"
        header += f"\n**{scrollbar}**\n\n"

        content = TextContent(text=self._render_browsing_display(cursor, body, header))
        return self.make_response(
            content=content, metadata=get_page_metadata(self.tool_state.get_page())
        )

    async def show_page(self, loc: int = 0, num_lines: int = -1) -> Message:
        page = self.tool_state.get_page()
        cursor = self.tool_state.current_cursor
        lines = wrap_lines(text=page.text)
        total_lines = len(lines)

        if loc >= total_lines:
            err_msg = (
                f"Invalid location parameter: `{loc}`. "
                f"Cannot exceed page maximum of {total_lines - 1}."
            )
            raise ToolUsageError(err_msg)

        end_loc = get_end_loc(
            loc, num_lines, total_lines, lines, self.view_tokens, self.encoding_name
        )

        lines_to_show = lines[loc:end_loc]
        body = join_lines(lines_to_show, add_line_numbers=True, offset=loc)

        scrollbar = f"viewing lines [{loc} - {end_loc - 1}] of {total_lines - 1}"
        return self._make_response(page, cursor, body, scrollbar)

    async def show_page_safely(self, loc: int = 0, num_lines: int = -1) -> Message:
        try:
            return await self.show_page(loc=loc, num_lines=num_lines)
        except ToolUsageError as e:
            self.tool_state.pop_page_stack()
            raise e

    async def _open_url(self, url: str, direct_url_open: bool) -> PageContents:
        """Use the cache, if available."""
        backend = self.backend
        # direct_url_open should be regarded as a refresh
        if not direct_url_open and (page := self.tool_state.get_page_by_url(url)):
            assert page.url == url
            return page

        try:
            async with ClientSession() as session:
                page = await backend.fetch(url, session=session)
            return page
        except Exception as e:
            msg = maybe_truncate(str(e))
            logger.warning("Error fetching URL in lean browser tool", exc_info=e)
            raise BackendError(
                f"Error fetching URL `{maybe_truncate(url)}`: {msg}"
            ) from e

    def make_error_message(self, error: Exception) -> Message:
        """Uses the message creation codepath from the base class."""
        error_name = error.__class__.__name__
        content = TextContent(text=str(error))
        return self.make_response(content=content)

    @function_the_model_can_call
    @handle_errors
    async def search(
        self,
        query: str,
        topn: int = 10,
        top_n: int = 10,
        source: str | None = None,
    ) -> AsyncIterator[Message]:
        del topn
        del top_n
        try:
            async with ClientSession() as session:
                search_page = await self.backend.search(
                    query=query,
                    topn=self.max_search_results,
                    session=session,
                )
        except Exception as e:
            msg = maybe_truncate(str(e))
            raise BackendError(f"Error during search for `{query}`: {msg}") from e

        self.tool_state.add_page(search_page)
        yield await self.show_page_safely(loc=0)

    @function_the_model_can_call
    @handle_errors
    async def open(
        self,
        id: int | str = -1,
        cursor: int = -1,
        loc: int = -1,
        num_lines: int = -1,
        view_source: bool = False,
        source: str | None = None,
    ) -> AsyncIterator[Message]:
        curr_page: PageContents | None = None
        stay_on_current_page = False
        direct_url_open = False
        if isinstance(id, str):
            snippet = None
            url = id
            direct_url_open = True
        else:  # Operate on a previously opened page
            curr_page = self.tool_state.get_page(cursor)

            if id >= 0:  # click a link
                try:
                    url = curr_page.urls[str(id)]
                except KeyError as e:
                    raise ToolUsageError(f"Invalid link id `{id}`.") from e
                snippet = (curr_page.snippets or {}).get(str(id))
                if snippet and curr_page.url == "":
                    # current page is a search result page
                    assert isinstance(snippet, Extract)
            else:  # navigate to new position on the current page
                if not view_source:
                    stay_on_current_page = True
                url = curr_page.url
                snippet = None

        new_page: PageContents
        if view_source:
            url = f"{VIEW_SOURCE_PREFIX}{url}"
            snippet = None
        if stay_on_current_page:
            assert curr_page is not None
            new_page = curr_page
        else:
            new_page = await self._open_url(url, direct_url_open)

        self.tool_state.add_page(new_page)

        if loc < 0:  # unset
            if snippet is not None and snippet.line_idx is not None:
                loc = snippet.line_idx
                if loc > 4:
                    loc -= 4
            else:
                loc = 0
        yield await self.show_page_safely(loc=loc, num_lines=num_lines)

    @function_the_model_can_call
    @handle_errors
    async def find(self, pattern: str, cursor: int = -1) -> AsyncIterator[Message]:
        page = self.tool_state.get_page(cursor)
        if page.snippets is not None:
            raise ToolUsageError(
                "Cannot run `find` on search results page or find results page"
            )

        pc = await run_find_in_page(
            pattern=str(pattern).lower(),
            page=page,
        )
        self.tool_state.add_page(pc)
        yield await self.show_page_safely(loc=0)

    def make_response(
        self,
        content: Content,
        *,
        metadata: dict[str, Any] | None = None,
        author: Author | None = None,
    ) -> Message:
        """
        Make a response message.

        Should be used from `@function_the_model_can_call` if author is not provided.
        """
        if author is None:
            tool_name = self.get_tool_name()
            function_name = _live_function_name.get()
            assert function_name is not None
            author = Author(role=Role.TOOL, name=f"{tool_name}.{function_name}")

        return Message(
            author=author,
            content=[content],
        ).with_recipient("assistant")

    def process_arguments(self, message: Message) -> dict[str, Any]:
        function_args = maybe_get_function_args(message, tool_name=self.name)
        if function_args is None:
            raise ValueError("Invalid function arguments")

        if "cursor" in function_args and function_args["cursor"] >= 0:
            page = self.tool_state.get_page(cursor=function_args["cursor"])
            if "id" in function_args:
                function_args["url"] = page.urls[str(function_args["id"])]
            else:
                function_args["url"] = page.url
        elif "id" in function_args and isinstance(function_args["id"], str):
            function_args["url"] = function_args["id"]
        return function_args

    async def _process(self, message: Message) -> AsyncIterator[Message]:
        def make_error_message(error: str) -> Message:
            return self.make_response(
                content=TextContent(text=json.dumps({"error": error})),
                author=Author(role=Role.TOOL, name=message.recipient),
            )

        function_args = maybe_get_function_args(message, tool_name=self.name)
        if function_args is None:
            yield make_error_message("Invalid function arguments")
            return

        _, function_name = message.recipient.split(".")
        if function_name not in ["search", "open", "find"]:
            yield make_error_message(f"Unknown function: {function_name}")
            return

        if function_name == "search":
            async for msg in self.search(**function_args):
                yield msg
        elif function_name == "open":
            async for msg in self.open(**function_args):
                yield msg
        elif function_name == "find":
            async for msg in self.find(**function_args):
                yield msg
        else:
            raise ValueError("should not be here")


    def normalize_citations(self, old_content: str, hide_partial_citations: bool = False) -> tuple[str, list[dict[str, Any]], bool]:
        """
        Returns a tuple of (new_message, annotations, has_partial_citations)
        - new_message: Message with citations replaced by ([domain](url))
        - annotations: list of dicts with start_index, end_index, and title (url)
        - has_partial_citations: whether the text includes an unfinished citation
        """

        has_partial_citations = PARTIAL_FINAL_LINK_PATTERN.search(old_content) is not None
        if hide_partial_citations and has_partial_citations:
            old_content = PARTIAL_FINAL_LINK_PATTERN.sub("", old_content)

        matches = []
        for match in CITATION_OUTPUT_PATTERN.finditer(old_content):
            cursor = match.group("cursor")
            content = match.group("content")
            start_idx = match.start()
            end_idx = match.end()
            matches.append({
                "cursor": cursor,
                "content": content,
                "start": start_idx,
                "end": end_idx
            })

        # Build a mapping from cursor to url
        cursor_to_url = {}
        for idx, url in enumerate(self.tool_state.page_stack):
            cursor_to_url[str(idx)] = url

        def extract_domain(url):
            try:
                return unquote(url).split("/")[2]
            except Exception:
                return url

        new_content = ""
        last_idx = 0
        annotations = []
        running_offset = 0  # Offset due to length changes in replacements

        for m in matches:
            cursor = m["cursor"]
            url = cursor_to_url.get(cursor, None)
            orig_start = m["start"]
            orig_end = m["end"]

            # Add text before the citation
            new_content += old_content[last_idx:orig_start]

            if url:
                domain = extract_domain(url)
                replacement = f" ([{domain}]({url})) "
                # The start and end indices in the new content
                start_index = len(new_content)
                end_index = start_index + len(replacement)
                annotations.append({
                    "start_index": start_index,
                    "end_index": end_index,
                    "title": domain,
                    "url": url,
                    "type": "url_citation",
                })
                new_content += replacement
            else:
                # Keep the original citation format if cursor is missing
                replacement = old_content[orig_start:orig_end]
                start_index = len(new_content)
                end_index = start_index + len(replacement)
                # No annotation for missing url, but could add if desired
                new_content += replacement

            last_idx = orig_end

        new_content += old_content[last_idx:]
        return new_content, annotations, has_partial_citations


================================================
FILE: gpt_oss/tools/tool.py
================================================
from abc import ABC, abstractmethod
from uuid import UUID, uuid4
from typing import AsyncIterator

from openai_harmony import (
    Author,
    Role,
    Message,
    TextContent,
)


def _maybe_update_inplace_and_validate_channel(
    *, input_message: Message, tool_message: Message
) -> None:
    # If the channel of a new message produced by tool is different from the originating message,
    # we auto-set the new message's channel, if unset, or raise an error.
    if tool_message.channel != input_message.channel:
        if tool_message.channel is None:
            tool_message.channel = input_message.channel
        else:
            raise ValueError(
                f"Messages from tool should have the same channel ({tool_message.channel=}) as "
                f"the triggering message ({input_message.channel=})."
            )


class Tool(ABC):
    """
    Something the model can call.

    Tools expose APIs that are shown to the model in a syntax that the model
    understands and knows how to call (from training data). Tools allow the
    model to do things like run code, browse the web, etc.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """
        An identifier for the tool. The convention is that a message will be routed to the tool
        whose name matches its recipient field.
        """

    @property
    def output_channel_should_match_input_channel(self) -> bool:
        """
        A flag which indicates whether the output channel of the tool should match the input channel.
        """
        return True

    async def process(self, message: Message) -> AsyncIterator[Message]:
        """
        Process the message and return a list of messages to add to the conversation.
        The input message should already be applicable to this tool.
        Don't return the input message, just the new messages.

        If implementing a tool that has to block while calling a function use `call_on_background_thread` to get a coroutine.

        If you just want to test this use `evaluate_generator` to get the results.

        Do not override this method; override `_process` below (to avoid interfering with tracing).
        """
        async for m in self._process(message):
            if self.output_channel_should_match_input_channel:
                _maybe_update_inplace_and_validate_channel(input_message=message, tool_message=m)
            yield m

    @abstractmethod
    async def _process(self, message: Message) -> AsyncIterator[Message]:
        """Override this method to provide the implementation of the tool."""
        if False:  # This is to convince the type checker that this is an async generator.
            yield  # type: ignore[unreachable]
        _ = message  # Stifle "unused argument" warning.
        raise NotImplementedError

    @abstractmethod
    def instruction(self) -> str:
        """
        Describe the tool's functionality. For example, if it accepts python-formatted code,
        provide documentation on the functions available.
        """
        raise NotImplementedError

    def instruction_dict(self) -> dict[str, str]:
        return {self.name: self.instruction()}

    def error_message(
        self, error_message: str, id: UUID | None = None, channel: str | None = None
    ) -> Message:
        """
        Return an error message that's from this tool.
        """
        return Message(
            id=id if id else uuid4(),
            author=Author(role=Role.TOOL, name=self.name),
            content=TextContent(text=error_message), # TODO: Use SystemError instead
            channel=channel,
        ).with_recipient("assistant")


================================================
FILE: gpt_oss/torch/__init__.py
================================================


================================================
FILE: gpt_oss/torch/model.py
================================================
import json
import math
import os
from dataclasses import dataclass

import torch
import torch.distributed as dist

from gpt_oss.torch.weights import Checkpoint


@dataclass
class ModelConfig:
    num_hidden_layers: int = 36
    num_experts: int = 128
    experts_per_token: int = 4
    vocab_size: int = 201088
    hidden_size: int = 2880
    intermediate_size: int = 2880
    swiglu_limit: float = 7.0
    head_dim: int = 64
    num_attention_heads: int = 64
    num_key_value_heads: int = 8
    sliding_window: int = 128
    initial_context_length: int = 4096
    rope_theta: float = 150000.0
    rope_scaling_factor: float = 32.0
    rope_ntk_alpha: float = 1.0
    rope_ntk_beta: float = 32.0


class RMSNorm(torch.nn.Module):
    def __init__(
        self, num_features: int, eps: float = 1e-05, device: torch.device | None = None
    ):
        super().__init__()
        self.num_features = num_features
        self.eps = eps
        self.scale = torch.nn.Parameter(
            torch.ones(num_features, device=device, dtype=torch.float32)
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        assert x.shape[-1] == self.num_features
        t, dtype = x.float(), x.dtype
        t = t * torch.rsqrt(torch.mean(t**2, dim=-1, keepdim=True) + self.eps)
        return (t * self.scale).to(dtype)


def _apply_rotary_emb(
    x: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
) -> torch.Tensor:
    cos = cos.unsqueeze(-2).to(x.dtype)
    sin = sin.unsqueeze(-2).to(x.dtype)
    x1, x2 = torch.chunk(x, 2, dim=-1)
    o1 = x1 * cos - x2 * sin
    o2 = x2 * cos + x1 * sin
    return torch.cat((o1, o2), dim=-1)


class RotaryEmbedding(torch.nn.Module):
    def __init__(
        self,
        head_dim: int,
        base: int,
        dtype: torch.dtype,
        initial_context_length: int = 4096,
        scaling_factor: float = 1.0,
        ntk_alpha: float = 1.0,
        ntk_beta: float = 32.0,
        device: torch.device | None = None,
    ) -> None:
        super().__init__()
        self.head_dim = head_dim
        self.base = base
        self.dtype = dtype
        self.initial_context_length = initial_context_length
        self.scaling_factor = scaling_factor
        self.ntk_alpha = ntk_alpha
        self.ntk_beta = ntk_beta
        self.device = device

    def _compute_concentration_and_inv_freq(self) -> torch.Tensor:
        """See YaRN paper: https://arxiv.org/abs/2309.00071"""
        freq = self.base ** (
            torch.arange(0, self.head_dim, 2, dtype=torch.float, device=self.device)
            / self.head_dim
        )
        if self.scaling_factor > 1.0:
            concentration = (
                0.1 * math.log(self.scaling_factor) + 1.0
            )  # YaRN concentration

            d_half = self.head_dim / 2
            # NTK by parts
            low = (
                d_half
                * math.log(self.initial_context_length / (self.ntk_beta * 2 * math.pi))
                / math.log(self.base)
            )
            high = (
                d_half
                * math.log(self.initial_context_length / (self.ntk_alpha * 2 * math.pi))
                / math.log(self.base)
            )
            assert 0 < low < high < d_half - 1

            interpolation = 1.0 / (self.scaling_factor * freq)
            extrapolation = 1.0 / freq

            ramp = (
                torch.arange(d_half, dtype=torch.float32, device=freq.device) - low
            ) / (high - low)
            mask = 1 - ramp.clamp(0, 1)

            inv_freq = interpolation * (1 - mask) + extrapolation * mask
        else:
            concentration = 1.0
            inv_freq = 1.0 / freq

        return concentration, inv_freq

    def _compute_cos_sin(self, num_tokens: int):
        concentration, inv_freq = self._compute_concentration_and_inv_freq()
        t = torch.arange(num_tokens, dtype=torch.float32, device=self.device)
        freqs = torch.einsum("i,j->ij", t, inv_freq)
        cos = freqs.cos() * concentration
        sin = freqs.sin() * concentration
        return cos, sin

    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        num_tokens = query.shape[0]
        cos, sin = self._compute_cos_sin(num_tokens)

        query_shape = query.shape
        query = query.view(num_tokens, -1, self.head_dim)
        query = _apply_rotary_emb(query, cos, sin)
        query = query.reshape(query_shape)

        key_shape = key.shape
        key = key.view(num_tokens, -1, self.head_dim)
        key = _apply_rotary_emb(key, cos, sin)
        key = key.reshape(key_shape)
        return query, key


def sdpa(Q, K, V, S, sm_scale, sliding_window=0):
    # sliding_window == 0 means no sliding window
    n_tokens, n_heads, q_mult, d_head = Q.shape
    assert K.shape == (n_tokens, n_heads, d_head)
    assert V.shape == (n_tokens, n_heads, d_head)
    K = K[:, :, None, :].expand(-1, -1, q_mult, -1)
    V = V[:, :, None, :].expand(-1, -1, q_mult, -1)
    S = S.reshape(n_heads, q_mult, 1, 1).expand(-1, -1, n_tokens, -1)
    mask = torch.triu(Q.new_full((n_tokens, n_tokens), -float("inf")), diagonal=1)
    if sliding_window > 0:
        mask += torch.tril(
            mask.new_full((n_tokens, n_tokens), -float("inf")), diagonal=-sliding_window
        )
    QK = torch.einsum("qhmd,khmd->hmqk", Q, K)
    QK *= sm_scale
    QK += mask[None, None, :, :]
    QK = torch.cat([QK, S], dim=-1)
    W = torch.softmax(QK, dim=-1)
    W = W[..., :-1]
    attn = torch.einsum("hmqk,khmd->qhmd", W, V)
    return attn.reshape(n_tokens, -1)


class AttentionBlock(torch.nn.Module):
    def __init__(
        self,
        config: ModelConfig,
        layer_idx: int = 0,
        device: torch.device | None = None,
    ):
        super().__init__()
        self.head_dim = config.head_dim
        self.num_attention_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads
        # Only apply sliding window to every other layer
        self.sliding_window = config.sliding_window if layer_idx % 2 == 0 else 0
        self.sinks = torch.nn.Parameter(
            torch.empty(config.num_attention_heads, device=device, dtype=torch.bfloat16)
        )
        self.norm = RMSNorm(config.hidden_size, device=device)
        qkv_dim = config.head_dim * (
            config.num_attention_heads + 2 * config.num_key_value_heads
        )
        self.qkv = torch.nn.Linear(
            config.hidden_size, qkv_dim, device=device, dtype=torch.bfloat16
        )
        self.out = torch.nn.Linear(
            config.head_dim * config.num_attention_heads,
            config.hidden_size,
            device=device,
            dtype=torch.bfloat16,
        )
        self.sm_scale = 1 / math.sqrt(config.head_dim)
        self.rope = RotaryEmbedding(
            config.head_dim,
            config.rope_theta,
            torch.float32,
            initial_context_length=config.initial_context_length,
            scaling_factor=config.rope_scaling_factor,
            ntk_alpha=config.rope_ntk_alpha,
            ntk_beta=config.rope_ntk_beta,
            device=device,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        t = self.norm(x)
        qkv = self.qkv(t)
        q = qkv[:, : self.num_attention_heads * self.head_dim].contiguous()
        k = qkv[
            :,
            self.num_attention_heads
            * self.head_dim : (self.num_attention_heads + self.num_key_value_heads)
            * self.head_dim,
        ].contiguous()
        v = qkv[
            :,
            (self.num_attention_heads + self.num_key_value_heads)
            * self.head_dim : (self.num_attention_heads + 2 * self.num_key_value_heads)
            * self.head_dim,
        ].contiguous()

        q = q.view(
            -1,
            self.num_key_value_heads,
            self.num_attention_heads // self.num_key_value_heads,
            self.head_dim,
        )
        k = k.view(-1, self.num_key_value_heads, self.head_dim)
        v = v.view(-1, self.num_key_value_heads, self.head_dim)
        q, k = self.rope(q, k)
        t = sdpa(q, k, v, self.sinks, self.sm_scale, self.sliding_window)
        t = self.out(t)
        t = x + t
        return t


def swiglu(x, alpha: float = 1.702, limit: float = 7.0):
    x_glu, x_linear = x[..., ::2], x[..., 1::2]
    # Clamp the input values
    x_glu = x_glu.clamp(min=None, max=limit)
    x_linear = x_linear.clamp(min=-limit, max=limit)
    out_glu = x_glu * torch.sigmoid(alpha * x_glu)
    # Note we add an extra bias of 1 to the linear layer
    return out_glu * (x_linear + 1)


class MLPBlock(torch.nn.Module):
    def __init__(
        self,
        config: ModelConfig,
        device: torch.device | None = None,
    ):
        super().__init__()
        self.num_experts = config.num_experts
        self.experts_per_token = config.experts_per_token
        self.swiglu_limit = config.swiglu_limit
        self.world_size = dist.get_world_size() if dist.is_initialized() else 1
        self.norm = RMSNorm(config.hidden_size, device=device)
        self.gate = torch.nn.Linear(
            config.hidden_size, config.num_experts, device=device, dtype=torch.bfloat16
        )
        assert config.intermediate_size % self.world_size == 0
        self.mlp1_weight = torch.nn.Parameter(
            torch.empty(
                (
                    config.num_experts,
                    config.intermediate_size * 2 // self.world_size,
                    config.hidden_size,
                ),
                device=device,
                dtype=torch.bfloat16,
            )
        )
        self.mlp1_bias = torch.nn.Parameter(
            torch.empty(
                (config.num_experts, config.intermediate_size * 2 // self.world_size),
                device=device,
                dtype=torch.bfloat16,
            )
        )
        self.mlp2_weight = torch.nn.Parameter(
            torch.empty(
                (
                    config.num_experts,
                    config.hidden_size,
                    config.intermediate_size // self.world_size,
                ),
                device=device,
                dtype=torch.bfloat16,
            )
        )
        self.mlp2_bias = torch.nn.Parameter(
            torch.empty(
                (config.num_experts, config.hidden_size),
                device=device,
                dtype=torch.bfloat16,
            )
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        t = self.norm(x)
        g = self.gate(t)
        experts = torch.topk(g, k=self.experts_per_token, dim=-1, sorted=True)
        expert_weights = torch.nn.functional.softmax(experts.values, dim=1)
        expert_indices = experts.indices

        # MLP #1
        mlp1_weight = self.mlp1_weight[expert_indices, ...]
        mlp1_bias = self.mlp1_bias[expert_indices, ...]
        t = torch.einsum("beck,bk->bec", mlp1_weight, t) + mlp1_bias
        t = swiglu(t, limit=self.swiglu_limit)

        # MLP #2
        mlp2_weight = self.mlp2_weight[expert_indices, ...]
        mlp2_bias = self.mlp2_bias[expert_indices, ...]
        t = torch.einsum("beck,bek->bec", mlp2_weight, t)
        if self.world_size > 1:
            dist.all_reduce(t, op=dist.ReduceOp.SUM)
        t += mlp2_bias

        # Weighted sum of experts
        t = torch.einsum("bec,be->bc", t, expert_weights)

        return x + t


class TransformerBlock(torch.nn.Module):
    def __init__(
        self,
        config: ModelConfig,
        layer_idx: int,
        device: torch.device | None = None,
    ):
        super().__init__()
        self.layer_idx = layer_idx
        self.attn = AttentionBlock(config, layer_idx, device)
        self.mlp = MLPBlock(config, device)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.attn(x)
        x = self.mlp(x)
        return x


class Transformer(torch.nn.Module):
    def __init__(
        self,
        config: ModelConfig,
        device: torch.device | None = None,
    ):
        super().__init__()
        self.embedding = torch.nn.Embedding(
            config.vocab_size, config.hidden_size, device=device, dtype=torch.bfloat16
        )
        self.block = torch.nn.ModuleList(
            [
                TransformerBlock(config, layer_idx, device)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = RMSNorm(config.hidden_size, device=device)
        self.unembedding = torch.nn.Linear(
            config.hidden_size,
            config.vocab_size,
            bias=False,
            device=device,
            dtype=torch.bfloat16,
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.embedding(x)
        for block in self.block:
            x = block(x)
        x = self.norm(x)
        x = self.unembedding(x)
        return x

    @staticmethod
    def from_checkpoint(
        path: str, device: str | torch.device = "cuda"
    ) -> "Transformer":
        if not isinstance(device, torch.device):
            device = torch.device(device)

        config_path = os.path.join(path, "config.json")
        with open(config_path, "r") as f:
            json_config = json.load(f)
            config = ModelConfig(**json_config)

        model = Transformer(
            config=config,
            device=device,
        )
        model.eval()

        # Load weights
        my_rank = dist.get_rank() if dist.is_initialized() else 0
        world_size = dist.get_world_size() if dist.is_initialized() else 1
        per_rank_intermediate_size = config.intermediate_size // world_size

        checkpoint = Checkpoint(path, device)

        for name, param in model.named_parameters():
            loaded_tensor = checkpoint.get(name)

            # Note: it would be more efficient to do sharding before upcasting from MXFP4,
            # but for simplicity we do it after.
            if "mlp1" in name:  # both weight and bias
                loaded_tensor = loaded_tensor[
                    :,
                    my_rank * 2
                    * per_rank_intermediate_size : (my_rank + 1) * 2
                    * per_rank_intermediate_size,
                    ...,
                ]
            elif "mlp2_weight" in name:  # only weight
                loaded_tensor = loaded_tensor[
                    ...,
                    my_rank
                    * per_rank_intermediate_size : (my_rank + 1)
                    * per_rank_intermediate_size,
                ]
            try:
                param.data.copy_(loaded_tensor)
            except:
                print(f"{name=} {param.data.shape=} {loaded_tensor.shape=}")
                raise

        return model


class TokenGenerator:
    @torch.inference_mode()
    def __init__(self, checkpoint: str, device: torch.device):
        self.device = device
        self.model = Transformer.from_checkpoint(checkpoint, device=self.device)

    @torch.inference_mode()
    def generate(self,
                 prompt_tokens: list[int],
                 stop_tokens: list[int],
                 temperature: float = 1.0,
                 max_tokens: int = 0,
                 return_logprobs: bool = False):
        tokens = list(prompt_tokens)
        num_generated_tokens = 0
        while max_tokens == 0 or num_generated_tokens < max_tokens:
            logits = self.model(torch.as_tensor(tokens, dtype=torch.int32, device=self.device))[-1]
            if temperature == 0.0:
                predicted_token = torch.argmax(logits, dim=-1).item()
            else:
                probs = torch.softmax(logits * (1.0 / temperature), dim=-1)
                predicted_token = torch.multinomial(probs, num_samples=1).item()
            tokens.append(predicted_token)
            num_generated_tokens += 1

            if return_logprobs:
                logprobs = torch.log_softmax(logits, dim=-1)
                selected_logprobs = logprobs[predicted_token].item()
                yield predicted_token, selected_logprobs
            else:
                yield predicted_token

            if predicted_token in stop_tokens:
                break


================================================
FILE: gpt_oss/torch/utils.py
================================================
import os
import torch
import torch.distributed as dist


def suppress_output(rank):
    """Suppress printing on the current device. Force printing with `force=True`."""
    import builtins as __builtin__
    builtin_print = __builtin__.print

    def print(*args, **kwargs):
        force = kwargs.pop('force', False)
        if force:
            builtin_print("rank #%d:" % rank, *args, **kwargs)
        elif rank == 0:
            builtin_print(*args, **kwargs)

    __builtin__.print = print


def init_distributed() -> torch.device:
    """Initialize the model for distributed inference."""
    # Initialize distributed inference
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    rank = int(os.environ.get("RANK", 0))
    if world_size > 1:
        dist.init_process_group(
            backend="nccl", init_method="env://", world_size=world_size, rank=rank
        )
    torch.cuda.set_device(rank)
    device = torch.device(f"cuda:{rank}")

    # Warm up NCCL to avoid first-time latency
    if world_size > 1:
        x = torch.ones(1, device=device)
        dist.all_reduce(x)
        torch.cuda.synchronize(device)

    suppress_output(rank)
    return device


================================================
FILE: gpt_oss/torch/weights.py
================================================
import math
import os

import torch
from safetensors import safe_open


# Bytes per MXFP4 block: 32 FP4 numbers packed in 16 bytes
BYTES_PER_BLOCK = 16

FP4_VALUES = [
    +0.0, +0.5, +1.0, +1.5, +2.0, +3.0, +4.0, +6.0,
    -0.0, -0.5, -1.0, -1.5, -2.0, -3.0, -4.0, -6.0,
]

# Map the names assumed in this implementation to the checkpoint names.
PARAM_NAME_MAP = {
    f"block.{n}.mlp.mlp1_bias": f"block.{n}.mlp.mlp1_bias" for n in range(36)
} | {
    f"block.{n}.mlp.mlp1_weight": (f"block.{n}.mlp.mlp1_weight.blocks", f"block.{n}.mlp.mlp1_weight.scales") for n in range(36)
} | {
    f"block.{n}.mlp.mlp2_bias": f"block.{n}.mlp.mlp2_bias" for n in range(36)
} | {
    f"block.{n}.mlp.mlp2_weight": (f"block.{n}.mlp.mlp2_weight.blocks", f"block.{n}.mlp.mlp2_weight.scales") for n in range(36)
}


class Checkpoint:
    def __init__(self, path: str, device: torch.device):
        device_str = (
            device.type
            if device.index is None
            else device.type + ":" + str(device.index)
        )
        self.device_str = device_str

        # Read from all files ending with .safetensors in the checkpoint directory
        safetensor_files = [
            os.path.join(path, fname)
            for fname in os.listdir(path)
            if fname.endswith(".safetensors")
        ]
        # Build a mapping from tensor name to (file, key)
        tensor_name_to_file = {}
        for safetensor_file in safetensor_files:
            with safe_open(safetensor_file, framework="pt", device=device_str) as f:
                for key in f.keys():
                    tensor_name_to_file[key] = safetensor_file

        self.tensor_name_to_file = tensor_name_to_file

    def get(self, name: str) -> torch.Tensor:
        match PARAM_NAME_MAP.get(name, name):
            case (blocks_name, scales_name):
                # MoE weights: are in block-based MXFP4 format
                return self._get_mxfp4_tensor(blocks_name, scales_name, dtype=torch.bfloat16)
            case tensor_name:
                # MoE biases and other weights
                return self._get_tensor(tensor_name)

    def _get_tensor(self, name: str) -> str:
        assert name in self.tensor_name_to_file, f"Tensor {name} not found in checkpoint."
        with safe_open(
            self.tensor_name_to_file[name], framework="pt", device=self.device_str
        ) as f:
            return f.get_tensor(name)

    def _get_mxfp4_tensor(
        self,
        blocks_name: str,
        scales_name: str,
        *,
        dtype: torch.dtype = torch.bfloat16,
        rows_per_chunk: int = 16384 * 512,
    ) -> torch.Tensor:
        assert blocks_name in self.tensor_name_to_file, (
            f"Blocks tensor {blocks_name} not found in checkpoint."
        )
        assert scales_name in self.tensor_name_to_file, (
            f"Scales tensor {scales_name} not found in checkpoint."
        )

        blocks = self._get_tensor(blocks_name)
        scales = self._get_tensor(scales_name).to(torch.int32) - 127

        assert blocks.shape[:-1] == scales.shape, (
            f"{blocks.shape=} does not match {scales.shape=}"
        )

        lut = torch.tensor(FP4_VALUES, dtype=dtype, device=blocks.device)

        *prefix_shape, G, B = blocks.shape
        rows_total   = math.prod(prefix_shape) * G

        blocks = blocks.reshape(rows_total, B)
        scales = scales.reshape(rows_total, 1)

        out = torch.empty(rows_total, B * 2, dtype=dtype, device=blocks.device)

        for r0 in range(0, rows_total, rows_per_chunk):
            r1 = min(r0 + rows_per_chunk, rows_total)

            blk = blocks[r0:r1]
            exp = scales[r0:r1]

            # nibble indices -> int64
            idx_lo = (blk & 0x0F).to(torch.long)
            idx_hi = (blk >> 4).to(torch.long)

            sub = out[r0:r1]
            sub[:, 0::2] = lut[idx_lo]
            sub[:, 1::2] = lut[idx_hi]

            torch.ldexp(sub, exp, out=sub)
            del idx_lo, idx_hi, blk, exp

        return out.reshape(*prefix_shape, G, B * 2).view(*prefix_shape, G * B * 2)

    def _get_mxfp4_tensor_copy(self, blocks_name: str, scales_name: str, dtype: torch.dtype = torch.bfloat16):
        "short version that uses a lot of memory"

        loaded_blocks = self._get_tensor(blocks_name)
        # Split it into low and high nibbles, upcast to bytes, and interleave (for swiglu)
        loaded_blocks_lo = loaded_blocks & 0x0F
        loaded_blocks_hi = loaded_blocks >> 4
        loaded_blocks = torch.stack((loaded_blocks_lo, loaded_blocks_hi), dim=-1)
        loaded_blocks = loaded_blocks.view(*loaded_blocks.shape[:-2], loaded_blocks.shape[-2] * 2)

        loaded_scales = self._get_tensor(scales_name)
        # Upcast to int32 and subtract bias
        loaded_scales = loaded_scales.int() - 127

        # Convert MXFP4 numbers into target dtype
        fp4_values = torch.tensor(FP4_VALUES, dtype=dtype, device=self.device_str)
        loaded_tensor = torch.ldexp(fp4_values[loaded_blocks.int()], loaded_scales.unsqueeze(-1))
        loaded_tensor = loaded_tensor.view(*loaded_tensor.shape[:-2], -1)
        return loaded_tensor


================================================
FILE: gpt_oss/triton/__init__.py
================================================


================================================
FILE: gpt_oss/triton/attention.py
================================================
"""FlashAttention w/support for learned sinks and banded attention.

This is an expanded version of the Flash Attention v2 implementation (see https://tridao.me/publications/flash2/flash2.pdf)
which can be found at https://triton-lang.org/main/getting-started/tutorials/06-fused-attention.html.

This version has been extended to support banded attention and learned attention sinks.
"""

import pytest
import torch

import triton
import triton.language as tl
from triton.tools.tensor_descriptor import TensorDescriptor


@triton.jit
def _attn_fwd(
    Q,
    K,
    V,
    Sinks,
    sm_scale,
    M,
    Out,  #
    Start_q,
    Z,
    H,
    N_Q_CTX,
    N_KV_CTX,
    HEAD_DIM: tl.constexpr,  #
    BLOCK_M: tl.constexpr,  #
    BLOCK_N: tl.constexpr,  #
    BANDWIDTH: tl.constexpr,
):
    tl.static_assert(BLOCK_N <= HEAD_DIM)
    start_q = tl.load(Start_q).to(tl.int32)
    start_m = tl.program_id(0)
    off_hz = tl.program_id(1)
    off_z = off_hz // H
    off_h = off_hz % H

    # load attention sinks
    if Sinks is not None:
        sink = tl.load(Sinks + off_h).to(tl.float32)
    else:
        sink = 0

    # initialize offsets
    offs_m = start_m * BLOCK_M + tl.arange(0, BLOCK_M)
    offs_n = tl.arange(0, BLOCK_N)
    # initialize pointer to m and l
    m_i = tl.zeros([BLOCK_M], dtype=tl.float32) + sink
    l_i = tl.zeros([BLOCK_M], dtype=tl.float32)
    acc = tl.zeros([BLOCK_M, HEAD_DIM], dtype=tl.float32)
    # load scales
    qk_scale = sm_scale
    q = Q.load([off_z, off_h, start_m * BLOCK_M, 0]).reshape([BLOCK_M, HEAD_DIM])

    if BANDWIDTH:
        lo, hi = tl.maximum(start_q, start_q + start_m * BLOCK_M - BANDWIDTH), start_q + (start_m + 1) * BLOCK_M
    else:
        lo, hi = start_q, start_q + (start_m + 1) * BLOCK_M

    for start_n in range(lo, hi, BLOCK_N):
        start_n = tl.multiple_of(start_n, BLOCK_N)

        mask = (start_n + offs_n)[None, :] > (start_q + offs_m)[:, None]

        if BANDWIDTH:
            too_old = (start_n + offs_n[None, :]) < (start_q + offs_m[:, None] - BANDWIDTH + 1)
            mask = mask | too_old

        k = K.load([off_z, off_h, start_n, 0]).reshape([BLOCK_N, HEAD_DIM]).T
        qk = tl.dot(q, k, allow_tf32=False)

        qk = qk * qk_scale + tl.where(mask, -1.0e6, 0.0)
        m_ij = tl.maximum(m_i, tl.max(qk, 1))
        qk -= m_ij[:, None]

        p = tl.math.exp(qk)
        alpha = tl.math.exp(m_i - m_ij)
        l_ij = tl.sum(p, 1)
        acc = acc * alpha[:, None]

        v = V.load([off_z, off_h, start_n, 0]).reshape([BLOCK_N, HEAD_DIM])
        v = v.to(tl.float32)
        acc = tl.dot(p, v, acc, allow_tf32=False)

        l_i = l_i * alpha + l_ij
        m_i = m_ij

    sink = tl.math.exp(sink - m_i)
    z = l_i + sink
    acc = acc / z[:, None]
    m_i += tl.math.log(l_i)
    m_ptrs = M + off_hz * N_Q_CTX + offs_m
    tl.store(m_ptrs, m_i)
    acc = acc.to(Out.dtype)[None, None, :, :]
    Out.store([off_z, off_h, start_m * BLOCK_M, 0], acc)


class _attention(torch.autograd.Function):
    @staticmethod
    def forward(ctx, q, k, v, sinks, sm_scale, bandwidth, start_q):
        assert len(start_q) == 1
        bs, n_ctx, n_kv_heads, repeat_kv, HEAD_DIM_Q = q.shape
        bs, n_kv_ctx, n_kv_heads, HEAD_DIM_K = k.shape
        bs, n_kv_ctx, n_kv_heads, HEAD_DIM_V = v.shape
        n_heads = n_kv_heads * repeat_kv
        q = q.view(bs, n_ctx, n_heads, HEAD_DIM_Q)
        k = k.view(bs, n_kv_ctx, n_kv_heads, HEAD_DIM_K)
        assert HEAD_DIM_Q == HEAD_DIM_K and HEAD_DIM_K == HEAD_DIM_V
        assert HEAD_DIM_K in {16, 32, 64, 128, 256}

        q = q.transpose(1, 2).contiguous()
        k = k.repeat_interleave(repeat_kv, dim=2).transpose(1, 2).contiguous()
        v = v.repeat_interleave(repeat_kv, dim=2).transpose(1, 2).contiguous()

        BLOCK_M = 64
        BLOCK_N = 64
        m_pad_size = BLOCK_M - n_ctx % BLOCK_M if n_ctx % BLOCK_M != 0 else 0
        # pad q to multiple of its block size in the n_ctx dimension (-2)
        q = torch.nn.functional.pad(q, (0, 0, 0, m_pad_size))
        n_pad_size = BLOCK_N - n_kv_ctx % BLOCK_N if n_kv_ctx % BLOCK_N != 0 else 0
        # pad k and v to multiple of their block size in the n_kv_ctx dimension
        k = torch.nn.functional.pad(k, (0, 0, 0, n_pad_size))
        v = torch.nn.functional.pad(v, (0, 0, 0, n_pad_size))

        o = torch.empty_like(q)
        M = torch.empty((bs, n_heads, n_ctx + m_pad_size), device=q.device, dtype=torch.float32)
        grid = (triton.cdiv(n_ctx, BLOCK_M), bs * n_heads, 1)
        _attn_fwd[grid](
            TensorDescriptor.from_tensor(q, [1, 1, BLOCK_M, HEAD_DIM_K]),
            TensorDescriptor.from_tensor(k, [1, 1, BLOCK_N, HEAD_DIM_K]),
            TensorDescriptor.from_tensor(v, [1, 1, BLOCK_N, HEAD_DIM_K]),
            sinks,
            sm_scale,
            M,
            TensorDescriptor.from_tensor(o, [1, 1, BLOCK_M, HEAD_DIM_K]),
            start_q,
            q.shape[0],
            q.shape[1],
            N_Q_CTX=n_ctx + m_pad_size,
            N_KV_CTX=n_kv_ctx,
            HEAD_DIM=HEAD_DIM_K,
            BANDWIDTH=bandwidth,
            BLOCK_M=BLOCK_M,
            BLOCK_N=BLOCK_N,
        )

        ctx.save_for_backward(q, k, v, sinks, o, M, start_q)
        ctx.sm_scale = sm_scale
        ctx.bandwidth = bandwidth

        o = o[:, :, :n_ctx, :].transpose(1, 2).contiguous()
        o = o.view(bs, n_ctx, n_heads * HEAD_DIM_V)
        return o


attention = _attention.apply


def attention_ref(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    sinks: torch.Tensor,
    sm_scale: float = 0.125,
    sliding_window: int | None = None,
    start_q: torch.LongTensor = 0,
):
    batch_size, num_queries, num_key_value_heads, num_key_value_groups, head_dim = query.shape
    batch_size, num_keys, num_key_value_heads, head_dim = key.shape

    sinks = sinks.view(1, num_key_value_heads, num_key_value_groups, 1, 1).float()
    key = key.unsqueeze(3)
    value = value.unsqueeze(3)

    pos_keys = torch.arange(num_keys, device=query.device)
    pos_queries = torch.arange(num_queries, device=query.device) + start_q
    mask = pos_keys[None, :] > pos_queries[:, None]
    mask = mask.float().masked_fill(mask, float("-inf"))

    if sliding_window:
        too_old = pos_keys[None, :] < (pos_queries[:, None] - sliding_window + 1)
        mask.masked_fill_(too_old, float("-inf"))

    logits = torch.einsum("bqhmd,bkhmd->bhmqk", query.float(), key.float()) * sm_scale
    logits = logits + mask[None, None, None, :, :]

    logits_max = torch.max(logits, dim=-1, keepdim=True).values
    logits_or_sinks_max = torch.maximum(sinks, logits_max)
    sinks = torch.exp(sinks - logits_or_sinks_max)
    unnormalized_scores = torch.exp(logits - logits_or_sinks_max)
    normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks
    scores = unnormalized_scores / normalizer

    output = torch.einsum("bhmqk,bkhmd->bqhmd", scores, value.float())

    output = output.reshape(batch_size, num_queries, num_key_value_heads * num_key_value_groups * head_dim).bfloat16()
    return output


@pytest.mark.parametrize("batch_size", [1, 2])
@pytest.mark.parametrize("num_queries", [1, 128])
@pytest.mark.parametrize("num_keys", [128, 32])
@pytest.mark.parametrize("num_key_value_heads", [8])
@pytest.mark.parametrize("num_key_value_groups", [8])
@pytest.mark.parametrize("head_dim", [64])
@pytest.mark.parametrize("sm_scale", [0.125])
@pytest.mark.parametrize("sliding_window", [None, 128])
@pytest.mark.parametrize("start_q", [0, 5])
def test_eq(batch_size, num_queries, num_keys, num_key_value_heads, num_key_value_groups, head_dim, sm_scale, sliding_window, start_q):
    if num_queries > num_keys:
        pytest.skip("too many queries")

    q = torch.randn(batch_size, num_queries, num_key_value_heads, num_key_value_groups, head_dim).bfloat16().cuda()
    k = torch.randn(batch_size, num_keys, num_key_value_heads, head_dim).bfloat16().cuda()
    v = torch.randn(batch_size, num_keys, num_key_value_heads, head_dim).bfloat16().cuda()
    sinks = torch.randn(num_key_value_heads * num_key_value_groups).bfloat16().cuda()

    start_q = torch.tensor([start_q], dtype=torch.int32).cuda()

    o1 = attention(q, k, v, sinks, sm_scale, sliding_window, start_q)
    o2 = attention_ref(q, k, v, sinks, sm_scale, sliding_window, start_q)

    torch.testing.assert_close(o1, o2)


================================================
FILE: gpt_oss/triton/model.py
================================================
import json
import math
import os

import torch
from torch.profiler import record_function

from gpt_oss.torch.model import ModelConfig, RMSNorm
from gpt_oss.torch.weights import Checkpoint
from gpt_oss.triton.attention import attention, attention_ref
from gpt_oss.triton.moe import quantize_mx4, moe


class RotaryEmbedding(torch.nn.Module):
    def __init__(
        self,
        head_dim: int,
        base: int,
        dtype: torch.dtype,
        initial_context_length: int = 4096,
        max_context_length: int = 131072,
        scaling_factor: float = 1.0,
        ntk_alpha: float = 1.0,
        ntk_beta: float = 32.0,
        device: torch.device | None = None,
    ) -> None:
        super().__init__()
        self.head_dim = head_dim
        self.base = base
        self.dtype = dtype
        self.initial_context_length = initial_context_length
        self.max_context_length = max_context_length
        self.scaling_factor = scaling_factor
        self.ntk_alpha = ntk_alpha
        self.ntk_beta = ntk_beta
        self.device = device
        self.cos, self.sin = self._compute_cos_sin(0, self.max_context_length)

    def _compute_concentration_and_inv_freq(self) -> torch.Tensor:
        """See YaRN paper: https://arxiv.org/abs/2309.00071"""
        freq = self.base ** (
            torch.arange(0, self.head_dim, 2, dtype=torch.float, device=self.device)
            / self.head_dim
        )
        if self.scaling_factor > 1.0:
            concentration = (
                0.1 * math.log(self.scaling_factor) + 1.0
            )  # YaRN concentration

            d_half = self.head_dim / 2
            # NTK by parts
            low = (
                d_half
                * math.log(self.initial_context_length / (self.ntk_beta * 2 * math.pi))
                / math.log(self.base)
            )
            high = (
                d_half
                * math.log(self.initial_context_length / (self.ntk_alpha * 2 * math.pi))
                / math.log(self.base)
            )
            assert 0 < low < high < d_half - 1

            interpolation = 1.0 / (self.scaling_factor * freq)
            extrapolation = 1.0 / freq

            ramp = (
                torch.arange(d_half, dtype=torch.float32, device=freq.device) - low
            ) / (high - low)
            mask = 1 - ramp.clamp(0, 1)

            inv_freq = interpolation * (1 - mask) + extrapolation * mask
        else:
            concentration = 1.0
            inv_freq = 1.0 / freq

        return concentration, inv_freq

    def _compute_cos_sin(self, start: int, num_tokens: int):
        concentration, inv_freq = self._compute_concentration_and_inv_freq()
        t = torch.arange(start, start + num_tokens, dtype=torch.float32, device=self.device)
        freqs = torch.einsum("i,j->ij", t, inv_freq)
        cos = freqs.cos() * concentration
        sin = freqs.sin() * concentration
        return cos, sin

    @record_function("rotate")
    def _rotate(
        self,
        x: torch.Tensor,
        cos: torch.Tensor,
        sin: torch.Tensor,
    ) -> torch.Tensor:
        cos = cos[None, :, None, :].to(x.dtype)
        sin = sin[None, :, None, :].to(x.dtype)
        x1, x2 = torch.chunk(x, 2, dim=-1)
        o1 = x1 * cos - x2 * sin
        o2 = x2 * cos + x1 * sin
        return torch.cat((o1, o2), dim=-1)

    @record_function("rope")
    def forward(
        self,
        query: torch.Tensor,
        key: torch.Tensor,
        offset: torch.LongTensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        batch_size, num_tokens, num_heads, head_dim = query.shape
        batch_size, num_tokens, num_key_value_heads, head_dim = key.shape

        idx = torch.arange(num_tokens, device=query.device, dtype=torch.long) + offset
        idx = idx % self.max_context_length
        cos = self.cos.index_select(0, idx)
        sin = self.sin.index_select(0, idx)

        query = self._rotate(query, cos, sin)
        key = self._rotate(key, cos, sin)
        return query, key


class Cache:
    def __init__(self, batch_size, n_ctx, n_kv_heads, d_head=64, device: torch.device | None = None):
        self.k = torch.zeros((batch_size, n_ctx, n_kv_heads, d_head), dtype=torch.bfloat16, device=device)
        self.v = torch.zeros((batch_size, n_ctx, n_kv_heads, d_head), dtype=torch.bfloat16, device=device)
        self.offset = torch.zeros((1,), dtype=torch.long, device=device)

    def reset(self):
        self.k.zero_()
        self.v.zero_()
        self.offset.zero_()

    def repeat_interleave(self, n):
        """Repeat each cache entry n times along the batch dimension."""
        self.k = self.k.repeat_interleave(n, dim=0)
        self.v = self.v.repeat_interleave(n, dim=0)

    def truncate(self, n_ctx):
        """Truncate the cache to the first n_ctx tokens."""
        batch_size, _, n_kv_heads, d_head = self.k.shape
        assert batch_size == self.v.shape[0]
        assert n_ctx <= self.k.shape[1]
        self.k[:, n_ctx:, :, :].zero_()
        self.v[:, n_ctx:, :, :].zero_()
        self.offset.fill_(n_ctx)
        return self.k, self.v

    def extend(self, k, v):
        batch_size, n_ctx, *_rest = k.shape
        assert batch_size == self.k.shape[0]
        indices = torch.arange(0, n_ctx, device=k.device, dtype=torch.long) + self.offset
        self.k.index_copy_(1, indices, k)
        self.v.index_copy_(1, indices, v)
        self.offset.add_(n_ctx)
        return self.k, self.v


class AttentionBlock(torch.nn.Module):
    def __init__(
        self,
        config: ModelConfig,
        layer_idx: int = 0,
        device: torch.device | None = None,
    ):
        super().__init__()
        self.head_dim = config.head_dim
        self.num_attention_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads
        # Only apply sliding window to every other layer
        self.sliding_window = config.sliding_window if layer_idx % 2 == 0 else 0
        self.layer_idx = layer_idx
        self.sinks = torch.nn.Parameter(
            torch.empty(config.num_attention_heads, device=device, dtype=torch.bfloat16)
        )
        self.norm = RMSNorm(config.hidden_size, device=device)
        qkv_dim = config.head_dim * (
            config.num_attention_heads + 2 * config.num_key_value_heads
        )
        self.qkv = torch.nn.Linear(
            config.hidden_size, qkv_dim, device=device, dtype=torch.bfloat16
        )
        self.out = torch.nn.Linear(
            config.head_dim * config.num_attention_heads,
            config.hidden_size,
            device=device,
            dtype=torch.bfloat16,
        )
        self.sm_scale = 1 / math.sqrt(config.head_dim)
        self.rope = RotaryEmbedding(
            config.head_dim,
            config.rope_theta,
            torch.float32,
            initial_context_length=config.initial_context_length,
            scaling_factor=config.rope_scaling_factor,
            ntk_alpha=config.rope_ntk_alpha,
            ntk_beta=config.rope_ntk_beta,
            device=device,
        )

    @record_function("attn")
    def forward(self, x: torch.Tensor, cache: Cache | None = None) -> torch.Tensor:
        batch_size, n_ctx, dim = x.shape

        t = self.norm(x)
        with record_function("qkv"):
            qkv = self.qkv(t)
            qkv_parts = (
                self.num_attention_heads * self.head_dim,
                self.num_key_value_heads * self.head_dim,
                self.num_key_value_heads * self.head_dim
            )
            q, k, v = torch.split(qkv, qkv_parts, dim=-1)
            q, k, v = q.contiguous(), k.contiguous(), v.contiguous()

        q = q.view(batch_size, n_ctx, self.num_attention_heads, self.head_dim)
        k = k.view(batch_size, n_ctx, self.num_key_value_heads, self.head_dim)
        v = v.view(batch_size, n_ctx, self.num_key_value_heads, self.head_dim)

        if cache is not None:
            offset = cache.offset.clone()
            q, k = self.rope(q, k, offset=offset)
            k, v = cache.extend(k, v)
        else:
            offset = torch.zeros((1,), dtype=torch.long, device=x.device)
            q, k = self.rope(q, k, offset=offset)

        q = q.view(
            batch_size,
            n_ctx,
            self.num_attention_heads // self.num_key_value_heads,
            self.num_key_value_heads,
            self.head_dim,
        )
        with record_function("attn_kernel"):
            if n_ctx == 1:
                t = attention_ref(
                    q,
                    k,
                    v,
                    self.sinks,
                    self.sm_scale,
                    self.sliding_window,
                    offset,
                )
            else:
                t = attention(
                    q,
                    k,
                    v,
                    self.sinks,
                    self.sm_scale,
                    self.sliding_window,
                    offset,
                )
                if n_ctx < 64:
                    t1 = attention_ref(
                        q,
                        k,
                        v,
                        self.sinks,
                        self.sm_scale,
                        self.sliding_window,
                        offset,
                    )
                    torch.testing.assert_close(t, t1)
                    t = t1

        with record_function("c_proj"):
            t = self.out(t)
        t = x + t
        return t


class MLPBlock(torch.nn.Module):
    def __init__(
        self,
        config: ModelConfig,
        layer_idx: int = 0,
        device: torch.device | None = None,
    ):
        super().__init__()
        self.layer_idx = layer_idx
        self.num_experts = config.num_experts
        self.experts_per_token = config.experts_per_token
        self.swiglu_limit = config.swiglu_limit
        self.norm = RMSNorm(config.hidden_size, device=device)
        self.gate = torch.nn.ParameterDict({
            "weight": torch.nn.Parameter(
                torch.empty(
                    (config.hidden_size, config.num_experts),
                    device=device,
                    dtype=torch.bfloat16,
                )
            ),
            "bias": torch.nn.Parameter(
                torch.empty(
                    (config.num_experts,),
                    device=device,
                    dtype=torch.bfloat16,
                )
            ),
        })
        self.mlp1_weight_tensor, self.mlp1_weight_mx = quantize_mx4(
            torch.empty(
                (
                    config.num_experts,
                    config.hidden_size,
                    config.intermediate_size * 2,
                ),
                device=device,
                dtype=torch.bfloat16,
            ),
        )
        self.mlp1_weight = torch.nn.Parameter(self.mlp1_weight_tensor.storage.data, requires_grad=False)
        self.mlp1_bias = torch.nn.Parameter(
            torch.empty(
                (config.num_experts, config.intermediate_size * 2),
                device=device,
                dtype=torch.bfloat16,
            )
        )
        self.mlp2_weight_tensor, self.mlp2_weight_mx = quantize_mx4(
            torch.empty(
                (
                    config.num_experts,
                    config.intermediate_size,
                    config.hidden_size,
                ),
                device=device,
                dtype=torch.bfloat16,
            ),
        )
        self.mlp2_weight = torch.nn.Parameter(self.mlp2_weight_tensor.storage.data, requires_grad=False)
        self.mlp2_bias = torch.nn.Parameter(
            torch.empty(
                (config.num_experts, config.hidden_size),
                device=device,
                dtype=torch.bfloat16,
            )
        )

    @record_function("mlp")
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        batch_size, n_ctx, dim = x.shape
        t = self.norm(x)

        t = t.view(batch_size * n_ctx, dim)
        t = moe(
            t,
            self.gate["weight"],
            self.mlp1_weight_tensor, self.mlp1_weight_mx,
            self.mlp2_weight_tensor, self.mlp2_weight_mx,
            self.gate["bias"].float(),
            self.mlp1_bias.float(),
            self.mlp2_bias.float(),
            experts_per_token=self.experts_per_token,
            num_experts=self.num_experts,
            swiglu_limit=self.swiglu_limit,
        )
        t = t.view(batch_size, n_ctx, dim)

        return x + t


class TransformerBlock(torch.nn.Module):
    def __init__(
        self,
        config: ModelConfig,
        layer_idx: int,
        device: torch.device | None = None,
    ):
        super().__init__()
        self.layer_idx = layer_idx
        self.attn = AttentionBlock(config, layer_idx, device)
        self.mlp = MLPBlock(config, layer_idx, device)

    def forward(self, x: torch.Tensor, cache: Cache | None = None) -> torch.Tensor:
        x = self.attn(x, cache=cache)
        x = self.mlp(x)
        return x


class Transformer(torch.nn.Module):
    def __init__(
        self,
        config: ModelConfig,
        device: torch.device | None = None,
    ):
        super().__init__()
        self.config = config
        self.embedding = torch.nn.Embedding(
            config.vocab_size, config.hidden_size, device=device, dtype=torch.bfloat16
        )
        self.block = torch.nn.ModuleList(
            [
                TransformerBlock(config, layer_idx, device)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = RMSNorm(config.hidden_size, device=device)
        self.unembedding = torch.nn.Linear(
            config.hidden_size,
            config.vocab_size,
            bias=False,
            device=device,
            dtype=torch.bfloat16,
        )

    def forward(self, x: torch.Tensor, caches: list[Cache] | None = None) -> torch.Tensor:
        caches=caches or [None] * len(self.block)
        with record_function("embedding"):
            x = self.embedding(x)
        for block, cache in zip(self.block, caches):
            with record_function("block"):
                x = block(x, cache=cache)
        with record_function("norm_f"):
            x = self.norm(x)
        with record_function("unembedding"):
            x = self.unembedding(x)
        return x.float()

    @staticmethod
    def from_checkpoint(
        path: str, config: ModelConfig | None = None, device: str | torch.device = "cuda",
    ) -> "Transformer":
        if not isinstance(device, torch.device):
            device = torch.device(device)

        if config is None:
            config_path = os.path.join(path, "config.json")
            with open(config_path, "r") as f:
                json_config = json.load(f)
                config = ModelConfig(**json_config)

        model = Transformer(config=config, device=device)
        model.eval()

        checkpoint = Checkpoint(path, device)

        for name, param in model.named_parameters():
            torch.cuda.empty_cache()
            loaded_tensor = checkpoint.get(name)

            if "mlp1" in name:
                if "weight" in name:
                    loaded_tensor, scales = quantize_mx4(loaded_tensor.mT.contiguous())
                    _, block_index, _, _ = name.split(".")
                    model.block[int(block_index)].mlp.mlp1_weight_mx = scales
                    param.data.copy_(loaded_tensor.storage.data)
                else:
                    param.data.copy_(loaded_tensor)

            elif "mlp2_weight" in name:
                loaded_tensor, scales = quantize_mx4(loaded_tensor.mT.contiguous())
                _, block_index, _, _ = name.split(".")
                model.block[int(block_index)].mlp.mlp2_weight_mx = scales
                param.data.copy_(loaded_tensor.storage.data)

            elif "gate" in name and loaded_tensor.ndim == 2:
                loaded_tensor = loaded_tensor.mT.contiguous()
                param.data.copy_(loaded_tensor)

            else:
                param.data.copy_(loaded_tensor)

        # NOTE: Required to avoid OOM errors
        torch.cuda.empty_cache()
        return model


class TokenGenerator:
    @torch.inference_mode()
    def __init__(self, checkpoint: str, context: int, device: torch.device):
        self.device = device
        self.model = Transformer.from_checkpoint(checkpoint, device=self.device)
        self.caches = [Cache(1, context, self.model.config.num_key_value_heads, device=self.device) for _ in range(len(self.model.block))]
        self.input_token = torch.zeros(1, dtype=torch.int32, device=self.device)
        # warmup
        self.model(self.input_token[None, :], caches=self.caches)
        # capture for sampling
        self.graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(self.graph):
            self.logits = self.model(self.input_token[None, :], caches=self.caches)[0]

    @torch.inference_mode()
    def generate(self,
                 prompt_tokens: list[int],
                 stop_tokens: list[int] | None = None,
                 temperature: float = 1.0,
                 max_tokens: int = 0,
                 return_logprobs: bool = False):
        stop_tokens = stop_tokens or []
        for cache in self.caches:
            cache.reset()
        prompt_tokens = torch.as_tensor(prompt_tokens, dtype=torch.int32, device=self.device)
        self.model(prompt_tokens[None, :-1], self.caches)
        predicted_token = prompt_tokens[-1]
        num_generated_tokens = 0
        while max_tokens == 0 or num_generated_tokens < max_tokens:
            self.input_token[0] = predicted_token
            self.graph.replay()
            if temperature == 0.0:
                predicted_token = torch.argmax(self.logits[-1, :], dim=-1).item()
            else:
                probs = torch.softmax(self.logits * (1.0 / temperature), dim=-1)
                predicted_token = torch.multinomial(probs[-1, :], num_samples=1).item()
            num_generated_tokens += 1

            if return_logprobs:
                logprobs = torch.log_softmax(self.logits[-1, :], dim=-1)
                selected_logprobs = logprobs[predicted_token].item()
                yield predicted_token, selected_logprobs
            else:
                yield predicted_token

            if predicted_token in stop_tokens:
                break


================================================
FILE: gpt_oss/triton/moe.py
================================================
import torch
from torch.profiler import record_function

import triton_kernels
import triton_kernels.swiglu
from triton_kernels.numerics_details.mxfp import downcast_to_mxfp
from triton_kernels.matmul_ogs import PrecisionConfig, FlexCtx, FnSpecs, FusedActivation
from triton_kernels.matmul_ogs import matmul_ogs
from triton_kernels.numerics import InFlexData
from triton_kernels.routing import routing
from triton_kernels.tensor import convert_layout
from triton_kernels.tensor_details.layout import StridedLayout, HopperMXScaleLayout, HopperMXValueLayout
from triton_kernels.tensor import wrap_torch_tensor, FP4


def quantize_mx4(w):
    w, w_scale = downcast_to_mxfp(w.to(torch.bfloat16), torch.uint8, axis=1)
    w = convert_layout(wrap_torch_tensor(w, dtype=FP4), HopperMXValueLayout, mx_axis=1)
    w_scale = convert_layout(wrap_torch_tensor(w_scale), StridedLayout)
    return w, w_scale


def swiglu(x, alpha: float = 1.702, limit: float = 7.0, interleaved: bool = True):
    if interleaved:
        x_glu, x_linear = x[..., ::2], x[..., 1::2]
    else:
        x_glu, x_linear = torch.chunk(x, 2, dim=-1)
    x_glu = x_glu.clamp(min=None, max=limit)
    x_linear = x_linear.clamp(min=-limit, max=limit)
    out_glu = x_glu * torch.sigmoid(alpha * x_glu)
    return out_glu * (x_linear + 1)


def moe(x, wg, w1, w1_mx, w2, w2_mx, bg, b1, b2, experts_per_token=4, num_experts=128, swiglu_limit=7.0, fused_act=True, interleaved=True):
    if x.numel() == 0:
        return x

    pc1 = PrecisionConfig(weight_scale=w1_mx, flex_ctx=FlexCtx(rhs_data=InFlexData()))
    pc2 = PrecisionConfig(weight_scale=w2_mx, flex_ctx=FlexCtx(rhs_data=InFlexData()))
    pcg = PrecisionConfig(flex_ctx=FlexCtx(rhs_data=InFlexData()))

    with record_function("wg"):
        logits = matmul_ogs(x, wg, bg, precision_config=pcg)
    with record_function("routing"):
        rdata, gather_indx, scatter_indx = routing(logits, experts_per_token, simulated_ep=1)

    if fused_act:
        assert interleaved, "Fused activation requires interleaved weights"
        with record_function("w1+swiglu"):
            act = FusedActivation(FnSpecs("swiglu", triton_kernels.swiglu.swiglu_fn, ("alpha", "limit")), (1.702, swiglu_limit), 2)
            x = matmul_ogs(x, w1, b1, rdata, gather_indx=gather_indx, precision_config=pc1, fused_activation=act)
    else:
        with record_function("w1"):
            x = matmul_ogs(x, w1, b1, rdata, gather_indx=gather_indx, precision_config=pc1)
        with record_function("swiglu"):
            x = swiglu(x, limit=swiglu_limit, interleaved=interleaved)

    with record_function("w2"):
        x = matmul_ogs(x, w2, b2, rdata, scatter_indx=scatter_indx, precision_config=pc2, gammas=rdata.gate_scal)
    return x


================================================
FILE: gpt_oss/vllm/token_generator.py
================================================
from vllm import LLMEngine, EngineArgs, SamplingParams, TokensPrompt


class TokenGenerator:
    def __init__(self, model_path: str, tensor_parallel_size: int = 1):
        args = EngineArgs(
            model=model_path,
            tensor_parallel_size=tensor_parallel_size,
        )
        self.engine = LLMEngine.from_engine_args(args)
        self.request_id = 0

    def generate(self,
                 prompt_tokens: list[int],
                 stop_tokens: list[int] | None = None,
                 temperature: float = 1.0,
                 max_tokens: int = 0,
                 return_logprobs: bool = False):
        if max_tokens == 0:
            max_tokens = None
        request_id = str(self.request_id)
        self.request_id += 1
        sampling_params = SamplingParams(temperature=temperature,
                                         max_tokens=max_tokens,
                                         stop_token_ids=stop_tokens,
                                         logprobs=0 if return_logprobs else None)
        prompt = TokensPrompt(prompt_token_ids=prompt_tokens)
        self.engine.add_request(request_id, prompt, sampling_params)
        last_token_id = []
        while self.engine.has_unfinished_requests():
            step_outputs = self.engine.step()
            output = step_outputs[0].outputs[0]
            token_ids = output.token_ids
            logprobs_list = output.logprobs if hasattr(output, "logprobs") else None
            new_token_ids = token_ids[len(last_token_id):]
            new_logprobs = logprobs_list[len(last_token_id):] if logprobs_list is not None else [None] * len(new_token_ids)
            for token_id, logprobs in zip(new_token_ids, new_logprobs):
                last_token_id.append(token_id)
                if return_logprobs:
                    logprob_val = None
                    if logprobs is not None and token_id in logprobs:
                        logprob_val = logprobs[token_id].logprob
                    yield (token_id, logprob_val)
                else:
                    yield token_id
                if stop_tokens is not None and token_id in stop_tokens:
                    break


================================================
FILE: pyproject.toml
================================================
[project]
name = "gpt-oss"
description = "A collection of reference inference implementations for gpt-oss by OpenAI"

dependencies = [
  "openai-harmony",
  "tiktoken>=0.9.0",
  "aiohttp>=3.12.14",
  "chz>=0.3.0",
  "docker>=7.1.0",
  "fastapi>=0.116.1",
  "html2text>=2025.4.15",
  "lxml>=4.9.4",
  "pydantic>=2.11.7",
  "structlog>=25.4.0",
  "tenacity>=9.1.2",
  "uvicorn>=0.35.0",
  "requests>=2.31.0",
  "termcolor",
  "jupyter-client>=8.6.3",
]
readme = "README.md"
requires-python = ">=3.12"
version = "0.0.9"

[project.optional-dependencies]
triton = ["triton>=3.4", "safetensors>=0.5.3", "torch>=2.7.0"]
torch = ["safetensors>=0.5.3", "torch>=2.7.0"]
metal = ["numpy", "tqdm", "safetensors", "torch"]
test = ["pytest>=8.4.1", "httpx>=0.28.1"]
eval = ["pandas", "numpy", "openai", "jinja2", "tqdm", "blobfile"]

[build-system]
requires = ["setuptools>=68"]
build-backend = "gpt_oss_build_backend.backend"
backend-path = ["_build"]

[tool.setuptools.packages.find]
include = ["gpt_oss*"]

[tool.scikit-build]
cmake.source-dir = "." # pick up the root CMakeLists.txt
cmake.args = [
  "-DGPTOSS_BUILD_PYTHON=ON",
  "-DCMAKE_BUILD_TYPE=Release",
  "-DBUILD_SHARED_LIBS=OFF",
]
[tool.scikit-build.wheel]
packages = ["gpt_oss"] # copy the whole Python package tree


================================================
FILE: tests/conftest.py
================================================
import os
import sys
import pytest
from typing import Generator, Any
from unittest.mock import Mock, MagicMock
from fastapi.testclient import TestClient

sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))

from openai_harmony import (
    HarmonyEncodingName,
    load_harmony_encoding,
)
from gpt_oss.responses_api.api_server import create_api_server


@pytest.fixture(scope="session")
def harmony_encoding():
    return load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)


@pytest.fixture
def mock_infer_token(harmony_encoding):
    fake_tokens = harmony_encoding.encode(
        "<|channel|>final<|message|>Test response<|return|>", 
        allowed_special="all"
    )
    token_queue = fake_tokens.copy()
    
    def _mock_infer(tokens: list[int], temperature: float = 0.0, new_request: bool = False) -> int:
        nonlocal token_queue
        if len(token_queue) == 0:
            token_queue = fake_tokens.copy()
        return token_queue.pop(0)
    return _mock_infer


@pytest.fixture
def api_client(harmony_encoding, mock_infer_token) -> Generator[TestClient, None, None]:
    app = create_api_server(
        infer_next_token=mock_infer_token,
        encoding=harmony_encoding
    )
    with TestClient(app) as client:
        yield client


@pytest.fixture
def sample_request_data():
    return {
        "model": "gpt-oss-120b",
        "input": "Hello, how can I help you today?",
        "stream": False,
        "reasoning_effort": "low",
        "temperature": 0.7,
        "tools": []
    }


@pytest.fixture
def mock_browser_tool():
    mock = MagicMock()
    mock.search.return_value = ["Result 1", "Result 2"]
    mock.open_page.return_value = "Page content"
    mock.find_on_page.return_value = "Found text"
    return mock


@pytest.fixture
def mock_python_tool():
    mock = MagicMock()
    mock.execute.return_value = {
        "output": "print('Hello')",
        "error": None,
        "exit_code": 0
    }
    return mock


@pytest.fixture(autouse=True)
def reset_test_environment():
    test_env_vars = ['OPENAI_API_KEY', 'GPT_OSS_MODEL_PATH']
    original_values = {}
    
    for var in test_env_vars:
        if var in os.environ:
            original_values[var] = os.environ[var]
            del os.environ[var]
    
    yield
    
    for var, value in original_values.items():
        os.environ[var] = value


@pytest.fixture
def performance_timer():
    import time
    
    class Timer:
        def __init__(self):
            self.start_time = None
            self.end_time = None
        
        def start(self):
            self.start_time = time.time()
        
        def stop(self):
            self.end_time = time.time()
            return self.elapsed
        
        @property
        def elapsed(self):
            if self.start_time and self.end_time:
                return self.end_time - self.start_time
            return None
    
    return Timer()

================================================
FILE: tests/gpt_oss/tools/simple_browser/test_backend.py
================================================
import pytest
from typing import Generator, Any
from unittest import mock
from aiohttp import ClientSession

from gpt_oss.tools.simple_browser.backend import YouComBackend

class MockAiohttpResponse:
    """Mocks responses for get/post requests from async libraries."""

    def __init__(self, json: dict, status: int):
        self._json = json
        self.status = status

    async def json(self):
        return self._json

    async def __aexit__(self, exc_type, exc, tb):
        pass

    async def __aenter__(self):
        return self

def mock_os_environ_get(name: str, default: Any = "test_api_key"):
    assert name in ["YDC_API_KEY"]
    return default

def test_youcom_backend():
    backend = YouComBackend(source="web")
    assert backend.source == "web"

@pytest.mark.asyncio
@mock.patch("aiohttp.ClientSession.get")
async def test_youcom_backend_search(mock_session_get):
    backend = YouComBackend(source="web")
    api_response = {
        "results": {
            "web": [
                {"title": "Web Result 1", "url": "https://www.example.com/web1", "snippets": "Web Result 1 snippets"},
                {"title": "Web Result 2", "url": "https://www.example.com/web2", "snippets": "Web Result 2 snippets"},
            ],
            "news": [
                {"title": "News Result 1", "url": "https://www.example.com/news1", "description": "News Result 1 description"},
                {"title": "News Result 2", "url": "https://www.example.com/news2", "description": "News Result 2 description"},
            ],
        }
    }
    with mock.patch("os.environ.get", wraps=mock_os_environ_get):
        mock_session_get.return_value = MockAiohttpResponse(api_response, 200)
        async with ClientSession() as session:
            result = await backend.search(query="test", topn=10, session=session)
        assert result.title == "test"
        assert result.urls == {"0": "https://www.example.com/web1", "1": "https://www.example.com/web2", "2": "https://www.example.com/news1", "3": "https://www.example.com/news2"}

@pytest.mark.asyncio
@mock.patch("aiohttp.ClientSession.post")
async def test_youcom_backend_fetch(mock_session_get):
    backend = YouComBackend(source="web")
    api_response = [
        {"title": "Fetch Result 1", "url": "https://www.example.com/fetch1", "html": "<div>Fetch Result 1 text</div>"},
    ]
    with mock.patch("os.environ.get", wraps=mock_os_environ_get):
        mock_session_get.return_value = MockAiohttpResponse(api_response, 200)
        async with ClientSession() as session:
            result = await backend.fetch(url="https://www.example.com/fetch1", session=session)
        assert result.title == "Fetch Result 1"
        assert result.text == "\nURL: https://www.example.com/fetch1\nFetch Result 1 text"


================================================
FILE: tests/test_api_endpoints.py
================================================
import pytest
import json
import asyncio
from fastapi import status
from unittest.mock import patch, MagicMock, AsyncMock


class TestResponsesEndpoint:
    
    def test_basic_response_creation(self, api_client, sample_request_data):
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK
        data = response.json()
        assert "id" in data
        assert data["object"] == "response"
        assert data["model"] == sample_request_data["model"]
    
    def test_response_with_high_reasoning(self, api_client, sample_request_data):
        sample_request_data["reasoning_effort"] = "high"
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK
        data = response.json()
        assert "id" in data
        assert data["status"] == "completed"
    
    def test_response_with_medium_reasoning(self, api_client, sample_request_data):
        sample_request_data["reasoning_effort"] = "medium"
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK
        data = response.json()
        assert "id" in data
        assert data["status"] == "completed"
    
    def test_response_with_invalid_model(self, api_client, sample_request_data):
        sample_request_data["model"] = "invalid-model"
        response = api_client.post("/v1/responses", json=sample_request_data)
        # Should still accept but might handle differently
        assert response.status_code == status.HTTP_200_OK
    
    def test_response_with_empty_input(self, api_client, sample_request_data):
        sample_request_data["input"] = ""
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK
    
    def test_response_with_tools(self, api_client, sample_request_data):
        sample_request_data["tools"] = [
            {
                "type": "browser_search"
            }
        ]
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK
    
    def test_response_with_custom_temperature(self, api_client, sample_request_data):
        for temp in [0.0, 0.5, 1.0, 1.5, 2.0]:
            sample_request_data["temperature"] = temp
            response = api_client.post("/v1/responses", json=sample_request_data)
            assert response.status_code == status.HTTP_200_OK
            data = response.json()
            assert "usage" in data
    
    def test_streaming_response(self, api_client, sample_request_data):
        sample_request_data["stream"] = True
        with api_client.stream("POST", "/v1/responses", json=sample_request_data) as response:
            assert response.status_code == status.HTTP_200_OK
            # Verify we get SSE events
            for line in response.iter_lines():
                if line and line.startswith("data: "):
                    event_data = line[6:]  # Remove "data: " prefix
                    if event_data != "[DONE]":
                        json.loads(event_data)  # Should be valid JSON
                        break


class TestResponsesWithSession:
    
    def test_response_with_session_id(self, api_client, sample_request_data):
        session_id = "test-session-123"
        sample_request_data["session_id"] = session_id
        
        # First request
        response1 = api_client.post("/v1/responses", json=sample_request_data)
        assert response1.status_code == status.HTTP_200_OK
        data1 = response1.json()
        
        # Second request with same session
        sample_request_data["input"] = "Follow up question"
        response2 = api_client.post("/v1/responses", json=sample_request_data)
        assert response2.status_code == status.HTTP_200_OK
        data2 = response2.json()
        
        # Should have different response IDs
        assert data1["id"] != data2["id"]
    
    def test_response_continuation(self, api_client, sample_request_data):
        # Create initial response
        response1 = api_client.post("/v1/responses", json=sample_request_data)
        assert response1.status_code == status.HTTP_200_OK
        data1 = response1.json()
        response_id = data1["id"]
        
        # Continue the response
        continuation_request = {
            "model": sample_request_data["model"],
            "response_id": response_id,
            "input": "Continue the previous thought"
        }
        response2 = api_client.post("/v1/responses", json=continuation_request)
        assert response2.status_code == status.HTTP_200_OK


class TestErrorHandling:
    
    def test_missing_required_fields(self, api_client):
        # Model field has default, so test with empty JSON
        response = api_client.post("/v1/responses", json={})
        assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY
    
    def test_invalid_reasoning_effort(self, api_client, sample_request_data):
        sample_request_data["reasoning_effort"] = "invalid"
        response = api_client.post("/v1/responses", json=sample_request_data)
        # May handle gracefully or return error
        assert response.status_code in [status.HTTP_200_OK, status.HTTP_422_UNPROCESSABLE_ENTITY]
    
    def test_malformed_json(self, api_client):
        response = api_client.post(
            "/v1/responses",
            data="not json",
            headers={"Content-Type": "application/json"}
        )
        assert response.status_code == status.HTTP_422_UNPROCESSABLE_ENTITY
    
    def test_extremely_long_input(self, api_client, sample_request_data):
        # Test with very long input
        sample_request_data["input"] = "x" * 100000
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK


class TestToolIntegration:
    
    def test_browser_search_tool(self, api_client, sample_request_data):
        sample_request_data["tools"] = [
            {
                "type": "browser_search"
            }
        ]
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK
    
    def test_function_tool_integration(self, api_client, sample_request_data):
        sample_request_data["tools"] = [
            {
                "type": "function",
                "name": "test_function",
                "parameters": {"type": "object", "properties": {}},
                "description": "Test function"
            }
        ]
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK
    
    def test_multiple_tools(self, api_client, sample_request_data):
        sample_request_data["tools"] = [
            {
                "type": "browser_search"
            },
            {
                "type": "function",
                "name": "test_function",
                "parameters": {"type": "object", "properties": {}},
                "description": "Test function"
            }
        ]
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK


class TestPerformance:
    
    def test_response_time_under_threshold(self, api_client, sample_request_data, performance_timer):
        performance_timer.start()
        response = api_client.post("/v1/responses", json=sample_request_data)
        elapsed = performance_timer.stop()
        
        assert response.status_code == status.HTTP_200_OK
        # Response should be reasonably fast for mock inference
        assert elapsed < 5.0  # 5 seconds threshold
    
    def test_multiple_sequential_requests(self, api_client, sample_request_data):
        # Test multiple requests work correctly
        for i in range(3):
            data = sample_request_data.copy()
            data["input"] = f"Request {i}"
            response = api_client.post("/v1/responses", json=data)
            assert response.status_code == status.HTTP_200_OK


class TestUsageTracking:
    
    def test_usage_object_structure(self, api_client, sample_request_data):
        response = api_client.post("/v1/responses", json=sample_request_data)
        assert response.status_code == status.HTTP_200_OK
        data = response.json()
        
        assert "usage" in data
        usage = data["usage"]
        assert "input_tokens" in usage
        assert "output_tokens" in usage
        assert "total_tokens" in usage
        # reasoning_tokens may not always be present
        # assert "reasoning_tokens" in usage
        
        # Basic validation
        assert usage["input_tokens"] >= 0
        assert usage["output_tokens"] >= 0
        assert usage["total_tokens"] == usage["input_tokens"] + usage["output_tokens"]
    
    def test_usage_increases_with_longer_input(self, api_client, sample_request_data):
        # Short input
        response1 = api_client.post("/v1/responses", json=sample_request_data)
        usage1 = response1.json()["usage"]
        
        # Longer input
        sample_request_data["input"] = sample_request_data["input"] * 10
        response2 = api_client.post("/v1/responses", json=sample_request_data)
        usage2 = response2.json()["usage"]
        
        # Longer input should use more tokens
        assert usage2["input_tokens"] > usage1["input_tokens"]

================================================
FILE: tests/test_responses_api.py
================================================
import time

import pytest
from fastapi.testclient import TestClient
from openai_harmony import (
    HarmonyEncodingName,
    load_harmony_encoding,
)

from gpt_oss.responses_api.api_server import create_api_server

encoding = load_harmony_encoding(HarmonyEncodingName.HARMONY_GPT_OSS)

fake_tokens = encoding.encode(
    "<|channel|>final<|message|>Hey there<|return|>", allowed_special="all"
)

token_queue = fake_tokens.copy()


def stub_infer_next_token(
    tokens: list[int], temperature: float = 0.0, new_request: bool = False
) -> int:
    global token_queue
    next_tok = token_queue.pop(0)
    if len(token_queue) == 0:
        token_queue = fake_tokens.copy()
    time.sleep(0.1)
    return next_tok


@pytest.fixture
def test_client():
    return TestClient(
        create_api_server(infer_next_token=stub_infer_next_token, encoding=encoding)
    )


def test_health_check(test_client):
    response = test_client.post(
        "/v1/responses",
        json={
            "model": "gpt-oss-120b",
            "input": "Hello, world!",
        },
    )
    print(response.json())
    assert response.status_code == 200


================================================
FILE: tests-data/basic-event-stream.txt
================================================
event: response.created
data: {"type":"response.created","sequence_number":0,"response":{"id":"resp_687937d6852c819199d18805b160d13e0d28eb600b6e01a0","object":"response","created_at":1752774614,"status":"in_progress","background":false,"error":null,"incomplete_details":null,"instructions":"You are a helpful assistant.","max_output_tokens":null,"max_tool_calls":null,"model":"o4-mini-2025-04-16","output":[],"parallel_tool_calls":true,"previous_response_id":null,"reasoning":{"effort":"low","summary":"detailed"},"service_tier":"auto","store":true,"temperature":1.0,"text":{"format":{"type":"text"}},"tool_choice":"auto","tools":[],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":null,"user":null,"metadata":{}}}

event: response.in_progress
data: {"type":"response.in_progress","sequence_number":1,"response":{"id":"resp_687937d6852c819199d18805b160d13e0d28eb600b6e01a0","object":"response","created_at":1752774614,"status":"in_progress","background":false,"error":null,"incomplete_details":null,"instructions":"You are a helpful assistant.","max_output_tokens":null,"max_tool_calls":null,"model":"o4-mini-2025-04-16","output":[],"parallel_tool_calls":true,"previous_response_id":null,"reasoning":{"effort":"low","summary":"detailed"},"service_tier":"auto","store":true,"temperature":1.0,"text":{"format":{"type":"text"}},"tool_choice":"auto","tools":[],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":null,"user":null,"metadata":{}}}

event: response.output_item.added
data: {"type":"response.output_item.added","sequence_number":2,"output_index":0,"item":{"id":"rs_687937d6ed748191b23a96ac7b1b9bb60d28eb600b6e01a0","type":"reasoning","summary":[]}}

event: response.output_item.done
data: {"type":"response.output_item.done","sequence_number":3,"output_index":0,"item":{"id":"rs_687937d6ed748191b23a96ac7b1b9bb60d28eb600b6e01a0","type":"reasoning","summary":[]}}

event: response.output_item.added
data: {"type":"response.output_item.added","sequence_number":4,"output_index":1,"item":{"id":"msg_687937d95cc08191aa918aa59c886a270d28eb600b6e01a0","type":"message","status":"in_progress","content":[],"role":"assistant"}}

event: response.content_part.added
data: {"type":"response.content_part.added","sequence_number":5,"item_id":"msg_687937d95cc08191aa918aa59c886a270d28eb600b6e01a0","output_index":1,"content_index":0,"part":{"type":"output_text","annotations":[],"logprobs":[],"text":""}}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":6,"item_id":"msg_687937d95cc08191aa918aa59c886a270d28eb600b6e01a0","output_index":1,"content_index":0,"delta":"Hello","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":7,"item_id":"msg_687937d95cc08191aa918aa59c886a270d28eb600b6e01a0","output_index":1,"content_index":0,"delta":" there","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":8,"item_id":"msg_687937d95cc08191aa918aa59c886a270d28eb600b6e01a0","output_index":1,"content_index":0,"delta":"!","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":9,"item_id":"msg_687937d95cc08191aa918aa59c886a270d28eb600b6e01a0","output_index":1,"content_index":0,"delta":" How","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":10,"item_id":"msg_687937d95cc08191aa918aa59c886a270d28eb600b6e01a0","output_index":1,"content_index":0,"delta":" can","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":11,"item_id":"msg_687937d95cc08191aa918aa59c886a270d28eb600b6e01a0","output_index":1,"content_index":0,"delta":" I","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":12,"item_id":"msg_687937d95cc08191aa918aa59c886a270d28eb600b6e01a0","output_index":1,"content_index":0,"delta":" assist","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":13,"item_id":"msg_687937d95cc08191aa918aa59c886a270d28eb600b6e01a0","output_index":1,"content_index":0,"delta":" you","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":14,"item_id":"msg_687937d95cc08191aa918aa59c886a270d28eb600b6e01a0","output_index":1,"content_index":0,"delta":" today","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":15,"item_id":"msg_687937d95cc08191aa918aa59c886a270d28eb600b6e01a0","output_index":1,"content_index":0,"delta":"?","logprobs":[]}

event: response.output_text.done
data: {"type":"response.output_text.done","sequence_number":16,"item_id":"msg_687937d95cc08191aa918aa59c886a270d28eb600b6e01a0","output_index":1,"content_index":0,"text":"Hello there! How can I assist you today?","logprobs":[]}

event: response.content_part.done
data: {"type":"response.content_part.done","sequence_number":17,"item_id":"msg_687937d95cc08191aa918aa59c886a270d28eb600b6e01a0","output_index":1,"content_index":0,"part":{"type":"output_text","annotations":[],"logprobs":[],"text":"Hello there! How can I assist you today?"}}

event: response.output_item.done
data: {"type":"response.output_item.done","sequence_number":18,"output_index":1,"item":{"id":"msg_687937d95cc08191aa918aa59c886a270d28eb600b6e01a0","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"logprobs":[],"text":"Hello there! How can I assist you today?"}],"role":"assistant"}}

event: response.completed
data: {"type":"response.completed","sequence_number":19,"response":{"id":"resp_687937d6852c819199d18805b160d13e0d28eb600b6e01a0","object":"response","created_at":1752774614,"status":"completed","background":false,"error":null,"incomplete_details":null,"instructions":"You are a helpful assistant.","max_output_tokens":null,"max_tool_calls":null,"model":"o4-mini-2025-04-16","output":[{"id":"rs_687937d6ed748191b23a96ac7b1b9bb60d28eb600b6e01a0","type":"reasoning","summary":[]},{"id":"msg_687937d95cc08191aa918aa59c886a270d28eb600b6e01a0","type":"message","status":"completed","content":[{"type":"output_text","annotations":[],"logprobs":[],"text":"Hello there! How can I assist you today?"}],"role":"assistant"}],"parallel_tool_calls":true,"previous_response_id":null,"reasoning":{"effort":"low","summary":"detailed"},"service_tier":"default","store":true,"temperature":1.0,"text":{"format":{"type":"text"}},"tool_choice":"auto","tools":[],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":{"input_tokens":18,"input_tokens_details":{"cached_tokens":0},"output_tokens":16,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":34},"user":null,"metadata":{}}}


================================================
FILE: tests-data/web-search-event-stream.txt
================================================
event: response.created
data: {"type":"response.created","sequence_number":0,"response":{"id":"resp_688867b6fb90819e92212445bb8289840b8311511b435264","object":"response","created_at":1753769911,"status":"in_progress","background":false,"error":null,"incomplete_details":null,"instructions":"You are a helpful assistant.","max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4.1-2025-04-14","output":[],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"auto","store":true,"temperature":1.0,"text":{"format":{"type":"text"}},"tool_choice":"auto","tools":[{"type":"web_search_preview","search_context_size":"medium","user_location":{"type":"approximate","city":null,"country":"US","region":null,"timezone":null}}],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":null,"user":null,"metadata":{}}}

event: response.in_progress
data: {"type":"response.in_progress","sequence_number":1,"response":{"id":"resp_688867b6fb90819e92212445bb8289840b8311511b435264","object":"response","created_at":1753769911,"status":"in_progress","background":false,"error":null,"incomplete_details":null,"instructions":"You are a helpful assistant.","max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4.1-2025-04-14","output":[],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"auto","store":true,"temperature":1.0,"text":{"format":{"type":"text"}},"tool_choice":"auto","tools":[{"type":"web_search_preview","search_context_size":"medium","user_location":{"type":"approximate","city":null,"country":"US","region":null,"timezone":null}}],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":null,"user":null,"metadata":{}}}

event: response.output_item.added
data: {"type":"response.output_item.added","sequence_number":2,"output_index":0,"item":{"id":"ws_688867b77b7c819ebd9791fd981b6b560b8311511b435264","type":"web_search_call","status":"in_progress","action":{"type":"search"}}}

event: response.web_search_call.in_progress
data: {"type":"response.web_search_call.in_progress","sequence_number":3,"output_index":0,"item_id":"ws_688867b77b7c819ebd9791fd981b6b560b8311511b435264"}

event: response.web_search_call.searching
data: {"type":"response.web_search_call.searching","sequence_number":4,"output_index":0,"item_id":"ws_688867b77b7c819ebd9791fd981b6b560b8311511b435264"}

event: response.web_search_call.completed
data: {"type":"response.web_search_call.completed","sequence_number":5,"output_index":0,"item_id":"ws_688867b77b7c819ebd9791fd981b6b560b8311511b435264"}

event: response.output_item.done
data: {"type":"response.output_item.done","sequence_number":6,"output_index":0,"item":{"id":"ws_688867b77b7c819ebd9791fd981b6b560b8311511b435264","type":"web_search_call","status":"completed","action":{"type":"search","query":"positive news stories today"}}}

event: response.output_item.added
data: {"type":"response.output_item.added","sequence_number":7,"output_index":1,"item":{"id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","type":"message","status":"in_progress","content":[],"role":"assistant"}}

event: response.content_part.added
data: {"type":"response.content_part.added","sequence_number":8,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"part":{"type":"output_text","annotations":[],"logprobs":[],"text":""}}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":9,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"As","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":10,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" of","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":11,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" July","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":12,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" ","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":13,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"29","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":14,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":",","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":15,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" ","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":16,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"202","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":17,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"5","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":18,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":",","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":19,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" one","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":20,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" uplifting","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":21,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" news","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":22,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" story","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":23,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" is","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":24,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" the","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":25,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" re","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":26,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"int","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":27,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"roduction","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":28,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" of","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":29,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" giant","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":30,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" river","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":31,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" ot","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":32,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"ters","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":33,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" to","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":34,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" Argentina","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":35,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"'s","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":36,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" Iber","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":37,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"á","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":38,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" wetlands","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":39,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":".","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":40,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" ","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":41,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"After","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":42,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" an","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":43,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" absence","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":44,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" of","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":45,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" over","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":46,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" ","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":47,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"40","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":48,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" years","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":49,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" due","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":50,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" to","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":51,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" habitat","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":52,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" loss","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":53,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" and","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":54,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" illegal","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":55,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" hunting","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":56,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":",","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":57,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" a","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":58,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" family","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":59,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" of","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":60,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" four","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":61,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" ot","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":62,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"ters","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":63,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":",","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":64,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" including","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":65,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" two","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":66,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" pups","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":67,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" born","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":68,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" in","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":69,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" captivity","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":70,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":",","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":71,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" has","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":72,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" been","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":73,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" released","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":74,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" into","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":75,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" their","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":76,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" original","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":77,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" habitat","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":78,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":".","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":79,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" ","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":80,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"This","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":81,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" marks","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":82,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" a","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":83,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" significant","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":84,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" step","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":85,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" in","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":86,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" conservation","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":87,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" efforts","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":88,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" to","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":89,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" restore","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":90,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" the","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":91,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" species","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":92,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" in","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":93,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" the","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":94,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" region","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":95,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":".","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":96,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" ","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":97,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"([conservationoptimism.org](https://conservationoptimism.org/7-stories-of-optimism-this-week-08-07-25-14-07-25/?utm_source=openai))","logprobs":[]}

event: response.output_text.annotation.added
data: {"type":"response.output_text.annotation.added","sequence_number":98,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"annotation_index":0,"annotation":{"type":"url_citation","end_index":529,"start_index":398,"title":"7 stories of optimism this week (08.07.25-14.07.25) - Conservation Optimism","url":"https://conservationoptimism.org/7-stories-of-optimism-this-week-08-07-25-14-07-25/?utm_source=openai"}}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":99,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"\n\nAdditionally","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":100,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":",","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":101,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" the","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":102,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" River","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":103,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" Seine","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":104,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" in","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":105,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" Paris","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":106,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" has","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":107,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" reopened","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":108,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" to","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":109,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" swimmers","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":110,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" for","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":111,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" the","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":112,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" first","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":113,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" time","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":114,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" since","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":115,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" ","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":116,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"192","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":117,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"3","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":118,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":".","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":119,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" ","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":120,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"Following","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":121,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" a","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":122,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" $","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":123,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"1","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":124,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":".","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":125,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"6","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":126,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" billion","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":127,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" cleanup","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":128,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":",","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":129,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" three","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":130,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" designated","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":131,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" areas","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":132,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" near","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":133,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" landmarks","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":134,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" like","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":135,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" the","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":136,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" Eiffel","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":137,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" Tower","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":138,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" and","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":139,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" Notre","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":140,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" Dame","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":141,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" Cathedral","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":142,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" now","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":143,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" allow","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":144,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" public","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":145,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" swimming","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":146,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":",","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":147,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" providing","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":148,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" Par","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":149,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"isi","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":150,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"ans","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":151,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" and","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":152,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" visitors","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":153,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" a","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":154,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" unique","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":155,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" way","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":156,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" to","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":157,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" enjoy","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":158,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" the","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":159,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" city","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":160,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":".","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":161,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" ","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":162,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"([onlygoodnewsdaily.com](https://www.onlygoodnewsdaily.com/post/just-good-news-7-july-2025?utm_source=openai))","logprobs":[]}

event: response.output_text.annotation.added
data: {"type":"response.output_text.annotation.added","sequence_number":163,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"annotation_index":1,"annotation":{"type":"url_citation","end_index":947,"start_index":837,"title":"Today's Good News | OGN Daily","url":"https://www.onlygoodnewsdaily.com/post/just-good-news-7-july-2025?utm_source=openai"}}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":164,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":"\n\nThese","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":165,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" stories","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":166,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" highlight","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":167,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" ongoing","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":168,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" global","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":169,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" efforts","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":170,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" to","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":171,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" restore","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":172,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" natural","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":173,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" habitats","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":174,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" and","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":175,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" enhance","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":176,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" urban","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":177,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" environments","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":178,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" for","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":179,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" public","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":180,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" enjoyment","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":181,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":".","logprobs":[]}

event: response.output_text.delta
data: {"type":"response.output_text.delta","sequence_number":182,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"delta":" ","logprobs":[]}

event: response.output_text.done
data: {"type":"response.output_text.done","sequence_number":183,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"text":"As of July 29, 2025, one uplifting news story is the reintroduction of giant river otters to Argentina's Iberá wetlands. After an absence of over 40 years due to habitat loss and illegal hunting, a family of four otters, including two pups born in captivity, has been released into their original habitat. This marks a significant step in conservation efforts to restore the species in the region. ([conservationoptimism.org](https://conservationoptimism.org/7-stories-of-optimism-this-week-08-07-25-14-07-25/?utm_source=openai))\n\nAdditionally, the River Seine in Paris has reopened to swimmers for the first time since 1923. Following a $1.6 billion cleanup, three designated areas near landmarks like the Eiffel Tower and Notre Dame Cathedral now allow public swimming, providing Parisians and visitors a unique way to enjoy the city. ([onlygoodnewsdaily.com](https://www.onlygoodnewsdaily.com/post/just-good-news-7-july-2025?utm_source=openai))\n\nThese stories highlight ongoing global efforts to restore natural habitats and enhance urban environments for public enjoyment. ","logprobs":[]}

event: response.content_part.done
data: {"type":"response.content_part.done","sequence_number":184,"item_id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","output_index":1,"content_index":0,"part":{"type":"output_text","annotations":[{"type":"url_citation","end_index":529,"start_index":398,"title":"7 stories of optimism this week (08.07.25-14.07.25) - Conservation Optimism","url":"https://conservationoptimism.org/7-stories-of-optimism-this-week-08-07-25-14-07-25/?utm_source=openai"},{"type":"url_citation","end_index":947,"start_index":837,"title":"Today's Good News | OGN Daily","url":"https://www.onlygoodnewsdaily.com/post/just-good-news-7-july-2025?utm_source=openai"}],"logprobs":[],"text":"As of July 29, 2025, one uplifting news story is the reintroduction of giant river otters to Argentina's Iberá wetlands. After an absence of over 40 years due to habitat loss and illegal hunting, a family of four otters, including two pups born in captivity, has been released into their original habitat. This marks a significant step in conservation efforts to restore the species in the region. ([conservationoptimism.org](https://conservationoptimism.org/7-stories-of-optimism-this-week-08-07-25-14-07-25/?utm_source=openai))\n\nAdditionally, the River Seine in Paris has reopened to swimmers for the first time since 1923. Following a $1.6 billion cleanup, three designated areas near landmarks like the Eiffel Tower and Notre Dame Cathedral now allow public swimming, providing Parisians and visitors a unique way to enjoy the city. ([onlygoodnewsdaily.com](https://www.onlygoodnewsdaily.com/post/just-good-news-7-july-2025?utm_source=openai))\n\nThese stories highlight ongoing global efforts to restore natural habitats and enhance urban environments for public enjoyment. "}}

event: response.output_item.done
data: {"type":"response.output_item.done","sequence_number":185,"output_index":1,"item":{"id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","type":"message","status":"completed","content":[{"type":"output_text","annotations":[{"type":"url_citation","end_index":529,"start_index":398,"title":"7 stories of optimism this week (08.07.25-14.07.25) - Conservation Optimism","url":"https://conservationoptimism.org/7-stories-of-optimism-this-week-08-07-25-14-07-25/?utm_source=openai"},{"type":"url_citation","end_index":947,"start_index":837,"title":"Today's Good News | OGN Daily","url":"https://www.onlygoodnewsdaily.com/post/just-good-news-7-july-2025?utm_source=openai"}],"logprobs":[],"text":"As of July 29, 2025, one uplifting news story is the reintroduction of giant river otters to Argentina's Iberá wetlands. After an absence of over 40 years due to habitat loss and illegal hunting, a family of four otters, including two pups born in captivity, has been released into their original habitat. This marks a significant step in conservation efforts to restore the species in the region. ([conservationoptimism.org](https://conservationoptimism.org/7-stories-of-optimism-this-week-08-07-25-14-07-25/?utm_source=openai))\n\nAdditionally, the River Seine in Paris has reopened to swimmers for the first time since 1923. Following a $1.6 billion cleanup, three designated areas near landmarks like the Eiffel Tower and Notre Dame Cathedral now allow public swimming, providing Parisians and visitors a unique way to enjoy the city. ([onlygoodnewsdaily.com](https://www.onlygoodnewsdaily.com/post/just-good-news-7-july-2025?utm_source=openai))\n\nThese stories highlight ongoing global efforts to restore natural habitats and enhance urban environments for public enjoyment. "}],"role":"assistant"}}

event: response.completed
data: {"type":"response.completed","sequence_number":186,"response":{"id":"resp_688867b6fb90819e92212445bb8289840b8311511b435264","object":"response","created_at":1753769911,"status":"completed","background":false,"error":null,"incomplete_details":null,"instructions":"You are a helpful assistant.","max_output_tokens":null,"max_tool_calls":null,"model":"gpt-4.1-2025-04-14","output":[{"id":"ws_688867b77b7c819ebd9791fd981b6b560b8311511b435264","type":"web_search_call","status":"completed","action":{"type":"search","query":"positive news stories today"}},{"id":"msg_688867b99c54819e8db837fcf08da9040b8311511b435264","type":"message","status":"completed","content":[{"type":"output_text","annotations":[{"type":"url_citation","end_index":529,"start_index":398,"title":"7 stories of optimism this week (08.07.25-14.07.25) - Conservation Optimism","url":"https://conservationoptimism.org/7-stories-of-optimism-this-week-08-07-25-14-07-25/?utm_source=openai"},{"type":"url_citation","end_index":947,"start_index":837,"title":"Today's Good News | OGN Daily","url":"https://www.onlygoodnewsdaily.com/post/just-good-news-7-july-2025?utm_source=openai"}],"logprobs":[],"text":"As of July 29, 2025, one uplifting news story is the reintroduction of giant river otters to Argentina's Iberá wetlands. After an absence of over 40 years due to habitat loss and illegal hunting, a family of four otters, including two pups born in captivity, has been released into their original habitat. This marks a significant step in conservation efforts to restore the species in the region. ([conservationoptimism.org](https://conservationoptimism.org/7-stories-of-optimism-this-week-08-07-25-14-07-25/?utm_source=openai))\n\nAdditionally, the River Seine in Paris has reopened to swimmers for the first time since 1923. Following a $1.6 billion cleanup, three designated areas near landmarks like the Eiffel Tower and Notre Dame Cathedral now allow public swimming, providing Parisians and visitors a unique way to enjoy the city. ([onlygoodnewsdaily.com](https://www.onlygoodnewsdaily.com/post/just-good-news-7-july-2025?utm_source=openai))\n\nThese stories highlight ongoing global efforts to restore natural habitats and enhance urban environments for public enjoyment. "}],"role":"assistant"}],"parallel_tool_calls":true,"previous_response_id":null,"prompt_cache_key":null,"reasoning":{"effort":null,"summary":null},"safety_identifier":null,"service_tier":"default","store":true,"temperature":1.0,"text":{"format":{"type":"text"}},"tool_choice":"auto","tools":[{"type":"web_search_preview","search_context_size":"medium","user_location":{"type":"approximate","city":null,"country":"US","region":null,"timezone":null}}],"top_logprobs":0,"top_p":1.0,"truncation":"disabled","usage":{"input_tokens":320,"input_tokens_details":{"cached_tokens":0},"output_tokens":256,"output_tokens_details":{"reasoning_tokens":0},"total_tokens":576},"user":null,"metadata":{}}}