Repository: OpenPipe/ART Branch: main Commit: 621e82b2d38e Files: 341 Total size: 25.0 MB Directory structure: gitextract_7jngjjdm/ ├── .agents/ │ └── skills/ │ ├── fix-art-issues/ │ │ └── SKILL.md │ ├── train-rl/ │ │ └── SKILL.md │ └── train-sft/ │ └── SKILL.md ├── .dockerignore ├── .github/ │ └── workflows/ │ ├── create-draft-release.yml │ ├── package-install.yml │ ├── prek.yml │ └── release.yml ├── .gitignore ├── .pre-commit-config.yaml ├── .python-version ├── .skyignore ├── AGENT.md ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── THIRD-PARTY-NOTICES ├── dev/ │ ├── demo_logging.py │ ├── math-vista/ │ │ ├── math-vista.ipynb │ │ └── math-vista.py │ ├── new_models/ │ │ ├── benchmark_inference.py │ │ ├── gemma3.py │ │ ├── prompts.json │ │ ├── qwen3_try.ipynb │ │ └── qwen3_try.py │ ├── profile.ipynb │ ├── run_yes_no_maybe_kl_advantage.py │ ├── sft/ │ │ ├── dataset.jsonl │ │ ├── distillation.py │ │ ├── sft-from-file.py │ │ └── sft-warmup.py │ ├── yes-no-maybe-kl-advantage.py │ ├── yes-no-maybe-megatron.py │ ├── yes-no-maybe-metrics.py │ ├── yes-no-maybe-vision/ │ │ ├── generate_images.py │ │ └── train.ipynb │ ├── yes-no-maybe.ipynb │ └── yes-no-maybe.py ├── docs/ │ ├── .gitignore │ ├── README.md │ ├── analytics.js │ ├── docs.json │ ├── experimental/ │ │ └── gspo.mdx │ ├── features/ │ │ ├── additional-histories.mdx │ │ ├── checkpoint-deletion.mdx │ │ ├── checkpoint-forking.mdx │ │ ├── mcp-rl.mdx │ │ └── tracking-metrics.mdx │ ├── fundamentals/ │ │ ├── art-backend.mdx │ │ ├── art-client.mdx │ │ ├── ruler.mdx │ │ ├── sft-training.mdx │ │ └── training-loop.mdx │ ├── getting-started/ │ │ ├── about.mdx │ │ ├── faq.mdx │ │ ├── installation-setup.mdx │ │ ├── notebooks.mdx │ │ └── quick-start.mdx │ ├── integrations/ │ │ ├── langgraph-integration.mdx │ │ └── openenv-integration.mdx │ ├── package.json │ ├── proposals/ │ │ └── backend-first-training-api.md │ ├── resources/ │ │ ├── glossary.mdx │ │ └── models.mdx │ ├── style.css │ └── tutorials/ │ ├── open-deep-research.mdx │ 
└── summarizer.mdx ├── examples/ │ ├── 2048/ │ │ ├── display_benchmarks.ipynb │ │ ├── generate_benchmarks.py │ │ ├── rollout.py │ │ ├── train.py │ │ └── utils.py │ ├── benchmarking_comparison_models.py │ ├── codenames/ │ │ ├── codenames_words.json │ │ └── dictionary.json │ ├── data/ │ │ ├── greentext/ │ │ │ └── prompts.txt │ │ └── temporal-clue/ │ │ └── puzzles.json │ ├── hn_title_generator/ │ │ ├── reference_grpo_trainer.py │ │ ├── train.py │ │ └── utils.py │ ├── just-the-facts/ │ │ ├── .gitignore │ │ ├── README.md │ │ ├── just_the_facts/ │ │ │ ├── __init__.py │ │ │ ├── checks.py │ │ │ ├── display_benchmarks.ipynb │ │ │ ├── experiments.py │ │ │ ├── find_articles.py │ │ │ ├── generate_benchmarks.py │ │ │ ├── rollout.py │ │ │ ├── scenarios.py │ │ │ ├── train.py │ │ │ └── utils.py │ │ ├── main.py │ │ ├── pyproject.toml │ │ └── test_scraper.py │ ├── mcp-rl/ │ │ ├── README.md │ │ ├── all_experiments.py │ │ ├── mcp_rl/ │ │ │ ├── __init__.py │ │ │ ├── benchmarks/ │ │ │ │ ├── display_benchmarks/ │ │ │ │ │ ├── mcp_alphavantage.ipynb │ │ │ │ │ └── mcp_balldontlie.ipynb │ │ │ │ └── generate_benchmarks.py │ │ │ ├── checks.py │ │ │ ├── mcp_server.py │ │ │ ├── rollout.py │ │ │ ├── scenario_generator.py │ │ │ ├── train.py │ │ │ └── utils.py │ │ ├── pyproject.toml │ │ ├── servers/ │ │ │ └── python/ │ │ │ ├── mcp_alphavantage/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── scenarios/ │ │ │ │ │ ├── train.jsonl │ │ │ │ │ └── val.jsonl │ │ │ │ ├── scenarios.jsonl │ │ │ │ ├── server.py │ │ │ │ └── server_params.py │ │ │ ├── mcp_balldontlie/ │ │ │ │ ├── README.md │ │ │ │ ├── __init__.py │ │ │ │ ├── scenarios/ │ │ │ │ │ ├── train.jsonl │ │ │ │ │ └── val.jsonl │ │ │ │ ├── server.py │ │ │ │ └── server_params.py │ │ │ └── mcp_googlemaps/ │ │ │ ├── README.md │ │ │ ├── __init__.py │ │ │ ├── pyproject.toml │ │ │ ├── server.py │ │ │ └── server_params.py │ │ └── test_scenario_generation.py │ ├── openenv_echo.py │ ├── prisoners-dilemma.ipynb │ ├── rock-paper-tool-use.ipynb │ ├── 
roflbot/ │ │ └── .gitignore │ ├── temporal_clue/ │ │ ├── temporal-clue-7b-async.ipynb │ │ ├── temporal-clue-7b.ipynb │ │ └── temporal-clue.py │ ├── tic_tac_toe/ │ │ ├── display-benchmarks.ipynb │ │ ├── game_utils.py │ │ ├── rollout.py │ │ └── tic-tac-toe.py │ └── tic_tac_toe_self_play/ │ ├── deploy_step.py │ ├── game_utils.py │ ├── gather_trajectory_groups_by_index.py │ ├── rollout.py │ ├── train.py │ └── train_o4_mini.py ├── licenses/ │ ├── GPL-3.0.txt │ └── LGPL-3.0.txt ├── pyproject.toml ├── requirements/ │ └── backend.vcs.txt ├── scripts/ │ ├── bump_version.py │ ├── ci/ │ │ ├── build_and_push_uv_cache.sh │ │ └── compute_uv_fingerprint.py │ ├── deploy-model.py │ ├── kill-gpu-processes.sh │ ├── migrate-s3-checkpoints.py │ ├── publish.sh │ └── setup.sh ├── skypilot-config.yaml ├── src/ │ ├── art/ │ │ ├── __init__.py │ │ ├── api_costs.py │ │ ├── auto_trajectory.py │ │ ├── backend.py │ │ ├── batches.py │ │ ├── cli.py │ │ ├── costs.py │ │ ├── dev/ │ │ │ ├── __init__.py │ │ │ ├── engine.py │ │ │ ├── get_model_config.py │ │ │ ├── model.py │ │ │ ├── openai_server.py │ │ │ ├── train.py │ │ │ └── validate.py │ │ ├── errors.py │ │ ├── gather.py │ │ ├── guided_completion.py │ │ ├── langgraph/ │ │ │ ├── __init__.py │ │ │ ├── llm_wrapper.py │ │ │ ├── logging.py │ │ │ └── message_utils.py │ │ ├── local/ │ │ │ ├── __init__.py │ │ │ ├── backend.py │ │ │ ├── checkpoints.py │ │ │ └── service.py │ │ ├── loss.py │ │ ├── mcp/ │ │ │ ├── __init__.py │ │ │ ├── default_tools.py │ │ │ ├── generate_scenarios.py │ │ │ └── types.py │ │ ├── megatron/ │ │ │ ├── __init__.py │ │ │ ├── backend.py │ │ │ ├── flex_attention.py │ │ │ ├── lora.py │ │ │ ├── offload.py │ │ │ ├── provider.py │ │ │ ├── service.py │ │ │ ├── setup.sh │ │ │ └── train.py │ │ ├── metrics.py │ │ ├── metrics_taxonomy.py │ │ ├── model.py │ │ ├── openai.py │ │ ├── pipeline_trainer/ │ │ │ ├── __init__.py │ │ │ ├── binary_prefix_tool_pipeline.py │ │ │ ├── state.py │ │ │ ├── status.py │ │ │ ├── trainer.py │ │ │ ├── types.py │ │ │ └── 
yes_no_maybe_pipeline.py │ │ ├── preprocessing/ │ │ │ ├── __init__.py │ │ │ ├── inputs.py │ │ │ ├── pack.py │ │ │ └── tokenize.py │ │ ├── py.typed │ │ ├── rewards/ │ │ │ ├── __init__.py │ │ │ └── ruler.py │ │ ├── serverless/ │ │ │ ├── __init__.py │ │ │ ├── backend.py │ │ │ └── client.py │ │ ├── test/ │ │ │ ├── test_kl_advantage.py │ │ │ └── test_step_skipping.py │ │ ├── tinker/ │ │ │ ├── __init__.py │ │ │ ├── backend.py │ │ │ ├── cookbook_v/ │ │ │ │ ├── __init__.py │ │ │ │ ├── hyperparam_utils.py │ │ │ │ ├── image_processing_utils.py │ │ │ │ ├── renderers/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── base.py │ │ │ │ │ ├── deepseek_v3.py │ │ │ │ │ ├── gpt_oss.py │ │ │ │ │ ├── kimi_k2.py │ │ │ │ │ ├── kimi_k25.py │ │ │ │ │ ├── kimi_k2_5_tool_declaration_ts.py │ │ │ │ │ ├── llama3.py │ │ │ │ │ ├── qwen3.py │ │ │ │ │ ├── qwen3_5.py │ │ │ │ │ └── role_colon.py │ │ │ │ ├── tokenizer_utils.py │ │ │ │ └── utils/ │ │ │ │ ├── __init__.py │ │ │ │ └── misc_utils.py │ │ │ ├── prefix_cache.py │ │ │ ├── renderers.py │ │ │ ├── server.py │ │ │ └── service.py │ │ ├── tinker_native/ │ │ │ ├── __init__.py │ │ │ ├── backend.py │ │ │ └── data.py │ │ ├── trajectories.py │ │ ├── transformers/ │ │ │ ├── __init__.py │ │ │ └── patches.py │ │ ├── types.py │ │ ├── unsloth/ │ │ │ ├── __init__.py │ │ │ ├── service.py │ │ │ └── train.py │ │ ├── utils/ │ │ │ ├── __init__.py │ │ │ ├── benchmark_rollout.py │ │ │ ├── benchmarking/ │ │ │ │ ├── aggregate_trajectories.py │ │ │ │ ├── charts/ │ │ │ │ │ ├── __init__.py │ │ │ │ │ ├── percentage_comparison_bar_chart.py │ │ │ │ │ └── training_progress_chart.py │ │ │ │ ├── filter_model_split.py │ │ │ │ ├── load_trajectories.py │ │ │ │ ├── log_constant_metrics_wandb.py │ │ │ │ ├── pull_model_trajectories.py │ │ │ │ └── types.py │ │ │ ├── convert_moe_lora.py │ │ │ ├── deploy_model.py │ │ │ ├── deployment/ │ │ │ │ ├── __init__.py │ │ │ │ ├── common.py │ │ │ │ ├── legacy.py │ │ │ │ ├── together.py │ │ │ │ └── wandb.py │ │ │ ├── format_message.py │ │ │ ├── 
get_model_step.py │ │ │ ├── get_repo_root_path.py │ │ │ ├── group_aggregate.py │ │ │ ├── iterate_dataset.py │ │ │ ├── limit_concurrency.py │ │ │ ├── litellm.py │ │ │ ├── log_http_errors.py │ │ │ ├── logging.py │ │ │ ├── model_config.py │ │ │ ├── old_benchmarking/ │ │ │ │ ├── calculate_step_metrics.py │ │ │ │ ├── display_image_grid.py │ │ │ │ ├── generate_comparison_table.py │ │ │ │ ├── generate_line_graphs.py │ │ │ │ ├── load_benchmarked_models.py │ │ │ │ └── types.py │ │ │ ├── output_dirs.py │ │ │ ├── record_provenance.py │ │ │ ├── retry.py │ │ │ ├── s3.py │ │ │ ├── s3_checkpoint_utils.py │ │ │ ├── sft.py │ │ │ ├── strip_logprobs.py │ │ │ ├── suppress_litellm_serialization_warnings.py │ │ │ ├── trajectory_logging.py │ │ │ └── trajectory_migration.py │ │ ├── vllm/ │ │ │ ├── __init__.py │ │ │ ├── dedicated_server.py │ │ │ ├── engine.py │ │ │ ├── patches.py │ │ │ └── server.py │ │ └── yield_trajectory.py │ └── mp_actors/ │ ├── __init__.py │ ├── move.py │ └── traceback.py └── tests/ ├── integration/ │ ├── __init__.py │ ├── test_live_api_cost.py │ ├── test_multi_checkpoint_training.py │ ├── test_pipeline_localbackend_dedicated.py │ ├── test_provenance.py │ ├── test_push_and_fork.py │ ├── test_tinker_native_backend.py │ └── test_vllm_contract.py ├── integration.py ├── test_backend_train_api.py └── unit/ ├── test_auto_trajectory.py ├── test_benchmarking_loader.py ├── test_dedicated_config.py ├── test_dedicated_server.py ├── test_frontend_logging.py ├── test_metric_routing.py ├── test_metrics_builder.py ├── test_metrics_taxonomy.py ├── test_model_openai_client_costs.py ├── test_mp_actors.py ├── test_multi_checkpoint_inference.py ├── test_pipeline_trainer_batching.py ├── test_pipeline_trainer_local_backend.py ├── test_pipeline_trainer_metrics.py ├── test_prefix_cache.py ├── test_ruler_metrics.py ├── test_sft.py ├── test_strip_logprobs.py ├── test_tinker_renderers.py ├── test_tokenize_trajectory_groups.ipynb ├── test_track_api_cost.py ├── test_trajectory_copy.py ├── 
test_trajectory_parquet.py ├── test_unsloth_metrics.py ├── test_vllm_patches_contract.py └── test_yield_trajectory.py ================================================ FILE CONTENTS ================================================ ================================================ FILE: .agents/skills/fix-art-issues/SKILL.md ================================================ --- name: fix-art-issues description: > Fix a GitHub issue on OpenPipe/ART and open a PR. Use when the user asks to fix, solve, or work on an ART issue, or references a GitHub issue URL containing "OpenPipe/ART". Triggers: "fix ART issue", "solve this issue" with an OpenPipe/ART URL, "work on ART #N". --- # Fix ART Issue Fix a GitHub issue on `OpenPipe/ART` and open a PR. - **Repo**: `OpenPipe/ART` - **Base branch**: `main` Assumes the workspace is already set up with the correct branch checked out and `.env` in place (handled by the system-level `fix-art-workspace` skill). ## Workflow ### 1. Read the Issue ``` gh issue view N --repo OpenPipe/ART --json title,body,labels,assignees,comments ``` (replace `N` with the issue number) ### 2. Explore, Plan, Implement - Use the Explore agent to understand relevant code before making changes. - Plan clearly, implement with minimal focused changes. No over-engineering. ### 3. Commit and Push - Commit with a message that includes `Closes #N`. - Push the feature branch. If HTTPS push fails due to SAML SSO, set SSH remote: `git remote set-url origin git@github.com:OpenPipe/ART.git` ### 4. Open a Draft PR - `gh pr create --base main --draft`. - PR body: `## Summary`, `Closes #N`, `## Changes`, `## Test plan`. ### 5. Testing - **No test artifacts in the final PR**: debug prints, test scripts, and temporary changes must NOT be committed. - Update the PR's test plan section with detailed results. - When testing passes, mark the PR as ready: `gh pr ready`. ## Reference Read `CONTRIBUTING.md` at the repo root for guidance on code quality checks (prek), CI cache refresh, and the release process. 
## Dependency Management Tips - **Pin versions strictly** (`==`) for critical deps like `transformers`, `trl`, `unsloth`, `unsloth-zoo`, `vllm` to avoid surprise breakage from new releases. - **Don't loosen pins without reason**: if a dep was `==X.Y.Z`, keep it pinned unless there's a specific reason to change. Don't use `>=` just because it seems more flexible. - **`uv run` fails on macOS** for backend deps (apex/torch need CUDA). This is expected — use `uvx ruff` for linting locally, test on GPU cluster. ## Deploying a GPU Cluster Name the SkyPilot cluster after the branch name without the `fix/` prefix, replacing `/` with `-` (SkyPilot doesn't allow slashes). For example, if the branch is `fix/short-description`: ``` uv run sky launch -c short-description skypilot-config.yaml -y ``` To connect: `ssh short-description` To tear down when done: `uv run sky down short-description` ## GPU Cluster Testing Tips - **Kill stale GPU processes** before re-running tests: `nvidia-smi --query-compute-apps=pid --format=csv,noheader | xargs -r kill -9`. Previous failed runs leave processes holding GPU memory. - **Set `gpu_memory_utilization`** in test scripts (e.g. `0.7`) — the default `0.9` is too high when Unsloth's training model is also loaded on the same GPU. - **Redirect test output to a log file**: `nohup python test.py > /tmp/output.log 2>&1 &` then `tail -f /tmp/output.log`. SSH background tasks lose output when the connection drops. - **Git on cluster**: SSH keys may not be configured. Use HTTPS with token: `git remote set-url origin https://${GITHUB_TOKEN}@github.com/OpenPipe/ART.git` - **Tear down clusters** when done: `sky down CLUSTER_NAME -y` (replace `CLUSTER_NAME` with the cluster's name, e.g. `short-description`) $ARGUMENTS ================================================ FILE: .agents/skills/train-rl/SKILL.md ================================================ --- name: train-rl description: RL training reference for the ART framework. 
Use when the user asks to create, write, or help with an RL training script, reinforcement learning, GRPO, reward functions, RULER scoring, rollout functions, or anything related to RL fine-tuning. --- # RL Training Wizard You are guiding the user through setting up Reinforcement Learning (RL) training for a language model using the ART framework. Act as an interactive wizard: ask questions, validate inputs, and generate a complete runnable script. **Important**: Ask ONE question at a time. Wait for the user's response before asking the next question. Never bundle multiple questions into a single message. **Adaptability note**: Some steps reference tools like AskUserQuestion, Glob, or Bash. If you don't have access to these tools, simply ask the user the same questions as plain text and skip any steps that require running code (e.g., file search, dataset validation, hyperparameter computation). Do NOT fabricate results — never pretend you ran a tool or searched for files when you didn't. ## Step 1: Single-turn or Multi-turn Ask the user using AskUserQuestion: 1. **Single-turn** — The agent responds to a prompt once. A reward is assigned based on that single response (e.g., solve a math problem, classify text, answer a question). 2. **Multi-turn** — The agent interacts over multiple turns with an environment, tools, or a game. A reward is assigned at the end of all turns (e.g., play a board game, use tools to complete a task, navigate a conversation). ## Step 2: Describe the Task Ask the user to describe what the agent needs to do. Tell them you will help create a draft of the rollout function and environment, but they will likely need to edit it once the script is generated. Gather: - **Task description** — What does the agent need to accomplish? - **Scenarios/inputs** — How are training inputs generated or provided? 
(e.g., a list of problems, a game generator, a dataset of tasks) - **System prompt** (optional) — Any system-level instructions for the agent For **multi-turn** scenarios, also ask: - **Does the agent use tool calling?** — If yes, gather tool names, descriptions, parameter schemas (OpenAI function calling format), and how tool calls are executed (local function, API, MCP server, etc.) - **How does the environment work?** — What observations does the agent receive? What actions can it take? How does a turn work? - **When does an episode end?** — Win/loss conditions, turn limits, max tool calls, etc. (default max turns: 10) Help the user flesh out incomplete descriptions. Offer to write helper functions (game logic, tool execution, scenario generators) as part of the final script. ## Step 3: Reward Method Ask the user using AskUserQuestion: 1. **Programmatic reward** — You have a ground truth or scoring function to compute the reward (e.g., check correctness against an answer, game win/loss, composite score) 2. **RULER (LLM-as-judge)** — An LLM judge scores and compares the trajectories. No manual reward function needed. Requires an OpenAI API key (`OPENAI_API_KEY` env var). If they choose **programmatic reward**, help them design a reward function. Common patterns: - **Binary**: 1 for correct, 0 for incorrect - **Accuracy**: fraction of correct sub-answers (0.0 to 1.0) - **Game outcome**: 1 for win, 0.5 for draw, 0 for loss, -1 for invalid move - **Scaled score**: logarithmic or normalized continuous score - **Composite**: weighted combination of multiple signals The reward must be a float assigned to `trajectory.reward`. Additional signals can go in `trajectory.metrics` for W&B logging. 
**Important: `metrics` values must be numeric (`float`, `int`) or `bool` — strings are not allowed and will cause a Pydantic validation error.** If they choose **RULER**, ask for: - **Judge model**: Recommend `openai/o4-mini` (default) or `openai/o3` for higher quality ## Step 4: Gather Base Parameters Do NOT ask the user to review or confirm their answers after collecting them — just proceed to the next step. - **Base model**: Recommend ONLY these models: - `OpenPipe/Qwen3-14B-Instruct` - `Qwen/Qwen3-30B-A3B-Instruct-2507` - `meta-llama/Llama-3.1-8B-Instruct` - **Project name**: A name for this training project (default: `rl-project`) - **Run name**: A static, descriptive name (e.g., `math-solver-001`, `game-agent-001`). Ask the user for a meaningful name. Do NOT generate random names. ## Step 5: Gather Hyperparameters Present these defaults to the user, then ask using AskUserQuestion: - **Use defaults (Recommended)** — show all values in the description - **Customize** — adjust individual hyperparameters Default values: - **Learning rate**: `1e-5` - **Number of training steps**: `50` - **Rollouts per group**: `8` (number of trajectories per scenario per step; more = better advantage estimation but slower). For RULER, default to `16`. - **Groups per step**: `1` (number of different scenarios per training step) If they choose "Customize", ask which parameters to change. ## Step 6: Generate the Training Script Write a complete, runnable Python script by combining the appropriate **rollout pattern** (from Step 1/2) with the appropriate **reward method** (from Step 3) and the **training loop**. 
Every script MUST: - Call `await backend.close()` at the end so the process doesn't hang - Print post-training info and usage examples (see shared block below) ### Post-training block (append to ALL scripts before `backend.close()`): ```python # --- Training complete --- step = await model.get_step() inference_name = model.get_inference_name() client = model.openai_client() print("\n" + "=" * 60) print("RL TRAINING COMPLETE") print("=" * 60) print(f" Model: {inference_name}") print(f" Base model: ") print(f" Training step: {step}") print(f" Inference URL: {client.base_url}") print("=" * 60) print("\n--- Python usage (openai SDK) ---\n") print(f'''\ from openai import OpenAI client = OpenAI( base_url="{client.base_url}", api_key="not-needed", ) response = client.chat.completions.create( model="{inference_name}", messages=[ {{"role": "user", "content": "Your prompt here"}}, ], ) print(response.choices[0].message.content) ''') print("--- curl usage ---\n") print(f'''\ curl {client.base_url}chat/completions \\ -H "Content-Type: application/json" \\ -d '{{ "model": "{inference_name}", "messages": [ {{"role": "user", "content": "Your prompt here"}} ] }}' ''') await backend.close() ``` ### Rollout pattern: Single-turn ```python async def rollout(model: art.Model, scenario: dict) -> art.Trajectory: client = AsyncOpenAI( base_url=model.inference_base_url, api_key=model.inference_api_key, ) messages = [ # {"role": "system", "content": ""}, {"role": "user", "content": scenario["prompt"]}, ] response = await client.chat.completions.create( model=model.get_inference_name(), messages=messages, temperature=0.7, ) choice = response.choices[0] # --- Compute reward (if programmatic) --- reward = # e.g., 1.0 if correct else 0.0 return art.Trajectory( messages_and_choices=[*messages, choice], reward=reward, metrics={"acc": reward}, ) ``` ### Rollout pattern: Multi-turn (environment/game loop) ```python async def rollout(model: art.Model, scenario) -> art.Trajectory: client = 
AsyncOpenAI( base_url=model.inference_base_url, api_key=model.inference_api_key, ) game = create_game() traj = art.Trajectory( messages_and_choices=[ {"role": "system", "content": ""}, ], reward=0.0, ) while not is_finished(game): traj.messages_and_choices.append( {"role": "user", "content": render_observation(game)} ) response = await client.chat.completions.create( model=model.get_inference_name(), messages=traj.messages(), temperature=0.7, max_completion_tokens=256, ) choice = response.choices[0] traj.messages_and_choices.append(choice) try: apply_action(game, choice.message.content) except ValueError: traj.reward = -1.0 return traj traj.reward = compute_reward(game) return traj ``` ### Rollout pattern: Multi-turn with tool calling ```python async def rollout(model: art.Model, scenario: dict) -> art.Trajectory: client = AsyncOpenAI( base_url=model.inference_base_url, api_key=model.inference_api_key, ) MAX_TURNS = traj = art.Trajectory( messages_and_choices=[ # {"role": "system", "content": ""}, {"role": "user", "content": scenario["task"]}, ], tools=tools, reward=0.0, ) for turn in range(MAX_TURNS): response = await client.chat.completions.create( model=model.get_inference_name(), messages=traj.messages(), tools=tools, temperature=0.7, ) choice = response.choices[0] traj.messages_and_choices.append(choice) if choice.message.tool_calls: for tc in choice.message.tool_calls: args = json.loads(tc.function.arguments) result = execute_tool(tc.function.name, args) traj.messages_and_choices.append({ "role": "tool", "tool_call_id": tc.id, "content": str(result), }) else: break # Agent finished (no more tool calls) # --- Compute reward (if programmatic) --- traj.reward = return traj ``` ### Reward method: RULER addition When using RULER, the rollout function should set `reward=0.0` (RULER fills it in). 
Add this scoring block inside the training loop, after `gather_trajectory_groups` and before `model.train`: ```python from art.rewards import ruler_score_group # Score with RULER (LLM judge assigns relative rewards 0-1) judged_groups = [] for group in finished_groups: judged = await ruler_score_group( group, judge_model=JUDGE_MODEL, debug=True, ) judged_groups.append(judged) finished_groups = judged_groups ``` ### Training loop (shared by all patterns): ```python """RL training script generated by /train-rl wizard.""" import asyncio import json from openai import AsyncOpenAI import art from art.local import LocalBackend # --- Scenarios --- scenarios = [ # Define or load your training scenarios here. ] # --- Rollout function --- # (insert the appropriate rollout pattern here) # --- Training loop --- async def main(): backend = LocalBackend() model = art.TrainableModel( name="", project="", base_model="", _internal_config=art.dev.InternalModelConfig( engine_args={"gpu_memory_utilization": 0.7}, ), ) await model.register(backend) NUM_STEPS = ROLLOUTS_PER_GROUP = GROUPS_PER_STEP = for step in range(await model.get_step(), NUM_STEPS): groups = [ art.TrajectoryGroup( rollout(model, scenarios[ (step * GROUPS_PER_STEP + i) % len(scenarios) ]) for _ in range(ROLLOUTS_PER_GROUP) ) for i in range(GROUPS_PER_STEP) ] finished_groups = await art.gather_trajectory_groups( groups, pbar_desc=f"step {step}" ) # (insert RULER scoring block here if using LLM-as-judge) avg_reward = sum( t.reward for g in finished_groups for t in g.trajectories ) / max(1, sum(len(g.trajectories) for g in finished_groups)) print(f"Step {step}: avg_reward={avg_reward:.3f}") await model.delete_checkpoints() await model.train( finished_groups, config=art.TrainConfig(learning_rate=), ) # ... post-training block + backend.close() ... 
if __name__ == "__main__": asyncio.run(main()) ``` ### Alternative loop: Dataset-driven with iterate_dataset When the user has a fixed list of training scenarios and wants epoch-based iteration, use `iterate_dataset` instead of the manual step loop. This can be combined with any rollout pattern and reward method. ```python from art.utils import iterate_dataset # Replace the manual for-loop with: training_iterator = iterate_dataset( scenarios, groups_per_step=, num_epochs=, initial_step=await model.get_step(), ) for batch in training_iterator: groups = [ art.TrajectoryGroup( rollout(model, item) for _ in range(ROLLOUTS_PER_GROUP) ) for item in batch.items ] finished_groups = await art.gather_trajectory_groups( groups, pbar_desc=f"epoch {batch.epoch} step {batch.step}" ) # (insert RULER scoring block here if using LLM-as-judge) avg_reward = sum( t.reward for g in finished_groups for t in g.trajectories ) / max(1, sum(len(g.trajectories) for g in finished_groups)) print(f"Step {batch.step} (epoch {batch.epoch}): avg_reward={avg_reward:.3f}") await model.delete_checkpoints() await model.train( finished_groups, config=art.TrainConfig(learning_rate=), ) ``` ## Step 7: Write and Offer to Run 1. Write the script to a file (suggest `rl_train.py`) 2. Ask the user if they want to run it now with `uv run python SCRIPT_PATH` (where `SCRIPT_PATH` is the file written in step 1, e.g. `rl_train.py`) 3. If yes, run it **directly using the Bash tool** (do NOT delegate to a Task subagent) so training logs stream live to the user. Use a **2-minute timeout**. If it times out, check progress and decide whether to continue. 4. **GPU memory errors**: If training fails with OOM, lower `gpu_memory_utilization` in the existing `_internal_config` (e.g. from `0.7` to `0.5`). 5. **Stale GPU memory**: If available GPU memory looks too small, previous training runs may still be occupying memory. Before retrying, run `nvidia-smi` to check, and if needed kill leftover processes with `kill PID` (using the process IDs that `nvidia-smi` reports) to free memory. ## Important Notes - LocalBackend requires a GPU. 
- RL uses **GRPO** (Group Relative Policy Optimization) under the hood. It needs multiple trajectories per scenario (a `TrajectoryGroup`) to compute relative advantages. More rollouts per group = better advantage estimation. - **RULER** eliminates the need for manual reward engineering by using an LLM judge to compare trajectories within a group. It requires an OpenAI API key (`OPENAI_API_KEY` env var). - The `@art.retry` decorator can wrap rollout functions to handle transient errors: `@art.retry(exceptions=(openai.LengthFinishReasonError,))`. - **Validation**: To log validation metrics without training, use `await model.log(val_groups)` or `await model.log(val_groups, split="val")`. - **Resuming**: All patterns use `await model.get_step()` as the loop start, so training resumes from the last checkpoint automatically. ================================================ FILE: .agents/skills/train-sft/SKILL.md ================================================ --- name: train-sft description: SFT training reference for the ART framework. Use when the user asks to create, write, or help with an SFT training script, fine-tune a model, train from a JSONL dataset, do distillation, or anything related to supervised fine-tuning. --- # SFT Training Wizard You are guiding the user through setting up Supervised Fine-Tuning (SFT) for a language model using the ART framework. Act as an interactive wizard: ask questions, validate inputs, and generate a complete runnable script. **Important**: Ask ONE question at a time. Wait for the user's response before asking the next question. Never bundle multiple questions into a single message. **Adaptability note**: Some steps reference tools like AskUserQuestion, Glob, or Bash. If you don't have access to these tools, simply ask the user the same questions as plain text and skip any steps that require running code (e.g., file search, dataset validation, hyperparameter computation). 
Do NOT fabricate results — never pretend you ran a tool or searched for files when you didn't. ## Step 1: Determine Training Scenario Ask the user ONE question at a time. Wait for their response before moving to the next question. **Training scenario:** 1. **Train from a JSONL file** — They have a dataset file with chat-formatted examples 2. **Distillation** — They want to train a smaller model using outputs from a larger teacher model ## Step 2: Determine Backend **Backend:** 1. **ServerlessBackend (Recommended)** — Train on remote managed GPUs. No local GPU needed, production-ready inference endpoint. 2. **LocalBackend** — Train on your local GPU. Full control, fast iteration. ## Step 3: Select and Validate Dataset (JSONL scenario) **IMPORTANT**: Do NOT assume a dataset. Do NOT make up or hallucinate file paths. Never pretend you searched for files if you didn't actually run a search tool. If you have access to file system tools (Glob) and can actually execute them, search for `.jsonl` files using Glob (`**/*.jsonl`). Present real results as options. Always include "Provide my own file path" as the last option. Otherwise, ask the user: "What is the path to your JSONL training file?" — nothing more. Once the user has provided a file path, validate it if you can run code using the script below. If you cannot run code, skip validation and move on. 
```python import json, sys ROLES = {"system", "user", "assistant", "developer", "tool", "function"} errors = [] for i, line in enumerate(open(sys.argv[1]), 1): try: r = json.loads(line) msgs = r.get("input", r).get("messages", []) assert isinstance(msgs, list) and msgs, "no messages" for j, m in enumerate(msgs): assert m.get("role") in ROLES, f"messages[{j}]: invalid role {m.get('role')!r}" assert m.get("content") or m.get("function_call") or m.get("tool_calls"), f"messages[{j}]: no content" if "input" not in r: assert msgs[-1]["role"] == "assistant", "last message must be from assistant" tools = r.get("tools") if tools is not None: assert isinstance(tools, list), "tools must be a list" except Exception as e: errors.append(f" Line {i}: {e}") print(f"{len(errors)} error(s):\n" + "\n".join(errors) if errors else f"Valid! {i} rows") sys.exit(1 if errors else 0) ``` The JSONL format supports these fields per row: - **`messages`** (required): List of chat messages - **`tools`** (optional): List of tool/function definitions for tool-call training - **`response_format`** (optional): Structured output schema (not used during training, but useful as metadata) Report the row count and validation result to the user. Do NOT read the whole dataset file. Do NOT name the dataset. If the format is wrong, help them fix it or convert their data. ## Step 4: Gather Base Parameters Do NOT ask the user to review or confirm their answers after collecting them — just proceed to the next step. - **Base model**: Recommend ONLY these models: - `OpenPipe/Qwen3-14B-Instruct` - `Qwen/Qwen3-30B-A3B-Instruct-2507` - `meta-llama/Llama-3.1-8B-Instruct` - **Project name**: A name for this training project (default: `sft-project`) - **Run name**: A static, descriptive name (e.g., `agent-001`, `pii-redactor-001`, `math-tutor-001`). Ask the user for a meaningful name. Do NOT generate random names. 
For **distillation** also ask: - **Teacher model**: The larger model to distill from (e.g., an OpenRouter model) - **Teacher API base URL and key**: If using a third-party provider - **Prompts**: What prompts to send to the teacher model ## Step 5: Gather Hyperparameters This step only applies if you can run code AND know the row count from validation. If you cannot run code, skip this step entirely — do NOT make up or guess hyperparameter values. The `train_sft_from_file` function has sensible built-in defaults. Run this Python snippet via Bash to compute defaults, passing the actual row count (`NUM_ROWS`) as its first command-line argument — the snippet reads it from `sys.argv[1]`. Do NOT show any formulas or calculation steps to the user — only show the final values. ```python import math, sys n = int(sys.argv[1]) epochs = max(1, min(10, round(10000 / n))) batch_size = 2 total_steps = math.ceil(n * epochs / batch_size) steps_per_epoch = math.ceil(n / batch_size) warmup_steps = max(10, min(1000, round(steps_per_epoch * 0.05))) warmup_ratio = round(warmup_steps / total_steps, 4) print(f"epochs={epochs} batch_size={batch_size} lr=2e-4 schedule=linear warmup_ratio={warmup_ratio}") ``` Present the output values to the user, then ask: - **Use defaults (Recommended)** — show all values in the description - **Customize** — adjust individual hyperparameters If they choose "Customize", ask which parameters to change. ### For distillation: Use the same defaults computation as JSONL (pass the number of trajectories as `NUM_ROWS`). `create_sft_dataset_iterator` handles the LR schedule automatically. ## Step 6: Generate the Training Script Write a complete, runnable Python script. Use the patterns below. 
Every script MUST: - Call `await backend.close()` at the end so the process doesn't hang - Print post-training info and usage examples (see shared block below) ### Post-training block (append to ALL scripts before `backend.close()`): ```python # --- Training complete --- step = await model.get_step() inference_name = model.get_inference_name() client = model.openai_client() print("\n" + "=" * 60) print("SFT TRAINING COMPLETE") print("=" * 60) print(f" Model: {inference_name}") print(f" Base model: <BASE_MODEL>") print(f" Training step: {step}") print(f" Inference URL: {client.base_url}") print(f" W&B run: https://wandb.ai/<ENTITY>/<PROJECT_NAME>/runs/<RUN_NAME>") print("=" * 60) print("\n--- Python usage (openai SDK) ---\n") print(f'''\ from openai import OpenAI client = OpenAI( base_url="{client.base_url}", api_key="not-needed", ) response = client.chat.completions.create( model="{inference_name}", messages=[ {{"role": "user", "content": "Your prompt here"}}, ], ) print(response.choices[0].message.content) ''') print("--- curl usage ---\n") print(f'''\ curl {client.base_url}chat/completions \\ -H "Content-Type: application/json" \\ -d '{{ "model": "{inference_name}", "messages": [ {{"role": "user", "content": "Your prompt here"}} ] }}' ''') await backend.close() ``` ### Backend setup Use the appropriate backend based on the user's choice: **LocalBackend:** ```python from art.local import LocalBackend backend = LocalBackend() model = art.TrainableModel( name="<RUN_NAME>", project="<PROJECT_NAME>", base_model="<BASE_MODEL>", _internal_config=art.dev.InternalModelConfig( engine_args={"gpu_memory_utilization": 0.7}, ), ) await model.register(backend) ``` **ServerlessBackend:** ```python from art.serverless.backend import ServerlessBackend backend = ServerlessBackend() # uses WANDB_API_KEY env var model = art.TrainableModel( name="<RUN_NAME>", project="<PROJECT_NAME>", base_model="<BASE_MODEL>", ) await model.register(backend) ``` Note: `_internal_config` with `gpu_memory_utilization` is only used with LocalBackend. Do NOT include it for ServerlessBackend.
### JSONL file training pattern: If hyperparameters were computed in Step 5, pass them explicitly. If Step 5 was skipped, omit them — `train_sft_from_file` has sensible defaults. ```python """SFT training script generated by /train-sft wizard.""" import asyncio import art from art.utils.sft import train_sft_from_file async def main(): await train_sft_from_file( model=model, file_path="<DATASET_PATH>", # Only include these if hyperparameters were computed: # epochs=<EPOCHS>, # batch_size=<BATCH_SIZE>, # peak_lr=<PEAK_LR>, # schedule_type="<SCHEDULE_TYPE>", # warmup_ratio=<WARMUP_RATIO>, verbose=True, ) # ... post-training block + backend.close() ... if __name__ == "__main__": asyncio.run(main()) ``` ### Distillation pattern: ```python """Distillation SFT script generated by /train-sft wizard.""" import asyncio, os from dotenv import load_dotenv from openai import AsyncOpenAI import art from art.utils.sft import create_sft_dataset_iterator load_dotenv() async def main(): teacher_client = AsyncOpenAI( api_key=os.environ["<TEACHER_API_KEY_ENV_VAR>"], base_url="<TEACHER_API_BASE_URL>", ) prompts = ["<PROMPT_1>", "<PROMPT_2>"] trajectories = [] for prompt in prompts: completion = await teacher_client.chat.completions.create( model="<TEACHER_MODEL>", messages=[{"role": "user", "content": prompt}], ) trajectories.append( art.Trajectory( messages_and_choices=[ {"role": "user", "content": prompt}, {"role": "assistant", "content": completion.choices[0].message.content}, ], tools=<TOOLS_OR_NONE>, ) ) for chunk in create_sft_dataset_iterator( trajectories, epochs=<EPOCHS>, batch_size=<BATCH_SIZE>, peak_lr=<PEAK_LR>, schedule_type="<SCHEDULE_TYPE>", warmup_ratio=<WARMUP_RATIO>, ): await model.train_sft(chunk.trajectories, chunk.config, verbose=True) # ... post-training block + backend.close() ... if __name__ == "__main__": asyncio.run(main()) ``` ## Step 7: Write and Offer to Run 1. Write the script to a file (suggest `sft_train.py`) 2. Ask the user if they want to run it now with `uv run python <SCRIPT_PATH>` 3. If yes, run it **directly using the Bash tool** (do NOT delegate to a Task subagent) so training logs stream live to the user. Use a **2-minute timeout**.
If it times out, check progress and decide whether to continue. 4. **LocalBackend only — GPU memory errors**: If training fails with OOM, lower `gpu_memory_utilization` in the existing `_internal_config` (e.g. from `0.7` to `0.5`). 5. **LocalBackend only — Stale GPU memory**: If available GPU memory looks too small, previous training runs may still be occupying memory. Before retrying, run `nvidia-smi` to check, and if needed kill leftover processes with `kill <PID>` to free memory. ## Important Notes - LocalBackend requires a GPU. - ServerlessBackend requires a `WANDB_API_KEY` environment variable. ================================================ FILE: .dockerignore ================================================ __pycache__/ .art/ # .env .venv/ grpo_trainer_lora_model/ logs/ shared_cache.db streaming-chat-completions/ unsloth_compiled_cache/ wandb/ docs/node_modules/ dist/ replays/ trajectories/ .DS_Store # .local/ # .claude/ .vscode/ ================================================ FILE: .github/workflows/create-draft-release.yml ================================================ name: Create Draft Release on: workflow_dispatch: inputs: version_type: description: 'Version bump type' required: true default: 'patch' type: choice options: - patch - minor - major permissions: contents: write pull-requests: write jobs: create-draft: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Set up Python uses: actions/setup-python@v5 with: python-version: '3.11' - name: Install uv run: | curl -LsSf https://astral.sh/uv/install.sh | sh echo "$HOME/.cargo/bin" >> $GITHUB_PATH - name: Install dependencies run: | uv venv uv pip install -e . 
- name: Bump version id: bump run: | python scripts/bump_version.py ${{ github.event.inputs.version_type }} NEW_VERSION=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml', 'rb'))['project']['version'])") echo "NEW_VERSION=$NEW_VERSION" >> $GITHUB_OUTPUT - name: Generate release notes template id: release_notes env: GH_TOKEN: ${{ github.token }} run: | # Get the previous tag PREVIOUS_TAG=$(git describe --tags --abbrev=0 2>/dev/null || echo "") # Get merged PRs since last release if [ -z "$PREVIOUS_TAG" ]; then PRS=$(gh pr list --state merged --limit 100 --json number,title,url | jq -r '.[] | "- \(.title) ([#\(.number)](\(.url)))"') else LAST_TAG_DATE=$(git log -1 --format=%aI $PREVIOUS_TAG) PRS=$(gh pr list --state merged --limit 100 --json number,title,url,mergedAt | jq -r --arg date "$LAST_TAG_DATE" '.[] | select(.mergedAt > $date) | "- \(.title) ([#\(.number)](\(.url)))"') fi # Create release notes template cat << EOF > release_notes_template.md ## Release Highlights ## What's Changed $PRS **Full Changelog**: https://github.com/OpenPipe/ART/compare/$PREVIOUS_TAG...v${{ steps.bump.outputs.NEW_VERSION }} EOF - name: Create draft release env: GH_TOKEN: ${{ github.token }} run: | gh release create v${{ steps.bump.outputs.NEW_VERSION }} \ --title "v${{ steps.bump.outputs.NEW_VERSION }}" \ --notes-file release_notes_template.md \ --draft - name: Create PR with version bump env: GH_TOKEN: ${{ github.token }} run: | git config --local user.email "action@github.com" git config --local user.name "GitHub Action" git checkout -b release/v${{ steps.bump.outputs.NEW_VERSION }} git add pyproject.toml uv.lock git commit -m "Bump version to ${{ steps.bump.outputs.NEW_VERSION }}" git push origin release/v${{ steps.bump.outputs.NEW_VERSION }} gh pr create \ --title "Release v${{ steps.bump.outputs.NEW_VERSION }}" \ --body "This PR bumps the version to ${{ steps.bump.outputs.NEW_VERSION }}. **Next steps:** 1. 
Review and edit the [draft release](https://github.com/OpenPipe/ART/releases) 2. Add release highlights and curate the changelog 3. Merge this PR to publish the release automatically" \ --base main \ --head release/v${{ steps.bump.outputs.NEW_VERSION }} - name: Output instructions run: | echo "::notice::Draft release created! Next steps:" echo "::notice::1. Go to https://github.com/OpenPipe/ART/releases and edit the draft" echo "::notice::2. Add release highlights and curate the auto-generated PR list" echo "::notice::3. Merge the PR to publish the release" ================================================ FILE: .github/workflows/package-install.yml ================================================ name: Package Install on: pull_request: push: branches: [main] workflow_dispatch: permissions: contents: read jobs: install-smoke-test: runs-on: ubuntu-latest steps: - uses: actions/checkout@v4 - name: Set up Python uses: actions/setup-python@v5 with: python-version: "3.11" - name: Install uv run: | curl -LsSf https://astral.sh/uv/install.sh | sh echo "$HOME/.cargo/bin" >> "$GITHUB_PATH" - name: Build wheel run: uv build --wheel --out-dir dist - name: Smoke test uv add + sync for backend extra run: | wheel_path="$(python - <<'PY' from pathlib import Path print(next(Path("dist").glob("openpipe_art-*.whl")).resolve()) PY )" project_dir="$(mktemp -d)" cd "$project_dir" uv init --name art-install-smoke --python 3.11 --bare uv add "openpipe-art[backend] @ file://${wheel_path}" uv sync ================================================ FILE: .github/workflows/prek.yml ================================================ name: Prek on: pull_request: push: branches: [main] permissions: contents: write env: CI_BASE_IMAGE: "pytorch/pytorch:2.9.0-cuda12.8-cudnn9-devel" CI_PYTHON_MM: "3.11" CI_UV_CACHE_RELEASE_TAG: "prek-uv-cache" CI_UV_CACHE_ASSET_PREFIX: "prek-uv-cache" UV_CACHE_DIR: "/root/.cache/uv" UV_LINK_MODE: "copy" TORCH_CUDA_ARCH_LIST: "8.0" jobs: cache-status: runs-on: 
art-large-runner outputs: cache-hit: ${{ steps.check.outputs.cache-hit }} fingerprint: ${{ steps.fingerprint.outputs.fingerprint }} steps: - name: Checkout code uses: actions/checkout@v4 - name: Compute expected uv cache fingerprint id: fingerprint run: | fp="$(python3 scripts/ci/compute_uv_fingerprint.py \ --pyproject pyproject.toml \ --uv-lock uv.lock \ --base-image "${CI_BASE_IMAGE}" \ --python-mm "${CI_PYTHON_MM}")" echo "fingerprint=${fp}" >> "${GITHUB_OUTPUT}" echo "Expected uv cache fingerprint: ${fp}" - name: Check if uv cache exists id: check env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | fingerprint="${{ steps.fingerprint.outputs.fingerprint }}" part_prefix="${CI_UV_CACHE_ASSET_PREFIX}-${fingerprint}.tar.zst.part-" release_api="https://api.github.com/repos/${GITHUB_REPOSITORY}/releases/tags/${CI_UV_CACHE_RELEASE_TAG}" release_json="$(curl -fsSL \ -H "Authorization: Bearer ${GITHUB_TOKEN}" \ -H "Accept: application/vnd.github+json" \ "${release_api}" || true)" if [ -z "${release_json}" ]; then echo "Cache release '${CI_UV_CACHE_RELEASE_TAG}' not found." 
echo "cache-hit=false" >> "${GITHUB_OUTPUT}" exit 0 fi hit="$(RELEASE_JSON="${release_json}" PART_PREFIX="${part_prefix}" python3 -c " import json, os, re payload = json.loads(os.environ['RELEASE_JSON']) prefix = os.environ['PART_PREFIX'] pattern = re.compile(r'^' + re.escape(prefix) + r'(\d{3})$') parts = sorted( int(m.group(1)) for a in payload.get('assets', []) for m in [pattern.match(a.get('name', ''))] if m and a.get('id') is not None ) print('true' if parts and parts == list(range(len(parts))) else 'false') ")" echo "cache-hit=${hit}" >> "${GITHUB_OUTPUT}" echo "Cache hit: ${hit}" build-cache: needs: cache-status if: needs.cache-status.outputs.cache-hit != 'true' runs-on: art-cache-builder container: image: pytorch/pytorch:2.9.0-cuda12.8-cudnn9-devel steps: - name: Install CI dependencies run: | apt-get update apt-get install -y --no-install-recommends ca-certificates curl git zstd rm -rf /var/lib/apt/lists/* curl -LsSf https://astral.sh/uv/install.sh | sh echo "/root/.local/bin" >> "${GITHUB_PATH}" - name: Install gh CLI env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | GH_DL_URL="$(curl -fsSL \ -H "Authorization: Bearer ${GH_TOKEN}" \ https://api.github.com/repos/cli/cli/releases/latest \ | python3 -c "import json,sys;r=json.load(sys.stdin);print([a['browser_download_url'] for a in r['assets'] if a['name'].endswith('_linux_amd64.tar.gz')][0])")" curl -fsSL "${GH_DL_URL}" | tar xz --strip-components=1 -C /usr/local gh version - name: Checkout code uses: actions/checkout@v4 - name: Mark workspace as a safe git directory run: | git config --global --add safe.directory "${GITHUB_WORKSPACE}" - name: Build and upload uv cache env: GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | bash scripts/ci/build_and_push_uv_cache.sh \ --base-image "${CI_BASE_IMAGE}" \ --python-mm "${CI_PYTHON_MM}" quality-checks: needs: [cache-status, build-cache] if: ${{ !failure() && !cancelled() }} runs-on: art-large-runner container: image: pytorch/pytorch:2.9.0-cuda12.8-cudnn9-devel steps: 
- name: Install CI dependencies run: | apt-get update apt-get install -y --no-install-recommends ca-certificates curl git zstd rm -rf /var/lib/apt/lists/* curl -LsSf https://astral.sh/uv/install.sh | sh echo "/root/.local/bin" >> "${GITHUB_PATH}" - name: Checkout code uses: actions/checkout@v4 - name: Mark workspace as a safe git directory run: | git config --global --add safe.directory "${GITHUB_WORKSPACE}" - name: Restore prebuilt uv cache env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} run: | release_api="https://api.github.com/repos/${GITHUB_REPOSITORY}/releases/tags/${CI_UV_CACHE_RELEASE_TAG}" fingerprint="${{ needs.cache-status.outputs.fingerprint }}" part_prefix="${CI_UV_CACHE_ASSET_PREFIX}-${fingerprint}.tar.zst.part-" release_json="$(curl -fsSL \ -H "Authorization: Bearer ${GITHUB_TOKEN}" \ -H "Accept: application/vnd.github+json" \ "${release_api}" || true)" if [ -z "${release_json}" ]; then echo "::error::Missing cache release '${CI_UV_CACHE_RELEASE_TAG}'." exit 1 fi part_selection_file="/tmp/uv-cache-part-selection.txt" if ! RELEASE_JSON="${release_json}" PART_PREFIX="${part_prefix}" python3 -c "import json, os, re, sys; payload=json.loads(os.environ['RELEASE_JSON']); part_prefix=os.environ['PART_PREFIX']; pattern=re.compile(r'^' + re.escape(part_prefix) + r'(\\d{3})$'); parts=[]; [parts.append((int(m.group(1)), int(a.get('id')), a.get('name'))) for a in payload.get('assets', []) for m in [pattern.match(a.get('name', ''))] if m and a.get('id') is not None]; parts.sort(key=lambda x: x[0]); indices=[p[0] for p in parts]; expected=list(range(len(parts))); print('\\n'.join(f'{asset_id} {name}' for _, asset_id, name in parts)) if parts and indices == expected else (_ for _ in ()).throw(SystemExit(2 if not parts else 3))" > "${part_selection_file}"; then echo "::error::No complete uv cache part set found for prefix '${part_prefix}'." 
exit 1 fi part_count="$(wc -l < "${part_selection_file}" | tr -d ' ')" echo "Using uv cache part set '${part_prefix}*' (${part_count} parts)." parts_dir="/tmp/uv-cache-parts" part_paths_file="/tmp/uv-cache-part-paths.txt" rm -rf "${parts_dir}" mkdir -p "${parts_dir}" awk -v d="${parts_dir}" '{print d "/" $2}' "${part_selection_file}" > "${part_paths_file}" PARTS_DIR="${parts_dir}" GITHUB_TOKEN="${GITHUB_TOKEN}" GITHUB_REPOSITORY="${GITHUB_REPOSITORY}" \ xargs -n 2 -P 8 sh -c ' asset_id="$1" asset_name="$2" part_path="${PARTS_DIR}/${asset_name}" curl -fsSL -L \ -H "Authorization: Bearer ${GITHUB_TOKEN}" \ -H "Accept: application/octet-stream" \ "https://api.github.com/repos/${GITHUB_REPOSITORY}/releases/assets/${asset_id}" \ -o "${part_path}" ' sh < "${part_selection_file}" while IFS= read -r part_path; do [ -s "${part_path}" ] || { echo "::error::Missing or empty cache part: ${part_path}" exit 1 } done < "${part_paths_file}" rm -rf "${UV_CACHE_DIR}" mkdir -p "${UV_CACHE_DIR}" while IFS= read -r part_path; do cat "${part_path}" done < "${part_paths_file}" | zstd -d -c | tar -xf - -C "${UV_CACHE_DIR}" du -sh "${UV_CACHE_DIR}" - name: Install dependencies (with all optional extras for complete type checking) run: | py_mm="$(python -c 'import sys; print(f"{sys.version_info.major}.{sys.version_info.minor}")')" cudnn_path="${GITHUB_WORKSPACE}/.venv/lib/python${py_mm}/site-packages/nvidia/cudnn" export CUDNN_PATH="${cudnn_path}" export CUDNN_HOME="${cudnn_path}" export CUDNN_INCLUDE_PATH="${cudnn_path}/include" export CUDNN_LIBRARY_PATH="${cudnn_path}/lib" export CPLUS_INCLUDE_PATH="${CUDNN_INCLUDE_PATH}${CPLUS_INCLUDE_PATH:+:${CPLUS_INCLUDE_PATH}}" export LIBRARY_PATH="${CUDNN_LIBRARY_PATH}${LIBRARY_PATH:+:${LIBRARY_PATH}}" export LD_LIBRARY_PATH="${CUDNN_LIBRARY_PATH}${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}" uv --version uv sync --all-extras --group dev --frozen - name: Run prek hooks (lint, format, typecheck, uv.lock, tests) run: | uv run prek run --all-files - name: 
Run unit tests (via prek) run: | uv run prek run pytest ================================================ FILE: .github/workflows/release.yml ================================================ name: Release on: pull_request: types: [closed] branches: [main] permissions: contents: write id-token: write jobs: release: runs-on: ubuntu-latest if: github.event.pull_request.merged == true && startsWith(github.event.pull_request.head.ref, 'release/') steps: - uses: actions/checkout@v4 with: fetch-depth: 0 - name: Set up Python uses: actions/setup-python@v5 with: python-version: '3.11' - name: Install uv run: | curl -LsSf https://astral.sh/uv/install.sh | sh echo "$HOME/.cargo/bin" >> $GITHUB_PATH - name: Install dependencies run: | uv venv uv pip install -e . uv pip install hatch - name: Build package run: uv run hatch build - name: Get version from pyproject.toml id: get_version run: | VERSION=$(python -c "import tomllib; print(tomllib.load(open('pyproject.toml', 'rb'))['project']['version'])") echo "VERSION=$VERSION" >> $GITHUB_OUTPUT - name: Create git tag run: | git config --local user.email "action@github.com" git config --local user.name "GitHub Action" git tag v${{ steps.get_version.outputs.VERSION }} git push origin v${{ steps.get_version.outputs.VERSION }} - name: Publish draft release env: GH_TOKEN: ${{ github.token }} run: | # Check if draft release exists and publish it if gh release view v${{ steps.get_version.outputs.VERSION }} --json isDraft | jq -r '.isDraft' | grep -q true; then gh release edit v${{ steps.get_version.outputs.VERSION }} --draft=false else echo "::error::No draft release found for v${{ steps.get_version.outputs.VERSION }}" exit 1 fi - name: Upload assets to release env: GH_TOKEN: ${{ github.token }} run: | gh release upload v${{ steps.get_version.outputs.VERSION }} dist/* - name: Publish to PyPI uses: pypa/gh-action-pypi-publish@release/v1 with: password: ${{ secrets.PYPI_ART_TOKEN }} ================================================ FILE: 
.gitignore ================================================ __pycache__/ .art/ .env .venv/ grpo_trainer_lora_model/ logs/ shared_cache.db data/cache.db streaming-chat-completions/ unsloth_compiled_cache/ wandb/ docs/node_modules/ dist/ replays/ trajectories/ .DS_Store .local/ .claude/settings.local.json .vscode/ .ruff_cache/ !/src/art/wandb/ !/src/art/wandb/** /src/art/wandb/__pycache__/ ================================================ FILE: .pre-commit-config.yaml ================================================ repos: - repo: https://github.com/astral-sh/ruff-pre-commit rev: v0.12.1 hooks: - id: ruff - id: ruff-format - repo: local hooks: - id: ty name: ty type checking entry: uv run ty check src tests language: system pass_filenames: false - id: uv-lock-check name: uv.lock sync check entry: uv lock --check language: system pass_filenames: false - id: pytest name: Unit tests (manual) entry: uv run pytest --nbval --current-env --tb=short tests/unit language: system pass_filenames: false stages: [manual] ================================================ FILE: .python-version ================================================ 3.11 ================================================ FILE: .skyignore ================================================ __pycache__/ .art/ # .env .venv/ grpo_trainer_lora_model/ logs/ shared_cache.db streaming-chat-completions/ unsloth_compiled_cache/ wandb/ docs/node_modules/ dist/ dev/art-e/data/ replays/ trajectories/ .DS_Store # .local/ ================================================ FILE: AGENT.md ================================================ ## uv package manager by default This project uses the `uv` package manager. - To add a dependency, run `uv add <package>`. - To run a script, run `uv run