Repository: axolotl-ai-cloud/axolotl
Branch: main
Commit: b0294b3427da
Files: 1070
Total size: 5.4 MB

Directory structure:
gitextract_1sp7sr39/

├── .axolotl-complete.bash
├── .bandit
├── .coderabbit.yaml
├── .coveragerc
├── .editorconfig
├── .gitattributes
├── .github/
│   ├── CODE_OF_CONDUCT.md
│   ├── CONTRIBUTING.md
│   ├── FUNDING.yml
│   ├── ISSUE_TEMPLATE/
│   │   ├── bug-report.yaml
│   │   ├── config.yml
│   │   ├── docs.yml
│   │   └── feature-request.yaml
│   ├── PULL_REQUEST_TEMPLATE.md
│   ├── SECURITY.md
│   ├── SUPPORT.md
│   ├── release-drafter.yml
│   └── workflows/
│       ├── base.yml
│       ├── docs.yml
│       ├── lint.yml
│       ├── main.yml
│       ├── multi-gpu-e2e.yml
│       ├── nightlies.yml
│       ├── precommit-autoupdate.yml
│       ├── preview-docs.yml
│       ├── pypi.yml
│       ├── tests-nightly.yml
│       └── tests.yml
├── .gitignore
├── .mypy.ini
├── .pre-commit-config.yaml
├── .runpod/
│   ├── .gitignore
│   ├── Dockerfile
│   ├── README.md
│   ├── hub.json
│   ├── requirements.txt
│   ├── src/
│   │   ├── config/
│   │   │   └── config.yaml
│   │   ├── handler.py
│   │   ├── test_input.json
│   │   ├── train.py
│   │   └── utils.py
│   ├── test-input.json
│   └── tests.json
├── CITATION.cff
├── CNAME
├── FAQS.md
├── LICENSE
├── MANIFEST.in
├── README.md
├── VERSION
├── _quarto.yml
├── benchmarks/
│   ├── bench_entropy.py
│   ├── bench_scattermoe_lora.py
│   └── bench_selective_logsoftmax.py
├── cicd/
│   ├── Dockerfile-uv.jinja
│   ├── Dockerfile.jinja
│   ├── __init__.py
│   ├── cicd.sh
│   ├── cleanup.py
│   ├── cleanup.sh
│   ├── e2e_tests.py
│   ├── multigpu.py
│   ├── multigpu.sh
│   └── single_gpu.py
├── codecov.yml
├── deepspeed_configs/
│   ├── zero1.json
│   ├── zero1_torch_compile.json
│   ├── zero2.json
│   ├── zero2_torch_compile.json
│   ├── zero3.json
│   ├── zero3_bf16.json
│   ├── zero3_bf16_cpuoffload_all.json
│   └── zero3_bf16_cpuoffload_params.json
├── devtools/
│   ├── README.md
│   └── dev_chat_template.yml
├── docker/
│   ├── Dockerfile
│   ├── Dockerfile-base
│   ├── Dockerfile-base-next
│   ├── Dockerfile-base-nightly
│   ├── Dockerfile-cloud
│   ├── Dockerfile-cloud-no-tmux
│   ├── Dockerfile-cloud-uv
│   ├── Dockerfile-tests
│   ├── Dockerfile-uv
│   └── Dockerfile-uv-base
├── docker-compose.yaml
├── docs/
│   ├── .gitignore
│   ├── amd_hpc.qmd
│   ├── attention.qmd
│   ├── batch_vs_grad.qmd
│   ├── checkpoint_saving.qmd
│   ├── cli.qmd
│   ├── custom_integrations.qmd
│   ├── dataset-formats/
│   │   ├── conversation.qmd
│   │   ├── index.qmd
│   │   ├── inst_tune.qmd
│   │   ├── pretraining.qmd
│   │   ├── stepwise_supervised.qmd
│   │   ├── template_free.qmd
│   │   └── tokenized.qmd
│   ├── dataset_loading.qmd
│   ├── dataset_preprocessing.qmd
│   ├── debugging.qmd
│   ├── docker.qmd
│   ├── expert_quantization.qmd
│   ├── faq.qmd
│   ├── fsdp_qlora.qmd
│   ├── getting-started.qmd
│   ├── gradient_checkpointing.qmd
│   ├── inference.qmd
│   ├── input_output.qmd
│   ├── installation.qmd
│   ├── lora_optims.qmd
│   ├── lr_groups.qmd
│   ├── mac.qmd
│   ├── mixed_precision.qmd
│   ├── multi-gpu.qmd
│   ├── multi-node.qmd
│   ├── multimodal.qmd
│   ├── multipack.qmd
│   ├── nccl.qmd
│   ├── nd_parallelism.qmd
│   ├── optimizations.qmd
│   ├── optimizers.qmd
│   ├── qat.qmd
│   ├── quantize.qmd
│   ├── ray-integration.qmd
│   ├── reward_modelling.qmd
│   ├── rlhf.qmd
│   ├── scripts/
│   │   ├── examples-allowlist.yml
│   │   ├── generate_config_docs.py
│   │   └── generate_examples_docs.py
│   ├── sequence_parallelism.qmd
│   ├── streaming.qmd
│   ├── telemetry.qmd
│   ├── torchao.qmd
│   └── unsloth.qmd
├── examples/
│   ├── LiquidAI/
│   │   ├── README.md
│   │   ├── lfm2-350m-fft.yaml
│   │   ├── lfm2-8b-a1b-lora.yaml
│   │   └── lfm2-vl-lora.yaml
│   ├── alst/
│   │   ├── README.md
│   │   ├── llama3-8b-deepspeed-alst.yaml
│   │   └── llama3-8b-fsdp2-alst.yaml
│   ├── apertus/
│   │   ├── README.md
│   │   └── apertus-8b-qlora.yaml
│   ├── arcee/
│   │   ├── README.md
│   │   └── afm-4.5b-qlora.yaml
│   ├── archived/
│   │   ├── README.md
│   │   ├── cerebras/
│   │   │   ├── btlm-ft.yml
│   │   │   └── qlora.yml
│   │   ├── code-llama/
│   │   │   ├── 13b/
│   │   │   │   ├── lora.yml
│   │   │   │   └── qlora.yml
│   │   │   ├── 34b/
│   │   │   │   ├── lora.yml
│   │   │   │   └── qlora.yml
│   │   │   ├── 7b/
│   │   │   │   ├── lora.yml
│   │   │   │   └── qlora.yml
│   │   │   └── README.md
│   │   ├── dbrx/
│   │   │   ├── 16bit-lora.yaml
│   │   │   ├── 8bit-lora.yaml
│   │   │   ├── README.md
│   │   │   └── fft-ds-zero3.yaml
│   │   ├── deepcoder/
│   │   │   └── deepcoder-14B-preview-lora.yml
│   │   ├── falcon/
│   │   │   ├── config-7b-lora.yml
│   │   │   ├── config-7b-qlora.yml
│   │   │   └── config-7b.yml
│   │   ├── gemma/
│   │   │   └── qlora.yml
│   │   ├── gptj/
│   │   │   └── qlora.yml
│   │   ├── jeopardy-bot/
│   │   │   └── config.yml
│   │   ├── mpt-7b/
│   │   │   ├── README.md
│   │   │   └── config.yml
│   │   ├── openllama-3b/
│   │   │   ├── README.md
│   │   │   ├── config.yml
│   │   │   ├── lora.yml
│   │   │   └── qlora.yml
│   │   ├── pythia/
│   │   │   └── lora.yml
│   │   ├── pythia-12b/
│   │   │   ├── README.md
│   │   │   └── config.yml
│   │   ├── qwen/
│   │   │   ├── README.md
│   │   │   ├── lora.yml
│   │   │   ├── qlora.yml
│   │   │   ├── qwen2-moe-lora.yaml
│   │   │   └── qwen2-moe-qlora.yaml
│   │   ├── redpajama/
│   │   │   ├── README.md
│   │   │   └── config-3b.yml
│   │   ├── replit-3b/
│   │   │   └── config-lora.yml
│   │   ├── stablelm-2/
│   │   │   ├── 1.6b/
│   │   │   │   ├── fft.yml
│   │   │   │   └── lora.yml
│   │   │   └── README.md
│   │   ├── starcoder2/
│   │   │   └── qlora.yml
│   │   ├── tiny-llama/
│   │   │   ├── README.md
│   │   │   ├── lora-mps.yml
│   │   │   ├── lora.yml
│   │   │   ├── pretrain.yml
│   │   │   └── qlora.yml
│   │   ├── xgen-7b/
│   │   │   └── xgen-7b-8k-qlora.yml
│   │   └── yi-34B-chat/
│   │       ├── README.md
│   │       └── qlora.yml
│   ├── cloud/
│   │   ├── baseten.yaml
│   │   └── modal.yaml
│   ├── cohere/
│   │   └── command-r-7b-qlora.yml
│   ├── colab-notebooks/
│   │   └── colab-axolotl-example.ipynb
│   ├── deepcogito/
│   │   ├── cogito-v1-preview-llama-3B-lora.yml
│   │   └── cogito-v1-preview-qwen-14B-lora.yml
│   ├── deepseek-v2/
│   │   ├── fft-fsdp-16b.yaml
│   │   └── qlora-fsdp-2_5.yaml
│   ├── devstral/
│   │   ├── README.md
│   │   └── devstral-small-qlora.yml
│   ├── distributed-parallel/
│   │   ├── README.md
│   │   ├── llama-3_1-8b-hsdp-tp.yaml
│   │   └── qwen3-8b-fsdp-tp-cp.yaml
│   ├── eaft/
│   │   └── eaft-example.yml
│   ├── falcon-h1/
│   │   ├── falcon-h1-1b-deep-qlora.yaml
│   │   ├── falcon-h1-1b-qlora.yaml
│   │   ├── falcon-h1-34b-qlora.yaml
│   │   ├── falcon-h1-3b-qlora.yaml
│   │   ├── falcon-h1-500m-qlora.yaml
│   │   └── falcon-h1-7b-qlora.yaml
│   ├── gemma2/
│   │   ├── qlora.yml
│   │   └── reward-model.yaml
│   ├── gemma3/
│   │   ├── gemma-3-1b-qlora.yml
│   │   ├── gemma-3-270m-qlora.yml
│   │   ├── gemma-3-4b-qlora.yml
│   │   └── gemma-3-4b-vision-qlora.yml
│   ├── gemma3n/
│   │   ├── README.md
│   │   ├── gemma-3n-e2b-qlora.yml
│   │   ├── gemma-3n-e2b-vision-audio-qlora.yml
│   │   └── gemma-3n-e2b-vision-qlora.yml
│   ├── glm4/
│   │   └── qlora-32b.yaml
│   ├── glm45/
│   │   ├── README.md
│   │   └── glm-45-air-qlora.yaml
│   ├── glm46v/
│   │   ├── README.md
│   │   ├── glm-4-6v-flash-ddp.yaml
│   │   └── glm-4-6v-flash-qlora.yaml
│   ├── glm47-flash/
│   │   ├── README.md
│   │   ├── lora.yaml
│   │   ├── lora_fsdp.yaml
│   │   ├── qlora.yaml
│   │   └── qlora_fsdp.yaml
│   ├── gpt-oss/
│   │   ├── README.md
│   │   ├── gpt-oss-120b-fft-fsdp2-offload.yaml
│   │   ├── gpt-oss-20b-fft-deepspeed-zero3.yaml
│   │   ├── gpt-oss-20b-fft-fsdp2-offload.yaml
│   │   ├── gpt-oss-20b-fft-fsdp2.yaml
│   │   ├── gpt-oss-20b-sft-lora-singlegpu.yaml
│   │   └── gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml
│   ├── granite4/
│   │   ├── README.md
│   │   └── granite-4.0-tiny-fft.yaml
│   ├── hunyuan/
│   │   ├── README.md
│   │   └── hunyuan-v1-dense-qlora.yaml
│   ├── internvl3_5/
│   │   ├── README.md
│   │   └── internvl3_5-8b-qlora.yml
│   ├── jamba/
│   │   ├── README.md
│   │   ├── qlora.yaml
│   │   ├── qlora_deepspeed.yaml
│   │   └── qlora_fsdp_large.yaml
│   ├── kimi-linear/
│   │   ├── README.md
│   │   └── kimi-48b-lora.yaml
│   ├── llama-2/
│   │   ├── README.md
│   │   ├── fft_optimized.yml
│   │   ├── gptq-lora.yml
│   │   ├── lisa.yml
│   │   ├── loftq.yml
│   │   ├── lora.yml
│   │   ├── qlora-fsdp.yml
│   │   ├── qlora.yml
│   │   └── relora.yml
│   ├── llama-3/
│   │   ├── 3b-fp8-fsdp2.yaml
│   │   ├── 3b-qat-fsdp2.yaml
│   │   ├── 3b-qat-mxfp4.yaml
│   │   ├── 3b-qat-nvfp4.yaml
│   │   ├── README.md
│   │   ├── diffusion/
│   │   │   ├── pretrain-1b.yaml
│   │   │   └── sft-1b.yaml
│   │   ├── fft-8b-liger-fsdp.yaml
│   │   ├── fft-8b.yaml
│   │   ├── instruct-dpo-lora-8b.yml
│   │   ├── instruct-lora-8b.yml
│   │   ├── lora-1b-deduplicate-dpo.yml
│   │   ├── lora-1b-deduplicate-sft.yml
│   │   ├── lora-1b-kernels.yml
│   │   ├── lora-1b-ray.yml
│   │   ├── lora-1b-sample-packing-sequentially.yml
│   │   ├── lora-1b.yml
│   │   ├── lora-8b.yml
│   │   ├── opentelemetry-qlora.yml
│   │   ├── qlora-1b-gdpo.yaml
│   │   ├── qlora-1b-kto.yaml
│   │   ├── qlora-1b.yml
│   │   ├── qlora-fsdp-405b.yaml
│   │   ├── qlora-fsdp-70b.yaml
│   │   ├── qlora.yml
│   │   └── sparse-finetuning.yaml
│   ├── llama-3-vision/
│   │   └── lora-11b.yaml
│   ├── llama-4/
│   │   ├── README.md
│   │   ├── do-no-use-fa2/
│   │   │   ├── maverick-qlora-fsdp1.yaml
│   │   │   ├── scout-qlora-fsdp1.yaml
│   │   │   ├── scout-qlora-single-h100.yaml
│   │   │   └── scout-vision-qlora-fsdp.yaml
│   │   ├── scout-qlora-flexattn-fsdp2.yaml
│   │   ├── scout-qlora-single-h100-flex.yaml
│   │   └── scout-vision-qlora-fsdp2-flex.yaml
│   ├── llava/
│   │   └── lora-7b.yaml
│   ├── magistral/
│   │   ├── README.md
│   │   ├── magistral-small-fsdp-qlora.yaml
│   │   ├── magistral-small-qlora.yaml
│   │   ├── think/
│   │   │   ├── README.md
│   │   │   └── magistral-small-think-qlora.yaml
│   │   └── vision/
│   │       ├── README.md
│   │       └── magistral-small-vision-24B-qlora.yml
│   ├── mamba/
│   │   └── config.yml
│   ├── mimo/
│   │   ├── README.md
│   │   └── mimo-7b-qlora.yaml
│   ├── ministral/
│   │   ├── README.md
│   │   └── ministral-small-qlora.yaml
│   ├── ministral3/
│   │   ├── README.md
│   │   ├── ministral3-3b-qlora.yaml
│   │   ├── think/
│   │   │   ├── README.md
│   │   │   └── ministral3-3b-think-qlora.yaml
│   │   └── vision/
│   │       ├── README.md
│   │       └── ministral3-3b-vision-qlora.yml
│   ├── mistral/
│   │   ├── README.md
│   │   ├── bigstral/
│   │   │   └── bigstral-ds-zero3.yaml
│   │   ├── config.yml
│   │   ├── dpo/
│   │   │   └── mistral-dpo-qlora.yml
│   │   ├── lora.yml
│   │   ├── mistral-qlora-fsdp.yml
│   │   ├── mixtral/
│   │   │   ├── mixtral-8x22b-qlora-fsdp.yml
│   │   │   ├── mixtral-qlora-fsdp.yml
│   │   │   ├── mixtral.yml
│   │   │   └── mixtral_22.yml
│   │   ├── mps/
│   │   │   └── lora-mps.yml
│   │   ├── orpo/
│   │   │   └── mistral-qlora-orpo.yml
│   │   └── qlora.yml
│   ├── mistral-small/
│   │   ├── README.md
│   │   └── mistral-small-3.1-24B-lora.yml
│   ├── mistral4/
│   │   ├── README.md
│   │   ├── fft-text.yml
│   │   ├── fft-vision.yml
│   │   ├── qlora-text.yml
│   │   └── qlora-vision.yml
│   ├── nemotron/
│   │   └── nemotron-mini-4b-qlora.yaml
│   ├── olmo3/
│   │   ├── README.md
│   │   └── olmo3-7b-qlora.yaml
│   ├── orpheus/
│   │   ├── README.md
│   │   └── finetune.yml
│   ├── phi/
│   │   ├── README.md
│   │   ├── lora-3.5.yaml
│   │   ├── phi-ft.yml
│   │   ├── phi-qlora.yml
│   │   ├── phi2-ft.yml
│   │   ├── phi3-ft-fsdp.yml
│   │   └── phi3-ft.yml
│   ├── pixtral/
│   │   └── lora-12b.yml
│   ├── plano/
│   │   ├── README.md
│   │   └── plano-4b-qlora.yaml
│   ├── qat_nvfp4/
│   │   ├── Gemma3-12B_baseline.yml
│   │   ├── Gemma3-12B_qat.yml
│   │   ├── Math-Gemma3-12B_baseline.yml
│   │   ├── Math-Gemma3-12B_qat.yml
│   │   ├── Math-Gemma3-27B_baseline.yml
│   │   ├── Math-Gemma3-27B_qat.yml
│   │   ├── Math-Qwen2.5-72B_baseline.yml
│   │   ├── Math-Qwen2.5-72B_qat.yml
│   │   ├── Qwen2.5-72B_baseline.yml
│   │   └── Qwen2.5-72B_qat.yml
│   ├── qwen2/
│   │   ├── adamw-pretrain-fsdp2.yaml
│   │   ├── dpo.yaml
│   │   ├── muon-pretrain-fsdp2.yaml
│   │   ├── prm.yaml
│   │   ├── qlora-fsdp.yaml
│   │   └── reward-model.yaml
│   ├── qwen2-vl/
│   │   └── lora-7b.yaml
│   ├── qwen2_5-vl/
│   │   └── lora-7b.yaml
│   ├── qwen3/
│   │   ├── 32b-qlora.yaml
│   │   ├── 8b-qat-fsdp2.yml
│   │   ├── README.md
│   │   ├── qlora-fsdp.yaml
│   │   └── reward-model.yaml
│   ├── qwen3-next/
│   │   ├── README.md
│   │   └── qwen3-next-80b-a3b-qlora.yaml
│   ├── qwen3.5/
│   │   ├── 122b-a10b-moe-qlora-fsdp.yaml
│   │   ├── 122b-a10b-moe-qlora.yaml
│   │   ├── 27b-fft.yaml
│   │   ├── 27b-qlora-fsdp.yaml
│   │   ├── 27b-qlora.yaml
│   │   ├── 35b-a3b-moe-qlora-fsdp.yaml
│   │   ├── 35b-a3b-moe-qlora.yaml
│   │   ├── 9b-fft-vision.yaml
│   │   ├── 9b-lora-vision.yaml
│   │   └── README.md
│   ├── seed-oss/
│   │   ├── README.md
│   │   └── seed-oss-36b-qlora.yaml
│   ├── slurm/
│   │   ├── README.md
│   │   └── axolotl.slurm
│   ├── smolvlm2/
│   │   ├── README.md
│   │   └── smolvlm2-2B-lora.yaml
│   ├── streaming/
│   │   ├── README.md
│   │   ├── pretrain.yaml
│   │   └── sft.yaml
│   ├── swanlab/
│   │   ├── README.md
│   │   ├── custom_trainer_profiling.py
│   │   ├── dpo-swanlab-completions.yml
│   │   ├── dpo-swanlab-full-featured.yml
│   │   └── lora-swanlab-profiling.yml
│   ├── trinity/
│   │   ├── README.md
│   │   └── trinity-nano-preview-qlora.yaml
│   └── voxtral/
│       ├── README.md
│       ├── voxtral-mini-audio-qlora.yml
│       └── voxtral-mini-qlora.yml
├── index.qmd
├── pyproject.toml
├── requirements-dev.txt
├── requirements-tests.txt
├── requirements.txt
├── scripts/
│   ├── chat_datasets.py
│   ├── cloud-entrypoint-term.sh
│   ├── cloud-entrypoint.sh
│   ├── cutcrossentropy_install.py
│   ├── motd
│   └── unsloth_install.py
├── setup.py
├── src/
│   ├── axolotl/
│   │   ├── __init__.py
│   │   ├── cli/
│   │   │   ├── __init__.py
│   │   │   ├── args.py
│   │   │   ├── art.py
│   │   │   ├── checks.py
│   │   │   ├── cloud/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── baseten/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── template/
│   │   │   │   │       ├── run.sh
│   │   │   │   │       └── train_sft.py
│   │   │   │   └── modal_.py
│   │   │   ├── config.py
│   │   │   ├── delinearize_llama4.py
│   │   │   ├── evaluate.py
│   │   │   ├── inference.py
│   │   │   ├── main.py
│   │   │   ├── merge_lora.py
│   │   │   ├── merge_sharded_fsdp_weights.py
│   │   │   ├── preprocess.py
│   │   │   ├── quantize.py
│   │   │   ├── train.py
│   │   │   ├── utils/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   ├── diffusion.py
│   │   │   │   ├── fetch.py
│   │   │   │   ├── load.py
│   │   │   │   ├── sweeps.py
│   │   │   │   └── train.py
│   │   │   └── vllm_serve.py
│   │   ├── common/
│   │   │   ├── __init__.py
│   │   │   ├── architectures.py
│   │   │   ├── const.py
│   │   │   └── datasets.py
│   │   ├── convert.py
│   │   ├── core/
│   │   │   ├── __init__.py
│   │   │   ├── attention/
│   │   │   │   └── __init__.py
│   │   │   ├── builders/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── causal.py
│   │   │   │   └── rl.py
│   │   │   ├── chat/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── format/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── chatml.py
│   │   │   │   │   ├── llama3x.py
│   │   │   │   │   └── shared.py
│   │   │   │   └── messages.py
│   │   │   ├── datasets/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── chat.py
│   │   │   │   └── transforms/
│   │   │   │       ├── __init__.py
│   │   │   │       └── chat_builder.py
│   │   │   ├── trainers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   ├── dpo/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── args.py
│   │   │   │   │   └── trainer.py
│   │   │   │   ├── grpo/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── args.py
│   │   │   │   │   ├── async_trainer.py
│   │   │   │   │   ├── fast_async_trainer.py
│   │   │   │   │   ├── replay_buffer.py
│   │   │   │   │   ├── sampler.py
│   │   │   │   │   └── trainer.py
│   │   │   │   ├── mamba.py
│   │   │   │   ├── mixins/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── activation_checkpointing.py
│   │   │   │   │   ├── checkpoints.py
│   │   │   │   │   ├── distributed_parallel.py
│   │   │   │   │   ├── optimizer.py
│   │   │   │   │   ├── packing.py
│   │   │   │   │   ├── rng_state_loader.py
│   │   │   │   │   └── scheduler.py
│   │   │   │   ├── trl.py
│   │   │   │   └── utils.py
│   │   │   ├── training_args.py
│   │   │   └── training_args_base.py
│   │   ├── datasets.py
│   │   ├── evaluate.py
│   │   ├── integrations/
│   │   │   ├── LICENSE.md
│   │   │   ├── __init__.py
│   │   │   ├── base.py
│   │   │   ├── config.py
│   │   │   ├── cut_cross_entropy/
│   │   │   │   ├── ACKNOWLEDGEMENTS.md
│   │   │   │   ├── LICENSE
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   └── args.py
│   │   │   ├── densemixer/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   └── plugin.py
│   │   │   ├── diffusion/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   ├── callbacks.py
│   │   │   │   ├── generation.py
│   │   │   │   ├── plugin.py
│   │   │   │   ├── trainer.py
│   │   │   │   └── utils.py
│   │   │   ├── grokfast/
│   │   │   │   ├── LICENSE
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   └── optimizer.py
│   │   │   ├── kd/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   ├── callbacks.py
│   │   │   │   ├── chat_template.py
│   │   │   │   ├── collator.py
│   │   │   │   ├── collator_online_teacher.py
│   │   │   │   ├── kernels/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── liger.py
│   │   │   │   │   └── models.py
│   │   │   │   ├── topk_logprob/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── forward_kl.py
│   │   │   │   ├── trainer.py
│   │   │   │   └── utils.py
│   │   │   ├── kernels/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   ├── autotune_callback.py
│   │   │   │   ├── autotune_collector.py
│   │   │   │   ├── constants.py
│   │   │   │   ├── libs/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── scattermoe_lora/
│   │   │   │   │       ├── __init__.py
│   │   │   │   │       ├── kernels/
│   │   │   │   │       │   ├── __init__.py
│   │   │   │   │       │   ├── lora_ops.py
│   │   │   │   │       │   ├── ops.py
│   │   │   │   │       │   └── single.py
│   │   │   │   │       ├── layers.py
│   │   │   │   │       ├── lora_ops.py
│   │   │   │   │       ├── parallel_experts.py
│   │   │   │   │       ├── parallel_linear_lora.py
│   │   │   │   │       ├── selective_dequant.py
│   │   │   │   │       └── selective_dequant_kernel.py
│   │   │   │   ├── plugin.py
│   │   │   │   └── sonicmoe/
│   │   │   │       ├── __init__.py
│   │   │   │       ├── patch.py
│   │   │   │       ├── routing.py
│   │   │   │       └── weight_converter.py
│   │   │   ├── liger/
│   │   │   │   ├── LICENSE
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   ├── models/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── base.py
│   │   │   │   │   ├── deepseekv2.py
│   │   │   │   │   ├── jamba.py
│   │   │   │   │   ├── llama4.py
│   │   │   │   │   ├── qwen3.py
│   │   │   │   │   └── qwen3_moe.py
│   │   │   │   ├── plugin.py
│   │   │   │   └── utils.py
│   │   │   ├── llm_compressor/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   ├── plugin.py
│   │   │   │   └── utils.py
│   │   │   ├── lm_eval/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   └── cli.py
│   │   │   ├── spectrum/
│   │   │   │   ├── LICENSE
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── args.py
│   │   │   │   └── model_snr_results/
│   │   │   │       ├── snr_results_Qwen-Qwen2.5-1.5B-Instruct.json
│   │   │   │       ├── snr_results_Qwen-Qwen2.5-1.5B.json
│   │   │   │       ├── snr_results_Qwen-Qwen2.5-3B-Instruct.json
│   │   │   │       ├── snr_results_Qwen-Qwen2.5-3B.json
│   │   │   │       ├── snr_results_Qwen-Qwen2.5-7B-Instruct.json
│   │   │   │       ├── snr_results_Qwen-Qwen2.5-7B.json
│   │   │   │       ├── snr_results_google-gemma-2-2b.json
│   │   │   │       ├── snr_results_meta-llama-Llama-3.2-1B-Instruct.json
│   │   │   │       ├── snr_results_meta-llama-Llama-3.2-1B.json
│   │   │   │       ├── snr_results_meta-llama-Llama-3.2-3B-Instruct.json
│   │   │   │       └── snr_results_meta-llama-Llama-3.2-3B.json
│   │   │   └── swanlab/
│   │   │       ├── README.md
│   │   │       ├── __init__.py
│   │   │       ├── args.py
│   │   │       ├── callbacks.py
│   │   │       ├── completion_logger.py
│   │   │       ├── plugins.py
│   │   │       └── profiling.py
│   │   ├── kernels/
│   │   │   ├── __init__.py
│   │   │   ├── geglu.py
│   │   │   ├── lora.py
│   │   │   ├── quantize.py
│   │   │   ├── swiglu.py
│   │   │   └── utils.py
│   │   ├── loaders/
│   │   │   ├── __init__.py
│   │   │   ├── adapter.py
│   │   │   ├── adapters/
│   │   │   │   └── __init__.py
│   │   │   ├── constants.py
│   │   │   ├── model.py
│   │   │   ├── patch_manager.py
│   │   │   ├── processor.py
│   │   │   ├── tokenizer.py
│   │   │   └── utils.py
│   │   ├── logging_config.py
│   │   ├── models/
│   │   │   ├── __init__.py
│   │   │   └── mamba/
│   │   │       ├── __init__.py
│   │   │       ├── configuration_mamba.py
│   │   │       └── modeling_mamba.py
│   │   ├── monkeypatch/
│   │   │   ├── __init__.py
│   │   │   ├── accelerate/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── fsdp2.py
│   │   │   │   └── parallelism_config.py
│   │   │   ├── attention/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── flash_attn_4.py
│   │   │   │   ├── flex_attn.py
│   │   │   │   ├── sage_attn.py
│   │   │   │   └── xformers.py
│   │   │   ├── btlm_attn_hijack_flash.py
│   │   │   ├── data/
│   │   │   │   ├── __init__.py
│   │   │   │   └── batch_dataset_fetcher.py
│   │   │   ├── deepspeed_utils.py
│   │   │   ├── fsdp2_qlora.py
│   │   │   ├── gradient_checkpointing/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── offload_cpu.py
│   │   │   │   └── offload_disk.py
│   │   │   ├── llama_attn_hijack_flash.py
│   │   │   ├── llama_attn_hijack_xformers.py
│   │   │   ├── lora_kernels.py
│   │   │   ├── loss/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── chunked.py
│   │   │   │   └── eaft.py
│   │   │   ├── mistral_attn_hijack_flash.py
│   │   │   ├── mixtral/
│   │   │   │   └── __init__.py
│   │   │   ├── models/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── apertus/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── activation.py
│   │   │   │   ├── kimi_linear/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   ├── configuration_kimi.py
│   │   │   │   │   ├── modeling_kimi.py
│   │   │   │   │   ├── patch_kimi_linear.py
│   │   │   │   │   └── tokenization_kimi.py
│   │   │   │   ├── llama4/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── modeling.py
│   │   │   │   ├── mistral3/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── mistral_common_tokenizer.py
│   │   │   │   ├── pixtral/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── modeling_flash_attention_utils.py
│   │   │   │   ├── qwen3_5/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── modeling.py
│   │   │   │   ├── qwen3_next/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── modeling.py
│   │   │   │   └── voxtral/
│   │   │   │       ├── __init__.py
│   │   │   │       └── modeling.py
│   │   │   ├── moe_quant.py
│   │   │   ├── multipack.py
│   │   │   ├── peft/
│   │   │   │   ├── __init__.py
│   │   │   │   └── utils.py
│   │   │   ├── relora.py
│   │   │   ├── ring_attn/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── adapters/
│   │   │   │   │   ├── __init__.py
│   │   │   │   │   └── batch.py
│   │   │   │   └── patch.py
│   │   │   ├── scaled_softmax_attn.py
│   │   │   ├── stablelm_attn_hijack_flash.py
│   │   │   ├── tiled_mlp/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── base.py
│   │   │   │   └── patch.py
│   │   │   ├── trainer/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── lr.py
│   │   │   │   ├── trl.py
│   │   │   │   ├── trl_vllm.py
│   │   │   │   └── utils.py
│   │   │   ├── trainer_accelerator_args.py
│   │   │   ├── trainer_fsdp_optim.py
│   │   │   ├── transformers/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── trainer_context_parallel.py
│   │   │   │   └── trainer_loss_calc.py
│   │   │   ├── transformers_fa_utils.py
│   │   │   ├── unsloth_.py
│   │   │   ├── utils.py
│   │   │   └── xformers_/
│   │   │       └── __init__.py
│   │   ├── processing_strategies.py
│   │   ├── prompt_strategies/
│   │   │   ├── __init__.py
│   │   │   ├── alpaca_chat.py
│   │   │   ├── alpaca_instruct.py
│   │   │   ├── alpaca_w_system.py
│   │   │   ├── base.py
│   │   │   ├── bradley_terry/
│   │   │   │   ├── README.md
│   │   │   │   ├── __init__.py
│   │   │   │   ├── chat_template.py
│   │   │   │   └── llama3.py
│   │   │   ├── chat_template.py
│   │   │   ├── completion.py
│   │   │   ├── context_qa.py
│   │   │   ├── creative_acr.py
│   │   │   ├── dpo/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── chat_template.py
│   │   │   │   ├── chatml.py
│   │   │   │   ├── llama3.py
│   │   │   │   ├── passthrough.py
│   │   │   │   ├── user_defined.py
│   │   │   │   └── zephyr.py
│   │   │   ├── input_output.py
│   │   │   ├── jinja_template_analyzer.py
│   │   │   ├── kto/
│   │   │   │   ├── __init__.py
│   │   │   │   ├── chatml.py
│   │   │   │   ├── llama3.py
│   │   │   │   └── user_defined.py
│   │   │   ├── llama2_chat.py
│   │   │   ├── messages/
│   │   │   │   ├── __init__.py
│   │   │   │   └── chat.py
│   │   │   ├── metharme.py
│   │   │   ├── orcamini.py
│   │   │   ├── orpo/
│   │   │   │   ├── __init__.py
│   │   │   │   └── chat_template.py
│   │   │   ├── pretrain.py
│   │   │   ├── pygmalion.py
│   │   │   ├── stepwise_supervised.py
│   │   │   └── user_defined.py
│   │   ├── prompt_tokenizers.py
│   │   ├── prompters.py
│   │   ├── scripts/
│   │   │   ├── __init__.py
│   │   │   ├── vllm_serve_lora.py
│   │   │   └── vllm_worker_ext.py
│   │   ├── telemetry/
│   │   │   ├── __init__.py
│   │   │   ├── callbacks.py
│   │   │   ├── errors.py
│   │   │   ├── manager.py
│   │   │   ├── runtime_metrics.py
│   │   │   └── whitelist.yaml
│   │   ├── train.py
│   │   └── utils/
│   │       ├── __init__.py
│   │       ├── bench.py
│   │       ├── callbacks/
│   │       │   ├── __init__.py
│   │       │   ├── comet_.py
│   │       │   ├── dynamic_checkpoint.py
│   │       │   ├── generation.py
│   │       │   ├── lisa.py
│   │       │   ├── mlflow_.py
│   │       │   ├── models.py
│   │       │   ├── opentelemetry.py
│   │       │   ├── perplexity.py
│   │       │   ├── profiler.py
│   │       │   ├── qat.py
│   │       │   ├── swanlab.py
│   │       │   ├── tokens_per_second.py
│   │       │   └── trackio_.py
│   │       ├── chat_templates/
│   │       │   ├── __init__.py
│   │       │   ├── base.py
│   │       │   └── templates/
│   │       │       ├── alpaca.jinja
│   │       │       ├── aya.jinja
│   │       │       ├── chatml.jinja
│   │       │       ├── cohere.jinja
│   │       │       ├── command_a.jinja
│   │       │       ├── command_a_rag.jinja
│   │       │       ├── command_a_tool_use.jinja
│   │       │       ├── deepseek_v2.jinja
│   │       │       ├── deepseek_v3.jinja
│   │       │       ├── exaone.jinja
│   │       │       ├── exaone4.jinja
│   │       │       ├── falcon_h1.jinja
│   │       │       ├── gemma.jinja
│   │       │       ├── gemma3.jinja
│   │       │       ├── gemma3n.jinja
│   │       │       ├── jamba.jinja
│   │       │       ├── llama3.jinja
│   │       │       ├── llama3_2_vision.jinja
│   │       │       ├── llama4.jinja
│   │       │       ├── llava.jinja
│   │       │       ├── metharme.jinja
│   │       │       ├── mistral_v1.jinja
│   │       │       ├── mistral_v2v3.jinja
│   │       │       ├── mistral_v3_tekken.jinja
│   │       │       ├── mistral_v7_tekken.jinja
│   │       │       ├── phi_3.jinja
│   │       │       ├── phi_35.jinja
│   │       │       ├── phi_4.jinja
│   │       │       ├── pixtral.jinja
│   │       │       ├── qwen2_vl.jinja
│   │       │       ├── qwen3.jinja
│   │       │       ├── qwen3_5.jinja
│   │       │       └── qwen_25.jinja
│   │       ├── collators/
│   │       │   ├── __init__.py
│   │       │   ├── batching.py
│   │       │   ├── core.py
│   │       │   ├── mamba.py
│   │       │   └── mm_chat.py
│   │       ├── comet_.py
│   │       ├── config/
│   │       │   ├── __init__.py
│   │       │   └── models/
│   │       │       └── __init__.py
│   │       ├── ctx_managers/
│   │       │   ├── __init__.py
│   │       │   └── sequence_parallel.py
│   │       ├── data/
│   │       │   ├── __init__.py
│   │       │   ├── lock.py
│   │       │   ├── rl.py
│   │       │   ├── sft.py
│   │       │   ├── shared.py
│   │       │   ├── streaming.py
│   │       │   ├── utils.py
│   │       │   └── wrappers.py
│   │       ├── datasets.py
│   │       ├── dict.py
│   │       ├── distributed.py
│   │       ├── environment.py
│   │       ├── freeze.py
│   │       ├── generation/
│   │       │   ├── __init__.py
│   │       │   └── sft.py
│   │       ├── import_helper.py
│   │       ├── logging.py
│   │       ├── lora.py
│   │       ├── mistral/
│   │       │   ├── __init__.py
│   │       │   ├── mistral3_processor.py
│   │       │   └── mistral_tokenizer.py
│   │       ├── mlflow_.py
│   │       ├── model_shard_quant.py
│   │       ├── optimizers/
│   │       │   ├── __init__.py
│   │       │   └── adopt.py
│   │       ├── quantization.py
│   │       ├── samplers/
│   │       │   ├── __init__.py
│   │       │   ├── multipack.py
│   │       │   └── utils.py
│   │       ├── schedulers.py
│   │       ├── schemas/
│   │       │   ├── __init__.py
│   │       │   ├── config.py
│   │       │   ├── datasets.py
│   │       │   ├── deprecated.py
│   │       │   ├── dynamic_checkpoint.py
│   │       │   ├── enums.py
│   │       │   ├── fsdp.py
│   │       │   ├── integrations.py
│   │       │   ├── internal/
│   │       │   │   └── __init__.py
│   │       │   ├── model.py
│   │       │   ├── multimodal.py
│   │       │   ├── peft.py
│   │       │   ├── quantization.py
│   │       │   ├── training.py
│   │       │   ├── trl.py
│   │       │   ├── utils.py
│   │       │   ├── validation.py
│   │       │   └── vllm.py
│   │       ├── tee.py
│   │       ├── tokenization.py
│   │       ├── trackio_.py
│   │       ├── train.py
│   │       ├── trainer.py
│   │       └── wandb_.py
│   └── setuptools_axolotl_dynamic_dependencies.py
├── styles.css
└── tests/
    ├── __init__.py
    ├── cli/
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── test_cli_base.py
    │   ├── test_cli_evaluate.py
    │   ├── test_cli_fetch.py
    │   ├── test_cli_inference.py
    │   ├── test_cli_interface.py
    │   ├── test_cli_merge_lora.py
    │   ├── test_cli_merge_sharded_fsdp_weights.py
    │   ├── test_cli_preprocess.py
    │   ├── test_cli_sweeps.py
    │   ├── test_cli_train.py
    │   ├── test_cli_version.py
    │   ├── test_nested_options.py
    │   └── test_utils.py
    ├── conftest.py
    ├── constants.py
    ├── core/
    │   ├── chat/
    │   │   ├── __init__.py
    │   │   ├── format/
    │   │   │   └── __init__.py
    │   │   └── test_messages.py
    │   ├── test_async_grpo.py
    │   └── test_builders.py
    ├── e2e/
    │   ├── .gitignore
    │   ├── __init__.py
    │   ├── integrations/
    │   │   ├── test_cut_cross_entropy.py
    │   │   ├── test_fp8.py
    │   │   ├── test_hooks.py
    │   │   ├── test_kd.py
    │   │   ├── test_liger.py
    │   │   ├── test_llm_compressor.py
    │   │   ├── test_scattermoe_lora_kernels.py
    │   │   ├── test_scattermoe_lora_olmoe.py
    │   │   └── test_sonicmoe.py
    │   ├── kernels/
    │   │   ├── test_geglu.py
    │   │   ├── test_lora.py
    │   │   ├── test_quantize.py
    │   │   └── test_swiglu.py
    │   ├── multigpu/
    │   │   ├── __init__.py
    │   │   ├── patched/
    │   │   │   ├── __init__.py
    │   │   │   └── test_sp.py
    │   │   ├── solo/
    │   │   │   ├── __init__.py
    │   │   │   ├── test_flex.py
    │   │   │   ├── test_gdpo.py
    │   │   │   └── test_grpo.py
    │   │   ├── test_dist_muon_fsdp2.py
    │   │   ├── test_eval.py
    │   │   ├── test_fp8_fsdp2.py
    │   │   ├── test_fsdp1.py
    │   │   ├── test_fsdp2.py
    │   │   ├── test_gemma3.py
    │   │   ├── test_llama.py
    │   │   ├── test_locking.py
    │   │   ├── test_ray.py
    │   │   └── test_tp.py
    │   ├── patched/
    │   │   ├── __init__.py
    │   │   ├── lora_kernels/
    │   │   │   ├── __init__.py
    │   │   │   └── test_lora_kernel_patching.py
    │   │   ├── test_4d_multipack_llama.py
    │   │   ├── test_activation_checkpointing.py
    │   │   ├── test_cli_integrations.py
    │   │   ├── test_fa_xentropy.py
    │   │   ├── test_falcon_samplepack.py
    │   │   ├── test_flattening.py
    │   │   ├── test_fsdp2_qlora.py
    │   │   ├── test_fused_llama.py
    │   │   ├── test_llama_s2_attention.py
    │   │   ├── test_lora_llama_multipack.py
    │   │   ├── test_mistral_samplepack.py
    │   │   ├── test_mixtral_samplepack.py
    │   │   ├── test_model_patches.py
    │   │   ├── test_peft_embeddings.py
    │   │   ├── test_phi_multipack.py
    │   │   ├── test_resume.py
    │   │   ├── test_unsloth_integration.py
    │   │   └── test_unsloth_qlora.py
    │   ├── solo/
    │   │   ├── __init__.py
    │   │   ├── test_flex.py
    │   │   └── test_relora_llama.py
    │   ├── test_activation_offloading.py
    │   ├── test_deepseekv3.py
    │   ├── test_diffusion.py
    │   ├── test_dpo.py
    │   ├── test_embeddings_lr.py
    │   ├── test_evaluate.py
    │   ├── test_falcon.py
    │   ├── test_gemma2.py
    │   ├── test_gemma3_text.py
    │   ├── test_imports.py
    │   ├── test_llama.py
    │   ├── test_llama_pretrain.py
    │   ├── test_llama_vision.py
    │   ├── test_load_model.py
    │   ├── test_lora_llama.py
    │   ├── test_mamba.py
    │   ├── test_mistral.py
    │   ├── test_mixtral.py
    │   ├── test_optimizers.py
    │   ├── test_packing_loss.py
    │   ├── test_phi.py
    │   ├── test_preprocess.py
    │   ├── test_process_reward_model_smollm2.py
    │   ├── test_profiler.py
    │   ├── test_qat.py
    │   ├── test_quantization.py
    │   ├── test_qwen.py
    │   ├── test_reward_model_smollm2.py
    │   ├── test_save_first_step.py
    │   ├── test_schedulers.py
    │   ├── test_streaming.py
    │   ├── test_tokenizer.py
    │   └── utils.py
    ├── fixtures/
    │   ├── alpaca/
    │   │   └── alpaca.json
    │   ├── conversation.json
    │   ├── conversation.missingturns.json
    │   ├── conversation.tokenized.json
    │   └── conversation.tokenized_llama2chat.json
    ├── hf_offline_utils.py
    ├── integrations/
    │   ├── __init__.py
    │   ├── test_diffusion.py
    │   ├── test_diffusion_callback.py
    │   ├── test_kd_chat_template.py
    │   ├── test_liger.py
    │   ├── test_routing_parity.py
    │   ├── test_scattermoe_autotune_telemetry.py
    │   ├── test_scattermoe_lora.py
    │   ├── test_scattermoe_lora_kernels.py
    │   ├── test_sonicmoe.py
    │   ├── test_sonicmoe_gradients.py
    │   └── test_swanlab.py
    ├── monkeypatch/
    │   ├── test_llama_attn_hijack_flash.py
    │   ├── test_pixtral_flash_attention_patch.py
    │   ├── test_qwen3_next_modeling_patch.py
    │   ├── test_trainer_accelerator_args.py
    │   ├── test_trainer_context_parallel_patch.py
    │   ├── test_trainer_loss_calc.py
    │   ├── test_trl_vllm.py
    │   └── test_voxtral_modeling_patch.py
    ├── patched/
    │   └── test_validation.py
    ├── prompt_strategies/
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── messages/
    │   │   ├── __init__.py
    │   │   └── test_chat.py
    │   ├── test_alpaca.py
    │   ├── test_chat_template_ds_schema_unification.py
    │   ├── test_chat_template_utils.py
    │   ├── test_chat_templates.py
    │   ├── test_chat_templates_advanced.py
    │   ├── test_chat_templates_mistral.py
    │   ├── test_chat_templates_thinking.py
    │   ├── test_chat_templates_tool_call_string_arguments.py
    │   ├── test_dpo_chat_templates.py
    │   ├── test_dpo_chatml.py
    │   ├── test_jinja_template_analyzer.py
    │   ├── test_raw_io.py
    │   └── test_stepwise.py
    ├── telemetry/
    │   ├── __init__.py
    │   ├── conftest.py
    │   ├── test_callbacks.py
    │   ├── test_errors.py
    │   ├── test_manager.py
    │   └── test_runtime_metrics.py
    ├── test_chunked_xentropy.py
    ├── test_context_parallel_batch_size.py
    ├── test_convert.py
    ├── test_data.py
    ├── test_datasets.py
    ├── test_dict.py
    ├── test_exact_deduplication.py
    ├── test_freeze.py
    ├── test_loaders.py
    ├── test_logging_config_file_capture.py
    ├── test_lora.py
    ├── test_normalize_config.py
    ├── test_opentelemetry_callback.py
    ├── test_packed_batch_sampler.py
    ├── test_packed_dataset.py
    ├── test_packed_pretraining.py
    ├── test_perplexity.py
    ├── test_prompt_tokenizers.py
    ├── test_prompters.py
    ├── test_revision_parameter.py
    ├── test_save_deduplicated.py
    ├── test_schedulers.py
    ├── test_streaming.py
    ├── test_tensor_parallel_batch_size.py
    ├── test_tokenizers.py
    ├── test_train.py
    ├── test_triton_kernels.py
    ├── test_utils_tee.py
    ├── test_validation_dataset.py
    └── utils/
        ├── callbacks/
        │   └── test_dynamic_checkpoint.py
        ├── data/
        │   └── test_utils.py
        ├── lora/
        │   ├── test_config_validation_lora.py
        │   ├── test_freeze_lora.py
        │   └── test_merge_lora.py
        ├── schemas/
        │   └── validation/
        │       ├── test_activation_offloading.py
        │       ├── test_default_values.py
        │       ├── test_fsdp.py
        │       └── test_moe_quant.py
        ├── test_grpo_rw_fnc.py
        ├── test_import_helper.py
        ├── test_mistral3_processor.py
        └── test_train.py

================================================
FILE CONTENTS
================================================

================================================
FILE: .axolotl-complete.bash
================================================
#!/bin/bash

_axolotl_completions() {
    local cur prev
    COMPREPLY=()
    cur="${COMP_WORDS[COMP_CWORD]}"
    prev="${COMP_WORDS[COMP_CWORD-1]}"

    # If we're completing the first argument (the command)
    if [[ $COMP_CWORD -eq 1 ]]; then
        mapfile -t COMPREPLY < <(compgen -W "delinearize-llama4 fetch lm-eval merge-sharded-fsdp-weights quantize vllm-serve evaluate inference merge-lora preprocess train" -- "$cur")
        return 0
    fi

    # Commands that should complete with directories and YAML files
    local -a yaml_commands=("merge-sharded-fsdp-weights" "quantize" "vllm-serve" "evaluate" "inference" "merge-lora" "preprocess" "train")

    # Check if previous word is in our list
    if [[ " ${yaml_commands[*]} " =~ (^|[[:space:]])$prev($|[[:space:]]) ]]; then
        # Use filename completion which handles directories properly
        compopt -o filenames
        mapfile -t COMPREPLY < <(compgen -f -- "$cur")

        # Filter to only include directories and YAML files
        local -a filtered=()
        for item in "${COMPREPLY[@]}"; do
            if [[ -d "$item" ]] || [[ "$item" == *.yaml ]] || [[ "$item" == *.yml ]]; then
                filtered+=("$item")
            fi
        done
        COMPREPLY=("${filtered[@]}")

        return 0
    fi

    # Default: no completion
    return 0
}

# Remove the -o nospace option - let filenames handle it
complete -F _axolotl_completions axolotl


================================================
FILE: .bandit
================================================
[bandit]
exclude = tests
skips = B101,B615,B102,B110


================================================
FILE: .coderabbit.yaml
================================================
# yaml-language-server: $schema=https://coderabbit.ai/integrations/schema.v2.json
language: "en-US"
early_access: false
reviews:
  profile: "chill"
  request_changes_workflow: false
  high_level_summary: true
  review_status: true
  collapse_walkthrough: true
  poem: false
  sequence_diagrams: false
  auto_review:
    enabled: true
    drafts: false
    auto_incremental_review: false
chat:
  auto_reply: true


================================================
FILE: .coveragerc
================================================
[run]
source = axolotl
omit =
    */tests/*
    setup.py

[report]
exclude_lines =
    pragma: no cover
    def __repr__
    raise NotImplementedError
    if __name__ == .__main__.:
    pass
    raise ImportError


================================================
FILE: .editorconfig
================================================
root = true

[*]
end_of_line = lf
insert_final_newline = true
trim_trailing_whitespace = true

[*.py]
indent_style = space
indent_size = 4

[**.yml]
indent_style = space
indent_size = 2


================================================
FILE: .gitattributes
================================================
data/*.jsonl filter=lfs diff=lfs merge=lfs -text


================================================
FILE: .github/CODE_OF_CONDUCT.md
================================================
# Contributor Covenant Code of Conduct

## Our Pledge

We as members, contributors, and leaders pledge to make participation in our
community a harassment-free experience for everyone, regardless of age, body
size, visible or invisible disability, ethnicity, sex characteristics, gender
identity and expression, level of experience, education, socio-economic status,
nationality, personal appearance, race, religion, or sexual identity
and orientation.

We pledge to act and interact in ways that contribute to an open, welcoming,
diverse, inclusive, and healthy community.

## Our Standards

Examples of behavior that contributes to a positive environment for our
community include:

* Demonstrating empathy and kindness toward other people
* Being respectful of differing opinions, viewpoints, and experiences
* Giving and gracefully accepting constructive feedback
* Accepting responsibility and apologizing to those affected by our mistakes,
  and learning from the experience
* Focusing on what is best not just for us as individuals, but for the
  overall community

Examples of unacceptable behavior include:

* The use of sexualized language or imagery, and sexual attention or
  advances of any kind
* Trolling, insulting or derogatory comments, and personal or political attacks
* Public or private harassment
* Publishing others' private information, such as a physical or email
  address, without their explicit permission
* Other conduct which could reasonably be considered inappropriate in a
  professional setting

## Enforcement Responsibilities

Community leaders are responsible for clarifying and enforcing our standards of
acceptable behavior and will take appropriate and fair corrective action in
response to any behavior that they deem inappropriate, threatening, offensive,
or harmful.

Community leaders have the right and responsibility to remove, edit, or reject
comments, commits, code, wiki edits, issues, and other contributions that are
not aligned to this Code of Conduct, and will communicate reasons for moderation
decisions when appropriate.

## Scope

This Code of Conduct applies within all community spaces, and also applies when
an individual is officially representing the community in public spaces.
Examples of representing our community include using an official e-mail address,
posting via an official social media account, or acting as an appointed
representative at an online or offline event.

## Enforcement

Instances of abusive, harassing, or otherwise unacceptable behavior may be
reported to the community leaders responsible for enforcement on Discord
at https://discord.gg/QYF8QrtEUm

All complaints will be reviewed and investigated promptly and fairly.

All community leaders are obligated to respect the privacy and security of the
reporter of any incident.

## Enforcement Guidelines

Community leaders will follow these Community Impact Guidelines in determining
the consequences for any action they deem in violation of this Code of Conduct:

### 1. Correction

**Community Impact**: Use of inappropriate language or other behavior deemed
unprofessional or unwelcome in the community.

**Consequence**: A private, written warning from community leaders, providing
clarity around the nature of the violation and an explanation of why the
behavior was inappropriate. A public apology may be requested.

### 2. Warning

**Community Impact**: A violation through a single incident or series
of actions.

**Consequence**: A warning with consequences for continued behavior. No
interaction with the people involved, including unsolicited interaction with
those enforcing the Code of Conduct, for a specified period of time. This
includes avoiding interactions in community spaces as well as external channels
like social media. Violating these terms may lead to a temporary or
permanent ban.

### 3. Temporary Ban

**Community Impact**: A serious violation of community standards, including
sustained inappropriate behavior.

**Consequence**: A temporary ban from any sort of interaction or public
communication with the community for a specified period of time. No public or
private interaction with the people involved, including unsolicited interaction
with those enforcing the Code of Conduct, is allowed during this period.
Violating these terms may lead to a permanent ban.

### 4. Permanent Ban

**Community Impact**: Demonstrating a pattern of violation of community
standards, including sustained inappropriate behavior,  harassment of an
individual, or aggression toward or disparagement of classes of individuals.

**Consequence**: A permanent ban from any sort of public interaction within
the community.

## Attribution

This Code of Conduct is adapted from the [Contributor Covenant][homepage],
version 2.0, available at
https://www.contributor-covenant.org/version/2/0/code_of_conduct.html.

Community Impact Guidelines were inspired by [Mozilla's code of conduct
enforcement ladder](https://github.com/mozilla/diversity).

[homepage]: https://www.contributor-covenant.org

For answers to common questions about this code of conduct, see the FAQ at
https://www.contributor-covenant.org/faq. Translations are available at
https://www.contributor-covenant.org/translations.


================================================
FILE: .github/CONTRIBUTING.md
================================================
# Contributing to axolotl

First of all, thank you for your interest in contributing to axolotl! We appreciate the time and effort you're willing to invest in making our project better. This document provides guidelines and information to make the contribution process as smooth as possible.

## Table of Contents

- [Code of Conduct](#code-of-conduct)
- [Getting Started](#getting-started)
- [How to Contribute](#how-to-contribute)
  - [Reporting Bugs](#reporting-bugs)
  - [Suggesting Enhancements](#suggesting-enhancements)
  - [Submitting Pull Requests](#submitting-pull-requests)
- [Style Guidelines](#style-guidelines)
  - [Code Style](#code-style)
  - [Commit Messages](#commit-messages)
- [Additional Resources](#additional-resources)

## Code of Conduct

All contributors are expected to adhere to our [Code of Conduct](CODE_OF_CONDUCT.md). Please read it before participating in the axolotl community.

## Getting Started

Bugs? Please check for open issue else create a new [Issue](https://github.com/axolotl-ai-cloud/axolotl/issues/new).

PRs are **greatly welcome**!

1. Fork the repository and clone it to your local machine.
2. Set up the development environment by following the instructions in the [README.md](https://github.com/axolotl-ai-cloud/axolotl/tree/main/README.md) file.
3. Explore the codebase, run tests, and verify that everything works as expected.

Please run below to setup env
```bash
pip3 install -r requirements-dev.txt -r requirements-tests.txt
pre-commit install

# test
pytest tests/
```

## How to Contribute

### Reporting Bugs

If you encounter a bug or issue while using axolotl, please open a new issue on the [GitHub Issues](https://github.com/axolotl-ai-cloud/axolotl/issues) page. Provide a clear and concise description of the problem, steps to reproduce it, and any relevant error messages or logs.

### Suggesting Enhancements

We welcome ideas for improvements and new features. To suggest an enhancement, open a new issue on the [GitHub Issues](https://github.com/axolotl-ai-cloud/axolotl/issues) page. Describe the enhancement in detail, explain the use case, and outline the benefits it would bring to the project.

### Submitting Pull Requests

1. Create a new branch for your feature or bugfix. Use a descriptive name like `feature/your-feature-name` or `fix/your-bugfix-name`.
2. Make your changes, following the [Style Guidelines](#style-guidelines) below.
3. Test your changes and ensure that they don't introduce new issues or break existing functionality.
4. Commit your changes, following the [commit message guidelines](#commit-messages).
5. Push your branch to your fork on GitHub.
6. Open a new pull request against the `main` branch of the axolotl repository. Include a clear and concise description of your changes, referencing any related issues.

#### Skipping CI Checks

You can skip certain CI checks by including specific keywords in your commit messages:

- `[skip ci]` or `skip ci` - Skips all CI checks for that commit
- `[skip-e2e]` or `skip-e2e` - Skips only end-to-end tests while running other CI checks. You may also include this in the title of your PR to disable end-to-end tests for the entire PR.

## Style Guidelines

### Code Style

axolotl uses [Ruff](https://docs.astral.sh/ruff/) as its code style guide. Please ensure that your code follows these guidelines.

Use the pre-commit linter to ensure that your code is formatted consistently.
```bash
pre-commit run --all-files
```

### Commit Messages

Write clear and concise commit messages that briefly describe the changes made in each commit. Use the imperative mood and start with a capitalized verb, e.g., "Add new feature" or "Fix bug in function".

## Additional Resources

- [GitHub Help](https://help.github.com/)
- [GitHub Pull Request Documentation](https://docs.github.com/en/github/collaborating-with-issues-and-pull-requests)
- [Ruff](https://docs.astral.sh/ruff/)

Thank you once again for your interest in contributing to axolotl. We look forward to collaborating with you and creating an even better project together!


================================================
FILE: .github/FUNDING.yml
================================================
# These are supported funding model platforms

github: # Replace with up to 4 GitHub Sponsors-enabled usernames e.g., [user1, user2]
patreon: # Replace with a single Patreon username
open_collective: # Replace with a single Open Collective username
ko_fi: # Replace with a single Ko-fi username
tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel
community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry
liberapay: # Replace with a single Liberapay username
issuehunt: # Replace with a single IssueHunt username
otechie: # Replace with a single Otechie username
lfx_crowdfunding: # Replace with a single LFX Crowdfunding project-name e.g., cloud-foundry
custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2']


================================================
FILE: .github/ISSUE_TEMPLATE/bug-report.yaml
================================================
name: Bug Report
description: File a bug report
labels: ["bug", "needs triage"]
body:
  - type: markdown
    attributes:
      value: |
        ## Before you start
        Please **make sure you are on the latest version.**
        If you encountered the issue after you installed, updated, or reloaded, **please try restarting before reporting the bug**.

  - type: checkboxes
    id: no-duplicate-issues
    attributes:
      label: "Please check that this issue hasn't been reported before."
      description: "The **Label filters** may help make your search more focussed."
      options:
        - label: "I searched previous [Bug Reports](https://github.com/axolotl-ai-cloud/axolotl/labels/bug) didn't find any similar reports."
          required: true

  - type: textarea
    id: expected
    attributes:
      label: Expected Behavior
      description: Tell us what **should** happen.
    validations:
      required: true

  - type: textarea
    id: what-happened
    attributes:
      label: Current behaviour
      description: |
        Tell us what happens instead of the expected behavior.
        Provide stacktrace and/or screenshots.
    validations:
      required: true

  - type: textarea
    id: reproduce
    attributes:
      label: Steps to reproduce
      description: |
        Which exact steps can a developer take to reproduce the issue?
        The more detail you provide, the easier it will be to narrow down and fix the bug.
        Please paste in tasks and/or queries **as text, not screenshots**.
      placeholder: |
        Example of the level of detail needed to reproduce any bugs efficiently and reliably.
        1. Go to the '...' page.
        2. Click on the '...' button.
        3. Scroll down to '...'.
        4. Observe the error.
    validations:
      required: true

  - type: textarea
    id: config
    attributes:
      label: Config yaml
      description: |
        Please attach the config yaml!
      render: yaml

  - type: textarea
    id: possible-solution
    attributes:
      label: Possible solution
      description: |
        Not obligatory, but please suggest a fix or reason for the bug, if you have an idea.


  - type: checkboxes
    id: operating-systems
    attributes:
      label: Which Operating Systems are you using?
      description: You may select more than one.
      options:
        - label: Linux
        - label: macOS
        - label: Windows

  - type: input
    id: Python-version
    attributes:
      label: Python Version
      description: Which {Programming} version are you using?
      placeholder: 3.10 / please change accordingly
    validations:
      required: true

  - type: input
    id: axolotl-branch-commit
    attributes:
      label: axolotl branch-commit
      description: On which branch/commit are you?
      placeholder: main/4d6490b
    validations:
      required: true

  - type: checkboxes
    id: acknowledgements
    attributes:
      label: 'Acknowledgements'
      description: 'Please confirm the following:'
      options:
        - label: 'My issue title is concise, descriptive, and in title casing.'
          required: true
        - label: 'I have searched the existing issues to make sure this bug has not been reported yet.'
          required: true
        - label: 'I am using the latest version of axolotl.'
          required: true
        - label: 'I have provided enough information for the maintainers to reproduce and diagnose the issue.'
          required: true


================================================
FILE: .github/ISSUE_TEMPLATE/config.yml
================================================
blank_issues_enabled: false
contact_links:
  - name: Ask a question
    url: https://github.com/axolotl-ai-cloud/axolotl/discussions/categories/q-a
    about: Ask questions and discuss with other community members
  - name: Discuss the Project in Discord
    url: https://discord.gg/HhrNrHJPRb


================================================
FILE: .github/ISSUE_TEMPLATE/docs.yml
================================================
name: Documentation Improvement / Clarity
description: Make a suggestion to improve the project documentation.
labels: ['needs triage', 'docs']
body:
  - type: markdown
    attributes:
      value: '## :book: Documentation :book:'
  - type: markdown
    attributes:
      value: |
        * Ask questions in [Discord](https://discord.gg/HhrNrHJPRb).
        * Before you file an issue read the [Contributing guide](./CONTRIBUTING.md).
        * Check to make sure someone hasn't already opened a [similar issue](https://github.com/axolotl-ai-cloud/axolotl/issues).
  - type: textarea
    attributes:
      label: What piece of documentation is affected?
      description: Please link to the article you'd like to see updated.
    validations:
      required: true
  - type: textarea
    attributes:
      label: What part(s) of the article would you like to see updated?
      description: |
        - Give as much detail as you can to help us understand the change you want to see.
        - Why should the docs be changed? What use cases does it support?
        - What is the expected outcome?
    validations:
      required: true
  - type: textarea
    attributes:
      label: Additional Information
      description: Add any other context or screenshots about the feature request here.
    validations:
      required: false
  - type: checkboxes
    id: acknowledgements
    attributes:
      label: 'Acknowledgements'
      description: 'Please confirm the following:'
      options:
        - label: 'My issue title is concise, descriptive, and in title casing.'
          required: true
        - label: 'I have searched the existing issues to make sure this feature has not been requested yet.'
          required: true
        - label: 'I have provided enough information for the maintainers to understand and evaluate this request.'
          required: true


================================================
FILE: .github/ISSUE_TEMPLATE/feature-request.yaml
================================================
name: Feature Request / Enhancement
description: Suggest a new feature or feature enhancement for the project
labels: ["enhancement", "needs triage"]
body:
  - type: checkboxes
    id: no-duplicate-issues
    attributes:
      label: "⚠️ Please check that this feature request hasn't been suggested before."
      description: "There are two locations for previous feature requests. Please search in both. Thank you. The **Label filters** may help make your search more focussed."
      options:
        - label: "I searched previous [Ideas in Discussions](https://github.com/axolotl-ai-cloud/axolotl/discussions/categories/ideas) didn't find any similar feature requests."
          required: true
        - label: "I searched previous [Issues](https://github.com/axolotl-ai-cloud/axolotl/labels/enhancement) didn't find any similar feature requests."
          required: true

  - type: textarea
    id: feature-description
    validations:
      required: true
    attributes:
      label: "🔖 Feature description"
      description: "A clear and concise description of what the feature request is."
      placeholder: "You should add ..."

  - type: textarea
    id: solution
    validations:
      required: true
    attributes:
      label: "✔️ Solution"
      description: "A clear and concise description of what you want to happen, and why."
      placeholder: "In my use-case, ..."

  - type: textarea
    id: alternatives
    validations:
      required: false
    attributes:
      label: "❓ Alternatives"
      description: "A clear and concise description of any alternative solutions or features you've considered."
      placeholder: "I have considered ..."

  - type: textarea
    id: additional-context
    validations:
      required: false
    attributes:
      label: "📝 Additional Context"
      description: "Add any other context or screenshots about the feature request here."
      placeholder: "..."

  - type: checkboxes
    id: acknowledgements
    attributes:
      label: 'Acknowledgements'
      description: 'Please confirm the following:'
      options:
        - label: 'My issue title is concise, descriptive, and in title casing.'
          required: true
        - label: 'I have searched the existing issues to make sure this feature has not been requested yet.'
          required: true
        - label: 'I have provided enough information for the maintainers to understand and evaluate this request.'
          required: true


================================================
FILE: .github/PULL_REQUEST_TEMPLATE.md
================================================
<!--- Provide a general summary of your changes in the Title above -->

# Description

<!--- Describe your changes in detail -->

## Motivation and Context

<!--- Why is this change required? What problem does it solve? -->
<!--- If it fixes an open issue, please link to the issue here. -->

## How has this been tested?

<!--- Please describe in detail how you tested your changes. -->
<!--- Include details of your testing environment, tests ran to see how -->
<!--- your change affects other areas of the code, etc. -->

## AI Usage Disclaimer

<!--- Was AI (e.g., ChatGPT, Claude, Copilot) used to generate or assist with this PR? -->
<!--- Please indicate: No / Yes (specify which tool and to what extent) -->

## Screenshots (if appropriate)

## Types of changes

<!--- What types of changes does your code introduce? Put an `x` in all the boxes that apply: -->

## Social Handles (Optional)

<!-- Thanks for submitting a bugfix or enhancement. -->
<!-- We'd love to show our thanks to you on Twitter & Discord if you provide your handle -->


================================================
FILE: .github/SECURITY.md
================================================
# Security Policy

## Supported Versions

Due to the nature of the fast development that is happening in this project, only the latest released version can be supported.

## Reporting a Vulnerability

If you find a vulnerability, please contact us on  [Discord](https://discord.gg/xcu3ECkH9a) rather than creating a GitHub issue to allow us some time to fix it before it is a known vulnerability to others.


================================================
FILE: .github/SUPPORT.md
================================================
# Support

If you need help with this project or have questions, please:

1. Check the documentation.
2. Search the existing issues and pull requests.
3. Create a new issue if your question is not answered or your problem is not solved.
4. Have a look in the [Discord server](https://discord.gg/HhrNrHJPRb)

Please note that this project is maintained by volunteers who have limited availability. We'll do our best to address your questions and concerns in a timely manner.


================================================
FILE: .github/release-drafter.yml
================================================
name-template: 'v$RESOLVED_VERSION'
tag-template: 'v$RESOLVED_VERSION'
categories:
  - title: '🚀 Features'
    labels:
      - 'feature'
      - 'enhancement'
  - title: '🐛 Bug Fixes'
    labels:
      - 'fix'
      - 'bugfix'
      - 'bug'
  - title: '🧰 Maintenance'
    label: 'chore'
change-template: '- $TITLE @$AUTHOR (#$NUMBER)'
change-title-escapes: '\<*_&' # You can add # and @ to disable mentions, and add ` to disable code blocks.
version-resolver:
  major:
    labels:
      - 'major'
  minor:
    labels:
      - 'minor'
  patch:
    labels:
      - 'patch'
  default: patch
template: |
  ## What’s Changed

  $CHANGES


================================================
FILE: .github/workflows/base.yml
================================================
name: ci-cd-base

on:
  push:
    branches:
      - "main"
    paths:
      - 'docker/Dockerfile-base'
      - 'docker/Dockerfile-uv-base'
      - '.github/workflows/base.yml'
  pull_request:
    paths:
      - 'docker/Dockerfile-base'
      - 'docker/Dockerfile-uv-base'
      - '.github/workflows/base.yml'
  workflow_dispatch:

permissions:
  contents: read

jobs:
  build-base:
    if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
    timeout-minutes: 480
    # this job needs to be run on self-hosted GPU runners...
    runs-on: ubuntu-latest-m
    env:
      HAS_DOCKERHUB_CREDS: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_TOKEN != '' }}
    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.8.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
            platforms: "linux/amd64"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.9.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.9.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.10.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.12"
            pytorch: 2.10.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-base"
            platforms: "linux/amd64,linux/arm64"
#          - cuda: "129"
#            cuda_version: 12.9.1
#            cudnn_version: ""
#            python_version: "3.12"
#            pytorch: 2.9.1
#            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
#            dockerfile: "Dockerfile-base"
#            platforms: "linux/amd64,linux/arm64"
          - cuda: "130"
            cuda_version: 13.0.0
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.9.1
            torch_cuda_arch_list: "9.0+PTX"
            dockerfile: "Dockerfile-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "130"
            cuda_version: 13.0.0
            cudnn_version: ""
            python_version: "3.12"
            pytorch: 2.9.1
            torch_cuda_arch_list: "9.0+PTX"
            dockerfile: "Dockerfile-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "130"
            cuda_version: 13.0.0
            cudnn_version: ""
            python_version: "3.12"
            pytorch: 2.10.0
            torch_cuda_arch_list: "9.0+PTX"
            dockerfile: "Dockerfile-base"
            platforms: "linux/amd64,linux/arm64"
#          - cuda: "128"
#            cuda_version: 12.8.1
#            cudnn_version: ""
#            python_version: "3.11"
#            pytorch: nightly
#            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
#            dockerfile: "Dockerfile-base-nightly"
#          # "next" is for release candidates of pytorch
#          - cuda: "128"
#            cuda_version: 12.8.1
#            cudnn_version: ""
#            python_version: "3.11"
#            pytorch: next
#            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
#            dockerfile: "Dockerfile-base-next"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Docker metadata
        id: metadata
        uses: docker/metadata-action@v5
        with:
          images: |
            axolotlai/axolotl-base
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        if: ${{ github.event_name != 'pull_request' && env.HAS_DOCKERHUB_CREDS == 'true' }}
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./docker/${{ matrix.dockerfile }}
          platforms: ${{ matrix.platforms }}
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
          build-args: |
            CUDA_VERSION=${{ matrix.cuda_version }}
            CUDNN_VERSION=${{ matrix.cudnn_version }}
            CUDA=${{ matrix.cuda }}
            PYTHON_VERSION=${{ matrix.python_version }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
            TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}
  build-base-uv:
    if: ${{ github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
    timeout-minutes: 480
    runs-on: ubuntu-latest-m
    env:
      HAS_DOCKERHUB_CREDS: ${{ secrets.DOCKERHUB_USERNAME != '' && secrets.DOCKERHUB_TOKEN != '' }}
    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.8.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.9.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.12"
            pytorch: 2.9.1
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.9.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.10.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "128"
            cuda_version: 12.8.1
            cudnn_version: ""
            python_version: "3.12"
            pytorch: 2.10.0
            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
#          - cuda: "129"
#            cuda_version: 12.9.1
#            cudnn_version: ""
#            python_version: "3.12"
#            pytorch: 2.9.1
#            torch_cuda_arch_list: "7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
#            dockerfile: "Dockerfile-uv-base"
#            platforms: "linux/amd64,linux/arm64"
          - cuda: "130"
            cuda_version: 13.0.0
            cudnn_version: ""
            python_version: "3.11"
            pytorch: 2.9.1
            torch_cuda_arch_list: "9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "130"
            cuda_version: 13.0.0
            cudnn_version: ""
            python_version: "3.12"
            pytorch: 2.9.1
            torch_cuda_arch_list: "9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
          - cuda: "130"
            cuda_version: 13.0.0
            cudnn_version: ""
            python_version: "3.12"
            pytorch: 2.10.0
            torch_cuda_arch_list: "9.0+PTX"
            dockerfile: "Dockerfile-uv-base"
            platforms: "linux/amd64,linux/arm64"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Docker metadata
        id: metadata
        uses: docker/metadata-action@v5
        with:
          images: |
            axolotlai/axolotl-base-uv
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        if: ${{ github.event_name != 'pull_request' && env.HAS_DOCKERHUB_CREDS == 'true' }}
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build
        uses: docker/build-push-action@v5
        with:
          context: .
          file: ./docker/${{ matrix.dockerfile }}
          platforms: ${{ matrix.platforms }}
          push: ${{ github.event_name != 'pull_request' }}
          tags: ${{ steps.metadata.outputs.tags }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}
          build-args: |
            CUDA_VERSION=${{ matrix.cuda_version }}
            CUDNN_VERSION=${{ matrix.cudnn_version }}
            CUDA=${{ matrix.cuda }}
            PYTHON_VERSION=${{ matrix.python_version }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
            TORCH_CUDA_ARCH_LIST=${{ matrix.torch_cuda_arch_list }}


================================================
FILE: .github/workflows/docs.yml
================================================
name: Publish Docs
on:
  push:
    branches:
      - main

permissions:
    contents: write
    pages: write

jobs:
    build-deploy:
        runs-on: ubuntu-latest
        steps:
        - name: cleanup node
          run: |
            sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL
        - name: Check out repository
          uses: actions/checkout@v4
        - name: Set up Quarto
          uses: quarto-dev/quarto-actions/setup@v2
        - name: Setup Python
          uses: actions/setup-python@v5
          with:
            python-version: '3.11'
        - name: Install dependencies
          run: |
            python3 -m pip install jupyter quartodoc
            python3 -m pip install -e .
        - name: Build autodoc
          run: quartodoc build
        - name: Publish to GitHub Pages (and render)
          uses: quarto-dev/quarto-actions/publish@v2
          with:
            target: gh-pages
          env:
            GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}


================================================
FILE: .github/workflows/lint.yml
================================================
name: lint
on:
  # check on PRs, and manual triggers
  merge_group:
  pull_request:
      types: [opened, synchronize, reopened, ready_for_review]
      paths:
       - '**.py'
       - 'requirements.txt'
       - '.github/workflows/*.yml'
       - "*.[q]md"
       - "examples/**/*.y[a]?ml"
       - ".pre-commit-config.yaml"
  workflow_dispatch:

permissions:
  contents: read

jobs:
  pre-commit:
    name: pre-commit
    runs-on: ubuntu-latest
    if: ${{ !github.event.pull_request.draft }}
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.1


================================================
FILE: .github/workflows/main.yml
================================================
name: ci-cd

on:
  push:
    branches:
      - "main"
    tags:
      - "v*"
  workflow_dispatch:

permissions:
  contents: read

jobs:
  build-axolotl:
    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.8.0
            axolotl_extras:
            platforms: "linux/amd64"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.0
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
            is_latest: true
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.12"
            pytorch: 2.10.0
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
#          - cuda: 129
#            cuda_version: 12.9.1
#            python_version: "3.12"
#            pytorch: 2.9.1
#            axolotl_extras:
#            platforms: "linux/amd64,linux/arm64"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.12"
            pytorch: 2.10.0
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Docker metadata
        id: metadata
        uses: docker/metadata-action@v5
        with:
          images: |
            axolotlai/axolotl
          tags: |
            type=ref,event=branch
            type=pep440,pattern={{version}}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      # guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
      - name: Build and export to Docker
        uses: docker/build-push-action@v5
        with:
          context: .
          platforms: ${{ matrix.platforms }}
          build-args: |
            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
            CUDA=${{ matrix.cuda }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
            AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}
          file: ./docker/Dockerfile
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
            ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
          labels: ${{ steps.metadata.outputs.labels }}

  build-axolotl-uv:
    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.12"
            pytorch: 2.9.1
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
            is_latest: true
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.12"
            pytorch: 2.10.0
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.12"
            pytorch: 2.10.0
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Docker metadata
        id: metadata
        uses: docker/metadata-action@v5
        with:
          images: |
            axolotlai/axolotl-uv
          tags: |
            type=ref,event=branch
            type=pep440,pattern={{version}}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      # guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
      - name: Build and export to Docker
        uses: docker/build-push-action@v5
        with:
          context: .
          platforms: ${{ matrix.platforms }}
          build-args: |
            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
            CUDA=${{ matrix.cuda }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
            AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}
          file: ./docker/Dockerfile-uv
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
            ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
          labels: ${{ steps.metadata.outputs.labels }}

  build-axolotl-cloud:
    needs: build-axolotl
    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.8.0
            axolotl_extras:
            platforms: "linux/amd64"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.0
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
            is_latest: true
            platforms: "linux/amd64,linux/arm64"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.12"
            pytorch: 2.10.0
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
#          - cuda: 129
#            cuda_version: 12.9.1
#            python_version: "3.12"
#            pytorch: 2.9.1
#            axolotl_extras:
#            platforms: "linux/amd64,linux/arm64"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.12"
            pytorch: 2.10.0
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Docker metadata
        id: metadata
        uses: docker/metadata-action@v5
        with:
          images: |
            axolotlai/axolotl-cloud
          tags: |
            type=ref,event=branch
            type=pep440,pattern={{version}}
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build
        uses: docker/build-push-action@v5
        with:
          context: .
          platforms: ${{ matrix.platforms }}
          build-args: |
            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            CUDA=${{ matrix.cuda }}
          file: ./docker/Dockerfile-cloud
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
             ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
          labels: ${{ steps.metadata.outputs.labels }}

  build-axolotl-cloud-uv:
    needs: build-axolotl-uv
    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.12"
            pytorch: 2.9.1
            axolotl_extras:
            is_latest: true
            platforms: "linux/amd64,linux/arm64"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.12"
            pytorch: 2.10.0
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.12"
            pytorch: 2.10.0
            axolotl_extras:
            platforms: "linux/amd64,linux/arm64"
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Docker metadata
        id: metadata
        uses: docker/metadata-action@v5
        with:
          images: |
            axolotlai/axolotl-cloud-uv
          tags: |
            type=ref,event=branch
            type=pep440,pattern={{version}}
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build
        uses: docker/build-push-action@v5
        with:
          context: .
          platforms: ${{ matrix.platforms }}
          build-args: |
            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            CUDA=${{ matrix.cuda }}
          file: ./docker/Dockerfile-cloud-uv
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
             ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
          labels: ${{ steps.metadata.outputs.labels }}

  build-axolotl-cloud-no-tmux:
    needs: build-axolotl
    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
            is_latest: true
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
            is_latest:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Docker metadata
        id: metadata
        uses: docker/metadata-action@v5
        with:
          images: |
            axolotlai/axolotl-cloud-term
          tags: |
            type=ref,event=branch
            type=pep440,pattern={{version}}
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build
        uses: docker/build-push-action@v5
        with:
          context: .
          platforms: linux/amd64,linux/arm64
          build-args: |
            BASE_TAG=${{ github.ref_type == 'tag' && 'main' || github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            CUDA=${{ matrix.cuda }}
          file: ./docker/Dockerfile-cloud-no-tmux
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
             ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
             ${{ (matrix.is_latest) && format('{0}-latest', steps.metadata.outputs.tags) || '' }}
          labels: ${{ steps.metadata.outputs.labels }}


================================================
FILE: .github/workflows/multi-gpu-e2e.yml
================================================
name: docker-multigpu-tests-biweekly

on:
  pull_request:
    paths:
      - 'tests/e2e/multigpu/**.py'
      - 'requirements.txt'
      - 'setup.py'
      - 'pyproject.toml'
      - '.github/workflows/multi-gpu-e2e.yml'
      - 'scripts/cutcrossentropy_install.py'
      - 'src/axolotl/core/trainers/mixins/sequence_parallel.py'
      - 'src/axolotl/utils/distributed.py'
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * 1,4'  # Runs at 00:00 UTC every monday & thursday

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

permissions:
  contents: read

env:
  MODAL_IMAGE_BUILDER_VERSION: "2025.06"

jobs:
  test-axolotl-multigpu:
    if: ${{ ! contains(github.event.commits[0].message, '[skip e2e]') && github.repository_owner == 'axolotl-ai-cloud' && (github.event_name != 'pull_request' || !github.event.pull_request.draft) }}
    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.8.0
            axolotl_extras: fbgemm-gpu
            num_gpus: 2
#          - cuda: 129
#            cuda_version: 12.9.1
#            python_version: "3.12"
#            pytorch: 2.9.1
#            axolotl_extras: "fbgemm-gpu"
#            num_gpus: 2
#            dockerfile: "Dockerfile-uv.jinja"
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
#            axolotl_extras: fbgemm-gpu
            num_gpus: 2
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.10.0
            axolotl_extras: "fbgemm-gpu"
            num_gpus: 2
            dockerfile: "Dockerfile-uv.jinja"
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
          pip install modal==1.3.0.post1 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        run: |
          modal run -m cicd.multigpu


================================================
FILE: .github/workflows/nightlies.yml
================================================
name: docker-nightlies

on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * *'  # Runs at 00:00 UTC every day

permissions:
  contents: read

jobs:
  build-axolotl:
    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.8.0
            axolotl_extras:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Docker metadata
        id: metadata
        uses: docker/metadata-action@v5
        with:
          images: |
            axolotlai/axolotl
          tags: |
            type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      # guidance for testing before pushing: https://docs.docker.com/build/ci/github-actions/test-before-push/
      - name: Build and export to Docker
        uses: docker/build-push-action@v5
        with:
          context: .
          build-args: |
            BASE_TAG=${{ github.ref_name }}-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}
            CUDA=${{ matrix.cuda }}
            PYTORCH_VERSION=${{ matrix.pytorch }}
            AXOLOTL_ARGS=${{ matrix.axolotl_args }}
          file: ./docker/Dockerfile
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
            ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}

  build-axolotl-cloud:
    needs: build-axolotl
    if: ${{ ! contains(github.event.commits[0].message, '[skip docker]') && github.repository_owner == 'axolotl-ai-cloud' }}
    # this job needs to be run on self-hosted GPU runners...
    strategy:
      matrix:
        include:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.8.0
            axolotl_extras:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.1
            axolotl_extras:
    runs-on: axolotl-gpu-runner
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Docker metadata
        id: metadata
        uses: docker/metadata-action@v5
        with:
          images: |
            axolotlai/axolotl-cloud
          tags: |
            type=raw,value={{ branch }}-{{ date 'YYYYMMDD' }}
      - name: Login to Docker Hub
        uses: docker/login-action@v3
        with:
          username: ${{ secrets.DOCKERHUB_USERNAME }}
          password: ${{ secrets.DOCKERHUB_TOKEN }}
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3
      - name: Build
        uses: docker/build-push-action@v5
        with:
          context: .
          build-args: |
            BASE_TAG=${{ github.ref_name }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
            CUDA=${{ matrix.cuda }}
          file: ./docker/Dockerfile-cloud
          push: ${{ github.event_name != 'pull_request' }}
          tags: |
             ${{ steps.metadata.outputs.tags }}-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}${{ matrix.axolotl_extras != '' && '-' || '' }}${{ matrix.axolotl_extras }}
          labels: ${{ steps.metadata.outputs.labels }}


================================================
FILE: .github/workflows/precommit-autoupdate.yml
================================================
name: Pre-commit auto-update

on:
  schedule:
    - cron: '0 0 1 * *'  # Run monthly
  workflow_dispatch:  # Manual kickoff

permissions: {}

jobs:
  auto-update:
    runs-on: ubuntu-latest
    permissions:
      contents: write
      pull-requests: write
    steps:
      - uses: actions/checkout@v4

      - uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Update pre-commit hooks
        id: update
        run: |
          pip install pre-commit
          pre-commit autoupdate
          if [[ -n $(git status --porcelain) ]]; then
            echo "changes=true" >> $GITHUB_OUTPUT
          fi

      - name: Create Pull Request
        if: steps.update.outputs.changes == 'true'
        uses: peter-evans/create-pull-request@v6
        with:
          token: ${{ secrets.GITHUB_TOKEN }}
          branch: update/pre-commit-hooks
          delete-branch: true
          title: "chore: update pre-commit hooks"
          commit-message: "chore: update pre-commit hooks"
          body: |
            Automated PR to update pre-commit hooks to their latest versions.


================================================
FILE: .github/workflows/preview-docs.yml
================================================
name: Preview
on:
  workflow_dispatch:
  pull_request:
    types: [opened, synchronize, reopened, ready_for_review]

    # Run the workflow only when one of these files changes
    paths:
      - '**/*.md'      # any Markdown file
      - '**/*.qmd'     # any Quarto file
      - '_quarto.yml'
      - docs/scripts/generate_config_docs.py
      - src/axolotl/utils/schemas/**.py
      - .github/workflows/preview-docs.yml

permissions:
  contents: read
  pull-requests: write

jobs:
  preview:
    runs-on: ubuntu-latest
    if: ${{ !github.event.pull_request.draft }}
    steps:
      - name: cleanup node
        run: |
          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL

      - name: Check out repository
        uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha }}

      - name: Set up Quarto
        uses: quarto-dev/quarto-actions/setup@v2

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'

      - name: Install dependencies
        run: |
          python3 -m pip install jupyter quartodoc
          python3 -m pip install -e .

      - name: Build autodoc
        run: quartodoc build

      - name: Quarto render
        run: quarto render

      - name: Netlify Publish
        uses: nwtgck/actions-netlify@v3.0
        if: ${{ github.event.pull_request.head.repo.full_name == github.repository }}
        id: netlify
        with:
          publish-dir: './_site'
          enable-pull-request-comment: false
          enable-github-deployment: false
          github-token: ${{ secrets.GITHUB_TOKEN }}
          deploy-message: "Deployed On Netlify"
          github-deployment-environment: 'preview'
          github-deployment-description: 'Preview Deployment'
        env:
          NETLIFY_AUTH_TOKEN: ${{ secrets.NETLIFY_AUTH_TOKEN }}
          NETLIFY_SITE_ID: ${{ secrets.NETLIFY_SITE_ID }}

      - name: Update PR with preview link
        if: ${{ steps.netlify.outcome == 'success' }}
        uses: marocchino/sticky-pull-request-comment@v2
        with:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
          message: |
            📖 **Documentation Preview**: ${{ steps.netlify.outputs.deploy-url }}

            Deployed on Netlify from commit ${{ github.event.pull_request.head.sha }}


================================================
FILE: .github/workflows/pypi.yml
================================================
name: publish pypi

on:
  push:
    tags:
      - "v*"
  workflow_dispatch:

permissions: {}

jobs:
  setup_release:
    name: Create Release
    runs-on: ubuntu-latest
    permissions:
      contents: write
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Create release
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: gh release create "$GITHUB_REF_NAME" --generate-notes
  pypi-publish:
    name: Upload release to PyPI
    runs-on: ubuntu-latest
    needs: [setup_release]
    environment:
      name: pypi
      url: https://pypi.org/p/axolotl
    permissions:
      contents: read
      id-token: write # IMPORTANT: this permission is mandatory for trusted publishing
    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"

      - name: Install dependencies
        run: |
          pip3 install wheel packaging==26.0
          pip3 install --no-build-isolation -e .
          pip3 install -r requirements-dev.txt -r requirements-tests.txt

      - name: Extract tag name
        id: tag
        run: echo "TAG_NAME=$(echo $GITHUB_REF | cut -d / -f 3)" >> "$GITHUB_OUTPUT"

      - name: Update version in VERSION file
        run: |
          echo "${{ steps.tag.outputs.TAG_NAME }}" | sed 's/^v//' > VERSION

      - name: Build a source dist
        run: |
          python setup.py sdist

      - name: Publish package distributions to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1


================================================
FILE: .github/workflows/tests-nightly.yml
================================================
name: Tests Nightly against upstream main
on:
  workflow_dispatch:
  schedule:
    - cron: '0 0 * * *'  # Runs at 00:00 UTC every day
  pull_request:
    types: [opened, synchronize, reopened, ready_for_review]
    paths:
      - '.github/workflows/tests-nightly.yml'

permissions:
  contents: read

jobs:
  pre-commit:
    name: pre-commit
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
        env:
          SKIP: no-commit-to-branch

  prime-cdn-s3-cache:
    name: Prefetch S3 once to prime the CDN cache
    runs-on: ubuntu-latest
    if: ${{ !github.event.pull_request.draft }}
    timeout-minutes: 10
    steps:
      - name: Restore Cache from S3
        id: hf-cache-restore-s3
        run: |
          curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null

  pytest:
    name: PyTest
    runs-on: ubuntu-latest
    needs: [prime-cdn-s3-cache]
    strategy:
      fail-fast: false
      matrix:
        python_version: ["3.12"]  # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
        pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
    timeout-minutes: 20

    steps:
      - name: Check out repository code
        uses: actions/checkout@v4

      - name: Restore Cache from S3
        id: hf-cache-restore-s3
        run: |
          mkdir -p /home/runner/.cache/huggingface/hub
          curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xf - -C /home/runner/.cache/huggingface/hub/  --use-compress-program unzstd

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
          cache: 'pip' # caching pip dependencies

      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
          pip3 install --upgrade packaging==26.0 setuptools==78.1.1 wheel

      - name: Install PyTorch
        run: |
          pip3 install torch==${{ matrix.pytorch_version }} torchvision

      - name: Update requirements.txt
        run: |
          sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt
          sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt
          sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt
          sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt
          sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt

      - name: Install dependencies
        run: |
          pip3 show torch
          pip3 install --no-build-isolation -U -e .
          python scripts/unsloth_install.py | sh
          python scripts/cutcrossentropy_install.py | sh
          pip3 install -r requirements-dev.txt -r requirements-tests.txt

      - name: Make sure PyTorch version wasn't clobbered
        run: |
          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"

      - name: Ensure axolotl CLI was installed
        run: |
          axolotl --help

      - name: Run tests
        run: |
          pytest -v --durations=10 -n8 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ tests/
          pytest -v --durations=10 tests/patched/
          pytest -v --durations=10 tests/cli/

      - name: cleanup pip cache
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

  docker-e2e-tests:
    if: github.repository_owner == 'axolotl-ai-cloud'
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    needs: [pre-commit, pytest]

    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
            nightly_build: "true"
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.10.0
            num_gpus: 1
            axolotl_extras:
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.12"
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
            dockerfile: "Dockerfile-uv.jinja"
            nightly_build: "true"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
          pip install modal==1.3.0.post1 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        run: |
          modal run cicd.e2e_tests
  docker-e2e-multigpu-tests:
    if: github.repository_owner == 'axolotl-ai-cloud'
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    needs: [pre-commit, pytest, docker-e2e-tests]

    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.1
            num_gpus: 2
            axolotl_extras:
            nightly_build: "true"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
          pip install modal==1.3.0.post1 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "NIGHTLY_BUILD=${{ matrix.nightly_build }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        run: |
          modal run cicd.multigpu


================================================
FILE: .github/workflows/tests.yml
================================================
name: Tests
on:
  # check on push/merge to main, PRs, and manual triggers
  merge_group:
  push:
    branches:
      - "main"
    paths:
      - '**.py'
      - 'requirements.txt'
      - '.github/workflows/*.yml'
      - 'requirements-tests.txt'
      - 'cicd/cicd.sh'
      - 'cicd/Dockerfile.jinja'
  pull_request:
      types: [opened, synchronize, reopened, ready_for_review]
      paths:
       - '**.py'
       - 'requirements.txt'
       - '.github/workflows/*.yml'
       - 'requirements-tests.txt'
       - 'cicd/cicd.sh'
       - 'cicd/Dockerfile.jinja'
  workflow_dispatch:

# Cancel jobs on the same ref if a new one is triggered
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}
  cancel-in-progress: ${{ github.ref != 'refs/heads/main' }}

permissions:
  contents: read

env:
  TRANSFORMERS_IS_CI: "yes"

jobs:
  pre-commit:
    name: pre-commit
    runs-on: ubuntu-latest
    if: ${{ !github.event.pull_request.draft }}
    steps:
      - uses: actions/checkout@v4
      - uses: actions/setup-python@v5
        with:
          python-version: "3.11"
          cache: 'pip' # caching pip dependencies
      - uses: pre-commit/action@v3.0.1
        env:
          SKIP: no-commit-to-branch

  prime-cdn-s3-cache:
    name: Prefetch S3 once to prime the CDN cache
    runs-on: ubuntu-latest
    if: ${{ !github.event.pull_request.draft }}
    timeout-minutes: 10
    steps:
      - name: Restore Cache from S3
        id: hf-cache-restore-s3
        run: |
          curl -v -H "Range: bytes=0-1023" -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst > /dev/null

  pytest:
    name: PyTest
    runs-on: ubuntu-latest
    if: ${{ !github.event.pull_request.draft }}
    needs: [prime-cdn-s3-cache]
    strategy:
      fail-fast: false
      matrix:
        python_version: ["3.12"]  # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
        pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
#        exclude:
#          - python_version: "3.14"
#            pytorch_version: "2.8.0"
#          - python_version: "3.14"
#            pytorch_version: "2.9.1"
    timeout-minutes: 20

    steps:
      - name: cleanup node
        run: |
          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL

      - name: Check out repository code
        uses: actions/checkout@v4

      - name: Restore Cache from S3
        id: hf-cache-restore-s3
        run: |
          mkdir -p ~/.cache/huggingface/hub
          curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/  --use-compress-program unzstd --strip-components=1
          ls -ltr ~/.cache/huggingface/hub/

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
          cache: 'pip' # caching pip dependencies

      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 wheel

      - name: Install PyTorch
        run: |
          pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision

      - name: Install dependencies
        run: |
          pip3 show torch
          pip3 install --no-cache-dir --no-build-isolation -U -e .
          python scripts/unsloth_install.py | sh
          python scripts/cutcrossentropy_install.py | sh
          pip3 install -r requirements-dev.txt -r requirements-tests.txt

      - name: cleanup pip cache
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

      - name: Make sure PyTorch version wasn't clobbered
        run: |
          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"

      - name: Ensure axolotl CLI was installed
        run: |
          axolotl --help

      - name: Pre-Download dataset fixture
        run: |
          hf download --repo-type=dataset axolotl-ai-internal/axolotl-oss-dataset-fixtures

      - name: Show HF cache
        run: hf cache ls

      - name: Run tests
        run: |
          df -h
          pytest -v --durations=10 -n4 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
          df -h
          pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
          df -h
          pytest -v --durations=10 tests/patched/ --cov=axolotl --cov-append --cov-report=xml
          df -h
          pytest -v --durations=10 tests/cli/ --cov=axolotl --cov-append --cov-report=xml

      - name: Show HF cache
        run: hf cache ls

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v5
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          files: ./coverage.xml
          flags: unittests,pytorch-${{ matrix.pytorch_version }}
          fail_ci_if_error: false

  pytest-sdist:
    name: PyTest from Source Dist
    runs-on: ubuntu-latest
    if: ${{ !github.event.pull_request.draft }}
    needs: [prime-cdn-s3-cache]
    strategy:
      fail-fast: false
      matrix:
        python_version: ["3.12"]  # TODO include py3.14 once https://github.com/mistralai/mistral-common/pull/194 is merged
        pytorch_version: ["2.8.0", "2.9.1", "2.10.0"]
#        exclude:
#          - python_version: "3.14"
#            pytorch_version: "2.8.0"
#          - python_version: "3.14"
#            pytorch_version: "2.9.1"
    timeout-minutes: 30

    steps:
      - name: cleanup node
        run: |
          sudo rm -rf /usr/share/dotnet /usr/local/lib/android /opt/ghc /opt/hostedtoolcache/CodeQL

      - name: Check out repository code
        uses: actions/checkout@v4

      - name: Restore Cache from S3
        id: hf-cache-restore-s3
        run: |
          mkdir -p ~/.cache/huggingface/hub
          curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C ~/.cache/huggingface/hub/  --use-compress-program unzstd --strip-components=1
          ls -ltr ~/.cache/huggingface/hub/

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: ${{ matrix.python_version }}
          cache: 'pip' # caching pip dependencies

      - name: upgrade pip
        run: |
          pip3 install --upgrade pip
          pip3 install --upgrade packaging==26.0 setuptools==75.8.0 setuptools_scm build wheel psutil

      - name: Install PyTorch
        run: |
          pip3 install --no-cache-dir torch==${{ matrix.pytorch_version }} torchvision

      - name: Install dependencies
        run: |
          pip3 show torch
          python -m build --no-isolation --sdist
          pip3 install --no-cache-dir --no-build-isolation dist/axolotl*.tar.gz
          python scripts/unsloth_install.py | sh
          python scripts/cutcrossentropy_install.py | sh
          pip3 install -r requirements-dev.txt -r requirements-tests.txt

      - name: cleanup pip cache
        run: |
          find "$(pip cache dir)/http-v2" -type f -mtime +14 -exec rm {} \;

      - name: Make sure PyTorch version wasn't clobbered
        run: |
          python -c "import torch; assert '${{ matrix.pytorch_version }}' in torch.__version__"

      - name: Ensure axolotl CLI was installed
        run: |
          axolotl --help

      - name: Show HF cache
        run: hf cache ls

      - name: Run tests
        run: |
          pytest -v --durations=10 -n4 --dist loadfile --ignore=tests/e2e/ --ignore=tests/patched/ --ignore=tests/cli/ --ignore=tests/monkeypatch/ tests/ --cov=axolotl --cov-report=xml
          pytest -v --durations=10 tests/monkeypatch/ --cov=axolotl --cov-append --cov-report=xml
          pytest -v --durations=10 tests/cli/

      - name: Show HF cache
        run: hf cache ls

  gate-skip-e2e:
    needs: [pre-commit]
    runs-on: ubuntu-latest
    outputs:
      skip: ${{ steps.compute.outputs.skip }}
    steps:
      - uses: actions/github-script@v7
        id: compute
        with:
          script: |
            const token = /\[skip-e2e\]/i;
            let msg = '';
            if (context.eventName === 'push') {
              msg = context.payload.head_commit?.message || '';
            } else if (context.eventName === 'pull_request') {
              const { owner, repo } = context.repo;
              const prNumber = context.payload.pull_request.number;
              const commits = await github.paginate(
                github.rest.pulls.listCommits,
                { owner, repo, pull_number: prNumber, per_page: 100 }
              );
              msg = commits.at(-1)?.commit?.message || '';
            }
            const title = context.payload.pull_request?.title || '';
            const body  = context.payload.pull_request?.body  || '';
            const skip = token.test(msg) || token.test(title) || token.test(body);
            core.setOutput('skip', String(skip));

  docker-e2e-tests-1st:
    # Run this job first as a gate for running the remainder of the test matrix
    if: >
      github.repository_owner == 'axolotl-ai-cloud' &&
      (github.event_name != 'pull_request' || !github.event.pull_request.draft) &&
      needs.gate-skip-e2e.outputs.skip != 'true'
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    needs: [pre-commit, pytest]

    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.12"
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
            dockerfile: "Dockerfile-uv.jinja"
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
          pip install modal==1.3.0.post1 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        run: |
          modal run cicd.e2e_tests

  docker-e2e-tests:
    if: >
      github.repository_owner == 'axolotl-ai-cloud' &&
      (github.event_name != 'pull_request' || !github.event.pull_request.draft) &&
      needs.gate-skip-e2e.outputs.skip != 'true'
    # this job needs to be run on self-hosted GPU runners...
    runs-on: [self-hosted, modal]
    timeout-minutes: 120
    # Only run the remainder of the matrix if the first e2e check passed;
    # this is to save on wasted compute costs for known failures that get caught in the first run
    needs: [pre-commit, pytest, gate-skip-e2e, docker-e2e-tests-1st]

    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.8.0
            num_gpus: 1
            gpu_type: "B200"
            axolotl_extras: fbgemm-gpu
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.10.0
            num_gpus: 1
            axolotl_extras:
          - cuda: 130
            cuda_version: 13.0.0
            python_version: "3.11"
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
          pip install modal==1.3.0.post1 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
          echo "GPU_TYPE=${{ matrix.gpu_type || 'L40S'}}" >> $GITHUB_ENV
          echo "E2E_DOCKERFILE=${{ matrix.dockerfile || 'Dockerfile.jinja'}}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        env:
          CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
        run: |
          modal run cicd.e2e_tests

  docker-e2e-cleanup:
    runs-on: [self-hosted, modal]
    timeout-minutes: 90
    needs: [docker-e2e-tests]
    if: ${{ !github.event.pull_request.draft }}

    strategy:
      fail-fast: false
      matrix:
        include:
          - cuda: 128
            cuda_version: 12.8.1
            python_version: "3.11"
            pytorch: 2.9.1
            num_gpus: 1
            axolotl_extras:
    steps:
      - name: Checkout
        uses: actions/checkout@v4
      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.11"
      - name: Install Modal
        run: |
          python -m pip install --upgrade pip
          pip install modal==1.3.0.post1 jinja2
      - name: Update env vars
        run: |
          echo "BASE_TAG=main-base-py${{ matrix.python_version }}-cu${{ matrix.cuda }}-${{ matrix.pytorch }}" >> $GITHUB_ENV
          echo "PYTORCH_VERSION=${{ matrix.pytorch}}" >> $GITHUB_ENV
          echo "AXOLOTL_ARGS=${{ matrix.axolotl_args}}" >> $GITHUB_ENV
          echo "AXOLOTL_EXTRAS=${{ matrix.axolotl_extras}}" >> $GITHUB_ENV
          echo "CUDA=${{ matrix.cuda }}" >> $GITHUB_ENV
          echo "MODAL_IMAGE_BUILDER_VERSION=2024.10" >> $GITHUB_ENV
          echo "N_GPUS=${{ matrix.num_gpus }}" >> $GITHUB_ENV
      - name: Run tests job on Modal
        run: |
          modal run cicd.cleanup


================================================
FILE: .gitignore
================================================
**/axolotl.egg-info
configs
last_run_prepared/
outputs
.vscode
_site/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/
venv3.10/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
.idea/

# WandB
# wandb creates a folder to store logs for training runs
wandb

# Runs
lora-out/*
qlora-out/*
mlruns/*

/.quarto/
prepared-datasets/
submit.sh
*.out*

# Quartodoc generated files
objects.json
site_libs/

typings/
out/

# vim
*.swp

# scm auto-versioning
src/axolotl/_version.py


================================================
FILE: .mypy.ini
================================================
[mypy]
plugins = pydantic.mypy
exclude = venv

[mypy-alpaca_lora_4bit.*]
ignore_missing_imports = True

[mypy-axolotl.monkeypatch.*]
ignore_errors = True

[mypy-axolotl.models.mixtral.*]
ignore_errors = True

[mypy-axolotl.integrations.liger.models.*]
ignore_errors = True

[mypy-axolotl.models.phi.*]
ignore_errors = True

[mypy-flash_attn.*]
ignore_missing_imports = True

[mypy-huggingface_hub]
ignore_missing_imports = True

[mypy-transformers.*]
ignore_missing_imports = True

[mypy-peft]
ignore_missing_imports = True

[mypy-wandb]
ignore_missing_imports = True

[mypy-bitsandbytes]
ignore_missing_imports = True

[mypy-requests]
ignore_missing_imports = True

[mypy-datasets]
ignore_missing_imports = True

[mypy-fire]
ignore_missing_imports = True

[mypy-setuptools]
ignore_missing_imports = True

[mypy-addict]
ignore_missing_imports = True

[mypy-xformers.*]
ignore_missing_imports = True


================================================
FILE: .pre-commit-config.yaml
================================================
default_language_version:
    python: python3

repos:
-   repo: https://github.com/pre-commit/pre-commit-hooks
    rev: v6.0.0
    hooks:
    -   id: check-yaml
    -   id: end-of-file-fixer
    -   id: trailing-whitespace
    -   id: no-commit-to-branch
        args: ['--branch', 'main']
-   repo: https://github.com/astral-sh/ruff-pre-commit
    rev: v0.15.4
    hooks:
    -   id: ruff
        args: [--fix]
    -   id: ruff-format
-   repo: https://github.com/pre-commit/mirrors-mypy
    rev: v1.19.1
    hooks:
    - id: mypy
      additional_dependencies:
        [
            'types-PyYAML',
            'pydantic>=2.5.3',
        ]
-   repo: https://github.com/PyCQA/bandit
    rev: 1.9.4
    hooks:
    -   id: bandit
        args: [
            '--ini',
            '.bandit',
        ]


================================================
FILE: .runpod/.gitignore
================================================
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
share/python-wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
#  Usually these files are written by a python script from a template
#  before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.nox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
*.py,cover
.hypothesis/
.pytest_cache/
cover/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3
db.sqlite3-journal

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
.pybuilder/
target/

# Jupyter Notebook
.ipynb_checkpoints

# IPython
profile_default/
ipython_config.py

# pyenv
#   For a library or package, you might want to ignore these files since the code is
#   intended to run in multiple environments; otherwise, check them in:
# .python-version

# pipenv
#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
#   However, in case of collaboration, if having platform-specific dependencies or dependencies
#   having no cross-platform support, pipenv may install dependencies that don't work, or not
#   install all needed dependencies.
#Pipfile.lock

# poetry
#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
#   This is especially recommended for binary packages to ensure reproducibility, and is more
#   commonly ignored for libraries.
#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
#poetry.lock

# pdm
#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
#pdm.lock
#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
#   in version control.
#   https://pdm.fming.dev/#use-with-ide
.pdm.toml

# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
__pypackages__/

# Celery stuff
celerybeat-schedule
celerybeat.pid

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/
.dmypy.json
dmypy.json

# Pyre type checker
.pyre/

# pytype static type analyzer
.pytype/

# Cython debug symbols
cython_debug/

# PyCharm
#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
#  and can be added to the global gitignore or merged into this file.  For a more nuclear
#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
#.idea/
pod/scripts/config.yaml


================================================
FILE: .runpod/Dockerfile
================================================
FROM axolotlai/axolotl-cloud:main-py3.11-cu124-2.6.0

COPY .runpod/requirements.txt /requirements.txt
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install --upgrade pip && \
    python3 -m pip install --upgrade -r /requirements.txt

# Environment settings
ARG BASE_VOLUME="/runpod-volume"
ENV BASE_VOLUME=$BASE_VOLUME
ENV HF_DATASETS_CACHE="${BASE_VOLUME}/huggingface-cache/datasets"
ENV HUGGINGFACE_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
ENV HF_HUB_CACHE="${BASE_VOLUME}/huggingface-cache/hub"
ENV TRANSFORMERS_CACHE="${BASE_VOLUME}/huggingface-cache/hub"

COPY .runpod/src /src

WORKDIR /src
CMD ["python3", "/src/handler.py"]


================================================
FILE: .runpod/README.md
================================================
<h1>LLM Post Training- Full fine-tune, LoRA, QLoRa etc. Llama/Mistral/Gemma and more</h1>

# Configuration Options

This document outlines all available configuration options for training models. The configuration can be provided as a JSON request.

## Usage

You can use these configuration Options:

1. As a JSON request body:

```json
{
  "input": {
    "user_id": "user",
    "model_id": "model-name",
    "run_id": "run-id",
    "credentials": {
      "wandb_api_key": "", # add your Weights & biases key. TODO:  you will be able to set this in Enviornment variables.
      "hf_token": "", # add your HF_token. TODO:  you will be able to set this in Enviornment variables.
    },
    "args": {
      "base_model": "NousResearch/Llama-3.2-1B",
      // ... other options
    }
  }
}
```

## Configuration Options

### Model Configuration

| Option              | Description                                                                                   | Default              |
| ------------------- | --------------------------------------------------------------------------------------------- | -------------------- |
| `base_model`        | Path to the base model (local or HuggingFace)                                                 | Required             |
| `base_model_config` | Configuration path for the base model                                                         | Same as base_model   |
| `revision_of_model` | Specific model revision from HuggingFace hub                                                  | Latest               |
| `tokenizer_config`  | Custom tokenizer configuration path                                                           | Optional             |
| `model_type`        | Type of model to load                                                                         | AutoModelForCausalLM |
| `tokenizer_type`    | Type of tokenizer to use                                                                      | AutoTokenizer        |
| `hub_model_id`      | Repository ID where the model will be pushed on Hugging Face Hub (format: username/repo-name) | Optional             |

## Model Family Identification

| Option                     | Default | Description                    |
| -------------------------- | ------- | ------------------------------ |
| `is_falcon_derived_model`  | `false` | Whether model is Falcon-based  |
| `is_llama_derived_model`   | `false` | Whether model is LLaMA-based   |
| `is_qwen_derived_model`    | `false` | Whether model is Qwen-based    |
| `is_mistral_derived_model` | `false` | Whether model is Mistral-based |

## Model Configuration Overrides

| Option                                          | Default    | Description                        |
| ----------------------------------------------- | ---------- | ---------------------------------- |
| `overrides_of_model_config.rope_scaling.type`   | `"linear"` | RoPE scaling type (linear/dynamic) |
| `overrides_of_model_config.rope_scaling.factor` | `1.0`      | RoPE scaling factor                |

### Model Loading Options

| Option         | Description                   | Default |
| -------------- | ----------------------------- | ------- |
| `load_in_8bit` | Load model in 8-bit precision | false   |
| `load_in_4bit` | Load model in 4-bit precision | false   |
| `bf16`         | Use bfloat16 precision        | false   |
| `fp16`         | Use float16 precision         | false   |
| `tf32`         | Use tensor float 32 precision | false   |

## Memory and Device Settings

| Option             | Default   | Description             |
| ------------------ | --------- | ----------------------- |
| `gpu_memory_limit` | `"20GiB"` | GPU memory limit        |
| `lora_on_cpu`      | `false`   | Load LoRA on CPU        |
| `device_map`       | `"auto"`  | Device mapping strategy |
| `max_memory`       | `null`    | Max memory per device   |

## Training Hyperparameters

| Option                        | Default   | Description                 |
| ----------------------------- | --------- | --------------------------- |
| `gradient_accumulation_steps` | `1`       | Gradient accumulation steps |
| `micro_batch_size`            | `2`       | Batch size per GPU          |
| `eval_batch_size`             | `null`    | Evaluation batch size       |
| `num_epochs`                  | `4`       | Number of training epochs   |
| `warmup_steps`                | `100`     | Warmup steps                |
| `warmup_ratio`                | `0.05`    | Warmup ratio                |
| `learning_rate`               | `0.00003` | Learning rate               |
| `lr_quadratic_warmup`         | `false`   | Quadratic warmup            |
| `logging_steps`               | `null`    | Logging frequency           |
| `eval_steps`                  | `null`    | Evaluation frequency        |
| `evals_per_epoch`             | `null`    | Evaluations per epoch       |
| `save_strategy`               | `"epoch"` | Checkpoint saving strategy  |
| `save_steps`                  | `null`    | Saving frequency            |
| `saves_per_epoch`             | `null`    | Saves per epoch             |
| `save_total_limit`            | `null`    | Maximum checkpoints to keep |
| `max_steps`                   | `null`    | Maximum training steps      |

### Dataset Configuration

```yaml
datasets:
  - path: vicgalle/alpaca-gpt4 # HuggingFace dataset or TODO: You will be able to add the local path.
    type: alpaca # Format type (alpaca, gpteacher, oasst, etc.)
    ds_type: json # Dataset type
    data_files: path/to/data # Source data files
    train_on_split: train # Dataset split to use
```

## Chat Template Settings

| Option                   | Default                          | Description            |
| ------------------------ | -------------------------------- | ---------------------- |
| `chat_template`          | `"tokenizer_default"`            | Chat template type     |
| `chat_template_jinja`    | `null`                           | Custom Jinja template  |
| `default_system_message` | `"You are a helpful assistant."` | Default system message |

## Dataset Processing

| Option                            | Default                    | Description                         |
| --------------------------------- | -------------------------- | ----------------------------------- |
| `dataset_prepared_path`           | `"data/last_run_prepared"` | Path for prepared dataset           |
| `push_dataset_to_hub`             | `""`                       | Push dataset to HF hub              |
| `dataset_num_proc`                | `4`                        | Number of preprocessing processes   |
| `dataset_keep_in_memory`          | `false`                    | Keep dataset in memory              |
| `shuffle_merged_datasets`         | `true`                     | Shuffle merged datasets             |
| `shuffle_before_merging_datasets` | `false`                    | Shuffle each dataset before merging |
| `dataset_exact_deduplication`     | `true`                     | Deduplicate datasets                |

## LoRA Configuration

| Option                     | Default                | Description                    |
| -------------------------- | ---------------------- | ------------------------------ |
| `adapter`                  | `"lora"`               | Adapter type (lora/qlora)      |
| `lora_model_dir`           | `""`                   | Directory with pretrained LoRA |
| `lora_r`                   | `8`                    | LoRA attention dimension       |
| `lora_alpha`               | `16`                   | LoRA alpha parameter           |
| `lora_dropout`             | `0.05`                 | LoRA dropout                   |
| `lora_target_modules`      | `["q_proj", "v_proj"]` | Modules to apply LoRA          |
| `lora_target_linear`       | `false`                | Target all linear modules      |
| `peft_layers_to_transform` | `[]`                   | Layers to transform            |
| `lora_modules_to_save`     | `[]`                   | Modules to save                |
| `lora_fan_in_fan_out`      | `false`                | Fan in/out structure           |

## Optimization Settings

| Option                    | Default | Description                |
| ------------------------- | ------- | -------------------------- |
| `train_on_inputs`         | `false` | Train on input prompts     |
| `group_by_length`         | `false` | Group by sequence length   |
| `gradient_checkpointing`  | `false` | Use gradient checkpointing |
| `early_stopping_patience` | `3`     | Early stopping patience    |

## Learning Rate Scheduling

| Option                     | Default    | Description          |
| -------------------------- | ---------- | -------------------- |
| `lr_scheduler`             | `"cosine"` | Scheduler type       |
| `lr_scheduler_kwargs`      | `{}`       | Scheduler parameters |
| `cosine_min_lr_ratio`      | `null`     | Minimum LR ratio     |
| `cosine_constant_lr_ratio` | `null`     | Constant LR ratio    |
| `lr_div_factor`            | `null`     | LR division factor   |

## Optimizer Settings

| Option                 | Default      | Description         |
| ---------------------- | ------------ | ------------------- |
| `optimizer`            | `"adamw_hf"` | Optimizer choice    |
| `optim_args`           | `{}`         | Optimizer arguments |
| `optim_target_modules` | `[]`         | Target modules      |
| `weight_decay`         | `null`       | Weight decay        |
| `adam_beta1`           | `null`       | Adam beta1          |
| `adam_beta2`           | `null`       | Adam beta2          |
| `adam_epsilon`         | `null`       | Adam epsilon        |
| `max_grad_norm`        | `null`       | Gradient clipping   |

## Attention Implementations

| Option                     | Default | Description                   |
| -------------------------- | ------- | ----------------------------- |
| `flash_optimum`            | `false` | Use better transformers       |
| `xformers_attention`       | `false` | Use xformers                  |
| `flash_attention`          | `false` | Use flash attention           |
| `flash_attn_cross_entropy` | `false` | Flash attention cross entropy |
| `flash_attn_rms_norm`      | `false` | Flash attention RMS norm      |
| `flash_attn_fuse_mlp`      | `false` | Fuse MLP operations           |
| `sdp_attention`            | `false` | Use scaled dot product        |
| `s2_attention`             | `false` | Use shifted sparse attention  |

## Tokenizer Modifications

| Option           | Default | Description                  |
| ---------------- | ------- | ---------------------------- |
| `special_tokens` | -       | Special tokens to add/modify |
| `tokens`         | `[]`    | Additional tokens            |

## Distributed Training

| Option                  | Default | Description           |
| ----------------------- | ------- | --------------------- |
| `fsdp`                  | `null`  | FSDP configuration    |
| `fsdp_config`           | `null`  | FSDP config options   |
| `deepspeed`             | `null`  | Deepspeed config path |
| `ddp_timeout`           | `null`  | DDP timeout           |
| `ddp_bucket_cap_mb`     | `null`  | DDP bucket capacity   |
| `ddp_broadcast_buffers` | `null`  | DDP broadcast buffers |

<details>
<summary><h3>Example Configuration Request:</h3></summary>

Here's a complete example for fine-tuning a LLaMA model using LoRA:

```json
{
  "input": {
    "user_id": "user",
    "model_id": "llama-test",
    "run_id": "test-run",
    "credentials": {
      "wandb_api_key": "",
      "hf_token": ""
    },
    "args": {
      "base_model": "NousResearch/Llama-3.2-1B",
      "load_in_8bit": false,
      "load_in_4bit": false,
      "strict": false,
      "datasets": [
        {
          "path": "teknium/GPT4-LLM-Cleaned",
          "type": "alpaca"
        }
      ],
      "dataset_prepared_path": "last_run_prepared",
      "val_set_size": 0.1,
      "output_dir": "./outputs/lora-out",
      "adapter": "lora",
      "sequence_len": 2048,
      "sample_packing": true,
      "eval_sample_packing": true,
      "pad_to_sequence_len": true,
      "lora_r": 16,
      "lora_alpha": 32,
      "lora_dropout": 0.05,
      "lora_target_modules": [
        "gate_proj",
        "down_proj",
        "up_proj",
        "q_proj",
        "v_proj",
        "k_proj",
        "o_proj"
      ],
      "gradient_accumulation_steps": 2,
      "micro_batch_size": 2,
      "num_epochs": 1,
      "optimizer": "adamw_8bit",
      "lr_scheduler": "cosine",
      "learning_rate": 0.0002,
      "train_on_inputs": false,
      "group_by_length": false,
      "bf16": "auto",
      "tf32": false,
      "gradient_checkpointing": true,
      "logging_steps": 1,
      "flash_attention": true,
      "loss_watchdog_threshold": 5,
      "loss_watchdog_patience": 3,
      "warmup_steps": 10,
      "evals_per_epoch": 4,
      "saves_per_epoch": 1,
      "weight_decay": 0,
      "hub_model_id": "runpod/llama-fr-lora",
      "wandb_name": "test-run-1",
      "wandb_project": "test-run-1",
      "wandb_entity": "axo-test",
      "special_tokens": {
        "pad_token": "<|end_of_text|>"
      }
    }
  }
}
```

</details>

### Advanced Features

#### Wandb Integration

- `wandb_project`: Project name for Weights & Biases
- `wandb_entity`: Team name in W&B
- `wandb_watch`: Monitor model with W&B
- `wandb_name`: Name of the W&B run
- `wandb_run_id`: ID for the W&B run

#### Performance Optimization

- `sample_packing`: Enable efficient sequence packing
- `eval_sample_packing`: Use sequence packing during evaluation
- `torch_compile`: Enable PyTorch 2.0 compilation
- `flash_attention`: Use Flash Attention implementation
- `xformers_attention`: Use xFormers attention implementation

### Available Optimizers

The following optimizers are supported:

- `adamw_hf`: HuggingFace's AdamW implementation
- `adamw_torch`: PyTorch's AdamW
- `adamw_torch_fused`: Fused AdamW implementation
- `adamw_torch_xla`: XLA-optimized AdamW
- `adamw_apex_fused`: NVIDIA Apex fused AdamW
- `adafactor`: Adafactor optimizer
- `adamw_anyprecision`: Anyprecision AdamW
- `adamw_bnb_8bit`: 8-bit AdamW from bitsandbytes
- `lion_8bit`: 8-bit Lion optimizer
- `lion_32bit`: 32-bit Lion optimizer
- `sgd`: Stochastic Gradient Descent
- `adagrad`: Adagrad optimizer

## Notes

- Set `load_in_8bit: true` or `load_in_4bit: true` for memory-efficient training
- Enable `flash_attention: true` for faster training on modern GPUs
- Use `gradient_checkpointing: true` to reduce memory usage
- Adjust `micro_batch_size` and `gradient_accumulation_steps` based on your GPU memory

For more detailed information, please refer to the [documentation](https://axolotl-ai-cloud.github.io/axolotl/docs/config-reference.html).

### Errors:

- if you face any issues with the Flash Attention-2, Delete yoor worker and Re-start.


================================================
FILE: .runpod/hub.json
================================================
{
  "title": "Axolotl Fine-Tuning",
  "description": "Serverless fine-tuning of open-source LLMs with Axolotl. Supports LoRA, QLoRA, DPO, and more using Hugging Face models and datasets.",
  "type": "serverless",
  "category": "language",
  "iconUrl": "https://avatars.githubusercontent.com/u/167502477",
  "config": {
    "runsOn": "GPU",
    "containerDiskInGb": 200,
    "gpuCount": 1,
    "allowedCudaVersions": [
      "12.8",
      "12.7",
      "12.6",
      "12.5",
      "12.4"
    ],
    "presets": [],
    "env": [
      {
        "key": "TOKENIZER",
        "input": {
          "name": "Tokenizer",
          "type": "string",
          "description": "Name or path of the Hugging Face tokenizer to use.",
          "default": "",
          "advanced": true
        }
      },
      {
        "key": "MAX_NUM_SEQS",
        "input": {
          "name": "Max Num Seqs",
          "type": "number",
          "description": "Maximum number of sequences per iteration.",
          "default": 256,
          "advanced": true
        }
      },
      {
        "key": "DISABLE_LOG_STATS",
        "input": {
          "name": "Disable Log Stats",
          "type": "boolean",
          "description": "Disable logging statistics.",
          "default": false,
          "trueValue": "true",
          "falseValue": "false"
        }
      },
      {
        "key": "LOAD_FORMAT",
        "input": {
          "name": "Load Format",
          "type": "string",
          "description": "The format of the model weights to load.",
          "default": "auto",
          "options": [
            {
              "label": "auto",
              "value": "auto"
            },
            {
              "label": "pt",
              "value": "pt"
            },
            {
              "label": "safetensors",
              "value": "safetensors"
            },
            {
              "label": "npcache",
              "value": "npcache"
            },
            {
              "label": "dummy",
              "value": "dummy"
            },
            {
              "label": "tensorizer",
              "value": "tensorizer"
            },
            {
              "label": "bitsandbytes",
              "value": "bitsandbytes"
            }
          ],
          "advanced": true
        }
      }
    ]
  }
}


================================================
FILE: .runpod/requirements.txt
================================================
# Required Python packages get listed here, one per line.
# Reccomended to lock the version number to avoid unexpected changes.

# You can also install packages from a git repository, e.g.:
# git+https://github.com/runpod/runpod-python.git
# To learn more, see https://pip.pypa.io/en/stable/reference/requirements-file-format/
runpod~=1.7.0


================================================
FILE: .runpod/src/config/config.yaml
================================================
# # This is the huggingface model that contains *.pt, *.safetensors, or *.bin files
# # This can also be a relative path to a model on disk
# base_model: ./llama-7b-hf
# # You can specify an ignore pattern if the model repo contains more than 1 model type (*.pt, etc)
# base_model_ignore_patterns:
# # If the base_model repo on hf hub doesn't include configuration .json files,
# # You can set that here, or leave this empty to default to base_model
# base_model_config: ./llama-7b-hf
# # You can specify to choose a specific model revision from huggingface hub
# model_revision:
# # Optional tokenizer configuration override in case you want to use a different tokenizer
# # than the one defined in the base model
# tokenizer_config:
# # If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too
# model_type: AutoModelForCausalLM
# # Corresponding tokenizer for the model AutoTokenizer is a good choice
# tokenizer_type: AutoTokenizer
# # Trust remote code for untrusted source
# trust_remote_code:
# # use_fast option for tokenizer loading from_pretrained, default to True
# tokenizer_use_fast:
# # Whether to use the legacy tokenizer setting, defaults to True
# tokenizer_legacy:
# # Resize the model embeddings when new tokens are added to multiples of 32
# # This is reported to improve training speed on some models
# resize_token_embeddings_to_32x:

# # Used to identify which the model is based on
# is_falcon_derived_model:
# is_llama_derived_model:
# # Please note that if you set this to true, `padding_side` will be set to "left" by default
# is_mistral_derived_model:
# is_qwen_derived_model:

# # optional overrides to the base model configuration
# model_config:
#   # RoPE Scaling https://github.com/huggingface/transformers/pull/24653
#   rope_scaling:
#     type: # linear | dynamic
#     factor: # float

# # Whether you are training a 4-bit GPTQ quantized model
# gptq: true
# gptq_groupsize: 128 # group size
# gptq_model_v1: false # v1 or v2

# # This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer
# load_in_8bit: true
# # Use bitsandbytes 4 bit
# load_in_4bit:

# # Use CUDA bf16
# bf16: true # bool or 'full' for `bf16_full_eval`. require >=ampere
# # Use CUDA fp16
# fp16: true
# # Use CUDA tf32
# tf32: true # require >=ampere

# # No AMP (automatic mixed precision)
# bfloat16: true # require >=ampere
# float16: true

# # A list of one or more datasets to finetune the model with
# datasets:
#   # HuggingFace dataset repo | s3://,gs:// path | "json" for local dataset, make sure to fill data_files
#   - path: vicgalle/alpaca-gpt4
#   # The type of prompt to use for training. [alpaca, sharegpt, gpteacher, oasst, reflection]
#     type: alpaca # format | format:<prompt_style> (chat/instruct) | <prompt_strategies>.load_<load_fn>
#     ds_type: # Optional[str] (json|arrow|parquet|text|csv) defines the datatype when path is a file
#     data_files: # Optional[str] path to source data files
#     shards: # Optional[int] number of shards to split data into
#     name: # Optional[str] name of dataset configuration to load
#     train_on_split: train # Optional[str] name of dataset split to load from

#     # Optional[str] fastchat conversation type, only used with type: sharegpt
#     conversation:  # Options (see Conversation 'name'): https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
#     field_human: # Optional[str]. Human key to use for conversation.
#     field_model: # Optional[str]. Assistant key to use for conversation.

#   # Custom user prompt
#   - path: repo
#     type:
#       # The below are defaults. only set what's needed.
#       system_prompt: ""
#       system_format: "{system}"
#       field_system: system
#       field_instruction: instruction
#       field_input: input
#       field_output: output

#       # Customizable to be single line or multi-line
#       # 'format' can include {input}
#       format: |-
#         User: {instruction} {input}
#         Assistant:
#       # 'no_input_format' cannot include {input}
#       no_input_format: "{instruction} "

#       # For `completion` datasets only, uses the provided field instead of `text` column
#       field:

# # Axolotl attempts to save the dataset as an arrow after packing the data together so
# # subsequent training attempts load faster, relative path
# dataset_prepared_path: data/last_run_prepared
# # Push prepared dataset to hub
# push_dataset_to_hub: # repo path
# # The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()`
# # if not set.
# dataset_num_proc: # defaults to os.cpu_count() if not set
# # push checkpoints to hub
# hub_model_id: # repo path to push finetuned model
# # how to push checkpoints to hub
# # https://huggingface.co/docs/transformers/v4.31.0/en/main_classes/trainer#transformers.TrainingArguments.hub_strategy
# hub_strategy:
# # Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets
# # Required to be true when used in combination with `push_dataset_to_hub`
# hf_use_auth_token: # boolean
# # How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval.
# val_set_size: 0.04
# # Num shards for whole dataset
# dataset_shard_num:
# # Index of shard to use for whole dataset
# dataset_shard_idx:

# # The maximum length of an input to train with, this should typically be less than 2048
# # as most models have a token/context limit of 2048
# sequence_len: 2048
# # Pad inputs so each step uses constant sized buffers
# # This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently
# pad_to_sequence_len:
# # Max sequence length to concatenate training samples together up to
# # Inspired by StackLLaMA. see https://huggingface.co/blog/stackllama#supervised-fine-tuning
# # FutureWarning: This will soon be DEPRECATED
# max_packed_sequence_len: 1024
# # Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'
# sample_packing:
# # Set to 'false' if getting errors during eval with sample_packing on.
# eval_sample_packing:
# # You can set these packing optimizations AFTER starting a training at least once.
# # The trainer will provide recommended values for these values.
# sample_packing_eff_est:
# total_num_tokens:

# # If you want to use 'lora' or 'qlora' or leave blank to train all parameters in original model
# adapter: lora
# # If you already have a lora model trained that you want to load, put that here.
# # This means after training, if you want to test the model, you should set this to the value of `lora_out_dir`.
# lora_model_dir:

# # LoRA hyperparameters
# # For more details about the following options, see:
# # https://www.anyscale.com/blog/fine-tuning-llms-lora-or-full-parameter-an-in-depth-analysis-with-llama-2
# lora_r: 8
# lora_alpha: 16
# lora_dropout: 0.05
# lora_target_modules:
#   - q_proj
#   - v_proj
# #  - k_proj
# #  - o_proj
# #  - gate_proj
# #  - down_proj
# #  - up_proj
# lora_target_linear: # If true, will target all linear layers

# # If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens.
# # For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models.
# # `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities.
# # https://github.com/huggingface/peft/issues/334#issuecomment-1561727994
# lora_modules_to_save:
# #  - embed_tokens
# #  - lm_head

# # Once you complete training, the model will be saved to the following directory.
# # If you merge the adapter to the base model, a subdirectory `merged` will be created under this directory.
# # Make sure `lora_model_dir` points to this directory if you want to use the trained model.
# lora_out_dir:
# lora_fan_in_fan_out: false

# # ReLoRA configuration
# # Must use either 'lora' or 'qlora' adapter, and does not support fsdp or deepspeed
# relora_steps: # Number of steps per ReLoRA restart
# relora_warmup_steps: # Number of per-restart warmup steps
# relora_cpu_offload: # True to perform lora weight merges on cpu during restarts, for modest gpu memory savings

# # wandb configuration if you're using it
# wandb_mode: # "offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb
# wandb_project: # Your wandb project name
# wandb_entity: # A wandb Team name if using a Team
# wandb_watch:
# wandb_run_id: # Set the name of your wandb run
# wandb_log_model: # "checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training

# # Where to save the full-finetuned model to
# output_dir: ./completed-model

# # Whether to use torch.compile and which backend to use
# torch_compile:  # bool
# torch_compile_backend:  # Optional[str]

# # Training hyperparameters

# # If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps.
# gradient_accumulation_steps: 1
# # The number of samples to include in each batch. This is the number of samples sent to each GPU.
# micro_batch_size: 2
# eval_batch_size:
# num_epochs: 4
# warmup_steps: 100  # cannot use with warmup_ratio
# warmup_ratio: 0.05  # cannot use with warmup_steps
# learning_rate: 0.00003
# lr_quadratic_warmup:
# logging_steps:
# save_strategy: # Set to `no` to skip checkpoint saves
# save_steps: # Leave empty to save at each epoch
# eval_steps: # Leave empty to eval at each epoch, integers for every N steps. decimal for fraction of total steps
# save_total_limit: # Checkpoints saved at a time
# # Maximum number of iterations to train for. It precedes num_epochs which means that
# # if both are set, num_epochs will not be guaranteed.
# # e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps
# max_steps:

# eval_table_size: # Approximate number of predictions sent to wandb depending on batch size. Enabled above 0. Default is 0
# eval_table_max_new_tokens: # Total number of tokens generated for predictions sent to wandb. Default is 128

# # Whether to mask out or include the human's prompt from the training labels
# train_on_inputs: false
# # Group similarly sized data to minimize padding.
# # May be slower to start, as it must download and sort the entire dataset.
# # Note that training loss may have an oscillating pattern with this enabled.
# group_by_length: false

# # Whether to use gradient checkpointing https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing
# gradient_checkpointing: false

# # Stop training after this many evaluation losses have increased in a row
# # https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
# early_stopping_patience: 3

# # Specify a scheduler and kwargs to use with the optimizer
# lr_scheduler: # 'one_cycle' | empty for cosine
# lr_scheduler_kwargs:

# # For one_cycle optim
# lr_div_factor: # Learning rate div factor

# # Specify optimizer
# # Valid values are driven by the Transformers OptimizerNames class, see:
# # https://github.com/huggingface/transformers/blob/95b374952dc27d8511541d6f5a4e22c9ec11fb24/src/transformers/training_args.py#L134
# #
# # Note that not all optimizers may be available in your environment, ex: 'adamw_anyprecision' is part of
# # torchdistx, 'adamw_bnb_8bit' is part of bnb.optim.Adam8bit, etc. When in doubt, it is recommended to start with the optimizer used
# # in the examples/ for your model and fine-tuning use case.
# #
# # Valid values for 'optimizer' include:
# # - adamw_hf
# # - adamw_torch
# # - adamw_torch_fused
# # - adamw_torch_xla
# # - adamw_apex_fused
# # - adafactor
# # - adamw_anyprecision
# # - sgd
# # - adagrad
# # - adamw_bnb_8bit
# # - lion_8bit
# # - lion_32bit
# # - paged_adamw_32bit
# # - paged_adamw_8bit
# # - paged_lion_32bit
# # - paged_lion_8bit
# optimizer:
# # Specify weight decay
# weight_decay:
# # adamw hyperparams
# adam_beta1:
# adam_beta2:
# adam_epsilon:
# # Gradient clipping max norm
# max_grad_norm:

# # Augmentation techniques
# # NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings
# # currently only supported on Llama and Mistral
# noisy_embedding_alpha:

# # Whether to bettertransformers
# flash_optimum:
# # Whether to use xformers attention patch https://github.com/facebookresearch/xformers:
# xformers_attention:
# # Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention:
# flash_attention:
# flash_attn_cross_entropy:  # Whether to use flash-attention cross entropy implementation - advanced use only
# flash_attn_rms_norm:  # Whether to use flash-attention rms norm implementation - advanced use only
# flash_attn_fuse_mlp: # Whether to fuse part of the MLP into a single operation
# # Whether to use scaled-dot-product attention
# # https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html
# sdp_attention:
# # Landmark attention (only llama)
# landmark_attention:
# # xpos RoPE see https://github.com/kaiokendev/cutoff-len-is-context-len/blob/main/util/xpos_rope_llama_monkey_patch.py
# # LLaMA only
# xpos_rope:

# # Resume from a specific checkpoint dir
# resume_from_checkpoint:
# # If resume_from_checkpoint isn't set and you simply want it to start where it left off.
# # Be careful with this being turned on between different models.
# auto_resume_from_checkpoints: false

# # Don't mess with this, it's here for accelerate and torchrun
# local_rank:

# # Add or change special tokens.
# # If you add tokens here, you don't need to add them to the `tokens` list.
# special_tokens:
#   # bos_token: "<s>"
#   # eos_token: "</s>"
#   # unk_token: "<unk>"

# # Add extra tokens.
# tokens:

# # FSDP
# fsdp:
# fsdp_config:

# # Deepspeed config path. e.g., deepspeed/zero3.json
# deepspeed:

# # Advanced DDP Arguments
# ddp_timeout:
# ddp_bucket_cap_mb:
# ddp_broadcast_buffers:

# # Path to torch distx for optim 'adamw_anyprecision'
# torchdistx_path:

# # Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize
# pretraining_dataset:

# # Debug mode
# debug:

# # Seed
# seed:

# # Allow overwrite yml config using from cli
# strict:

base_model: ${BASE_MODEL}
base_model_ignore_patterns: ${BASE_MODEL_IGNORE_PATTERNS}
base_model_config: ${BASE_MODEL_CONFIG}
revision_of_model: ${REVISION_OF_MODEL}
tokenizer_config: ${TOKENIZER_CONFIG}
model_type: ${MODEL_TYPE}
tokenizer_type: ${TOKENIZER_TYPE}
trust_remote_code: ${TRUST_REMOTE_CODE}
tokenizer_use_fast: ${TOKENIZER_USE_FAST}
tokenizer_legacy: ${TOKENIZER_LEGACY}
resize_token_embeddings_to_32x: ${RESIZE_TOKEN_EMBEDDINGS_TO_32X}

is_falcon_derived_model: ${IS_FALCON_DERIVED_MODEL}
is_llama_derived_model: ${IS_LLAMA_DERIVED_MODEL}
is_qwen_derived_model: ${IS_QWEN_DERIVED_MODEL}
is_mistral_derived_model: ${IS_MISTRAL_DERIVED_MODEL}

overrides_of_model_config:
  rope_scaling:
    type: ${ROPE_SCALING_TYPE}
    factor: ${ROPE_SCALING_FACTOR}

bnb_config_kwargs:
  llm_int8_has_fp16_weight: ${BNB_LLM_INT8_HAS_FP16_WEIGHT}
  bnb_4bit_quant_type: ${BNB_4BIT_QUANT_TYPE}
  bnb_4bit_use_double_quant: ${BNB_4BIT_USE_DOUBLE_QUANT}

gptq: ${GPTQ}
load_in_8bit: ${LOAD_IN_8BIT}
load_in_4bit: ${LOAD_IN_4BIT}
bf16: ${BF16}
fp16: ${FP16}
tf32: ${TF32}
bfloat16: ${BFLOAT16}
float16: ${FLOAT16}

gpu_memory_limit: ${GPU_MEMORY_LIMIT}
lora_on_cpu: ${LORA_ON_CPU}

datasets:
  - path: ${DATASET_PATH}
    type: ${DATASET_TYPE}
    ds_type: ${DATASET_DS_TYPE}
    data_files: ${DATASET_DATA_FILES}
    shards: ${DATASET_SHARDS}
    name: ${DATASET_NAME}
    train_on_split: ${DATASET_TRAIN_ON_SPLIT}
    revision: ${DATASET_REVISION}
    trust_remote_code: ${DATASET_TRUST_REMOTE_CODE}

rl: ${RL}
dpo_use_weighting: ${DPO_USE_WEIGHTING}

chat_template: ${CHAT_TEMPLATE}
chat_template_jinja: ${CHAT_TEMPLATE_JINJA}
default_system_message: ${DEFAULT_SYSTEM_MESSAGE}
dataset_prepared_path: ${DATASET_PREPARED_PATH}
push_dataset_to_hub: ${PUSH_DATASET_TO_HUB}
dataset_num_proc: ${DATASET_NUM_PROC}
dataset_keep_in_memory: ${DATASET_KEEP_IN_MEMORY}
hub_model_id: ${HUB_MODEL_ID}
hub_strategy: ${HUB_STRATEGY}
hf_use_auth_token: ${HF_USE_AUTH_TOKEN}
val_set_size: ${VAL_SET_SIZE}
dataset_shard_num: ${DATASET_SHARD_NUM}
dataset_shard_idx: ${DATASET_SHARD_IDX}

sequence_len: ${SEQUENCE_LEN}
pad_to_sequence_len: ${PAD_TO_SEQUENCE_LEN}
sample_packing: ${SAMPLE_PACKING}
eval_sample_packing: ${EVAL_SAMPLE_PACKING}
sample_packing_eff_est: ${SAMPLE_PACKING_EFF_EST}
total_num_tokens: ${TOTAL_NUM_TOKENS}
sample_packing_group_size: ${SAMPLE_PACKING_GROUP_SIZE}
sample_packing_bin_size: ${SAMPLE_PACKING_BIN_SIZE}

batch_flattening: ${BATCH_FLATTENING}
device_map: ${DEVICE_MAP}
max_memory: ${MAX_MEMORY}

adapter: ${ADAPTER}
lora_model_dir: ${LORA_MODEL_DIR}

lora_r: ${LORA_R}
lora_alpha: ${LORA_ALPHA}
lora_dropout: ${LORA_DROPOUT}
lora_target_modules:
  - ${LORA_TARGET_MODULES}
lora_target_linear: ${LORA_TARGET_LINEAR}
peft_layers_to_transform: ${PEFT_LAYERS_TO_TRANSFORM}
lora_modules_to_save: ${LORA_MODULES_TO_SAVE}
lora_fan_in_fan_out: ${LORA_FAN_IN_FAN_OUT}

loraplus_lr_ratio: ${LORAPLUS_LR_RATIO}
loraplus_lr_embedding: ${LORAPLUS_LR_EMBEDDING}

peft:
  loftq_config:
    loftq_bits: ${LOFTQ_BITS}

relora_steps: ${RELORA_STEPS}
relora_warmup_steps: ${RELORA_WARMUP_STEPS}
relora_anneal_steps: ${RELORA_ANNEAL_STEPS}
relora_prune_ratio: ${RELORA_PRUNE_RATIO}
relora_cpu_offload: ${RELORA_CPU_OFFLOAD}

wandb_mode: ${WANDB_MODE}
wandb_project: ${WANDB_PROJECT}
wandb_entity: ${WANDB_ENTITY}
wandb_watch: ${WANDB_WATCH}
wandb_name: ${WANDB_NAME}
wandb_run_id: ${WANDB_RUN_ID}
wandb_log_model: ${WANDB_LOG_MODEL}

mlflow_tracking_uri: ${MLFLOW_TRACKING_URI}
mlflow_experiment_name: ${MLFLOW_EXPERIMENT_NAME}
mlflow_run_name: ${MLFLOW_RUN_NAME}
hf_mlflow_log_artifacts: ${HF_MLFLOW_LOG_ARTIFACTS}

use_comet: ${USE_COMET}
comet_api_key: ${COMET_API_KEY}
comet_workspace: ${COMET_WORKSPACE}
comet_project_name: ${COMET_PROJECT_NAME}
comet_experiment_key: ${COMET_EXPERIMENT_KEY}
comet_mode: ${COMET_MODE}
comet_online: ${COMET_ONLINE}
comet_experiment_config: ${COMET_EXPERIMENT_CONFIG}

output_dir: ${OUTPUT_DIR}

torch_compile: ${TORCH_COMPILE}
torch_compile_backend: ${TORCH_COMPILE_BACKEND}

gradient_accumulation_steps: ${GRADIENT_ACCUMULATION_STEPS}
micro_batch_size: ${MICRO_BATCH_SIZE}
eval_batch_size: ${EVAL_BATCH_SIZE}
num_epochs: ${NUM_EPOCHS}
warmup_steps: ${WARMUP_STEPS}
warmup_ratio: ${WARMUP_RATIO}
learning_rate: ${LEARNING_RATE}
lr_quadratic_warmup: ${LR_QUADRATIC_WARMUP}
logging_steps: ${LOGGING_STEPS}
eval_steps: ${EVAL_STEPS}
evals_per_epoch: ${EVALS_PER_EPOCH}
save_strategy: ${SAVE_STRATEGY}
save_steps: ${SAVE_STEPS}
saves_per_epoch: ${SAVES_PER_EPOCH}
save_total_limit: ${SAVE_TOTAL_LIMIT}
max_steps: ${MAX_STEPS}

eval_table_size: ${EVAL_TABLE_SIZE}
eval_max_new_tokens: ${EVAL_MAX_NEW_TOKENS}
eval_causal_lm_metrics: ${EVAL_CAUSAL_LM_METRICS}

profiler_steps: ${PROFILER_STEPS}
loss_watchdog_threshold: ${LOSS_WATCHDOG_THRESHOLD}
loss_watchdog_patience: ${LOSS_WATCHDOG_PATIENCE}

train_on_inputs: ${TRAIN_ON_INPUTS}
group_by_length: ${GROUP_BY_LENGTH}
gradient_checkpointing: ${GRADIENT_CHECKPOINTING}
early_stopping_patience: ${EARLY_STOPPING_PATIENCE}

lr_scheduler: ${LR_SCHEDULER}
lr_scheduler_kwargs: ${LR_SCHEDULER_KWARGS}
cosine_min_lr_ratio: ${COSINE_MIN_LR_RATIO}
cosine_constant_lr_ratio: ${COSINE_CONSTANT_LR_RATIO}
lr_div_factor: ${LR_DIV_FACTOR}

optimizer: ${OPTIMIZER}
optim_args: ${OPTIM_ARGS}
optim_target_modules: ${OPTIM_TARGET_MODULES}
weight_decay: ${WEIGHT_DECAY}
adam_beta1: ${ADAM_BETA1}
adam_beta2: ${ADAM_BETA2}
adam_epsilon: ${ADAM_EPSILON}
max_grad_norm: ${MAX_GRAD_NORM}

neftune_noise_alpha: ${NEFTUNE_NOISE_ALPHA}

flash_optimum: ${FLASH_OPTIMUM}
xformers_attention: ${XFORMERS_ATTENTION}
flash_attention: ${FLASH_ATTENTION}
flash_attn_cross_entropy: ${FLASH_ATTN_CROSS_ENTROPY}
flash_attn_rms_norm: ${FLASH_ATTN_RMS_NORM}
flash_attn_fuse_mlp: ${FLASH_ATTN_FUSE_MLP}
sdp_attention: ${SDP_ATTENTION}
s2_attention: ${S2_ATTENTION}
resume_from_checkpoint: ${RESUME_FROM_CHECKPOINT}
auto_resume_from_checkpoints: ${AUTO_RESUME_FROM_CHECKPOINTS}

local_rank: ${LOCAL_RANK}

special_tokens:
  bos_token: ${SPECIAL_TOKEN_BOS}
  eos_token: ${SPECIAL_TOKEN_EOS}
  unk_token: ${SPECIAL_TOKEN_UNK}
  pad_token: ${SPECIAL_TOKEN_PAD}

tokens: ${TOKENS}

fsdp: ${FSDP}
fsdp_config: ${FSDP_CONFIG}
deepspeed: ${DEEPSPEED}

ddp_timeout: ${DDP_TIMEOUT}
ddp_bucket_cap_mb: ${DDP_BUCKET_CAP_MB}
ddp_broadcast_buffers: ${DDP_BROADCAST_BUFFERS}

torchdistx_path: ${TORCHDISTX_PATH}
pretraining_dataset: ${PRETRAINING_DATASET}
debug: ${DEBUG}
seed: ${SEED}
strict: ${STRICT}


================================================
FILE: .runpod/src/handler.py
================================================
"""
Runpod serverless entrypoint handler
"""

import os

import runpod
import yaml
from huggingface_hub._login import login
from train import train
from utils import get_output_dir

BASE_VOLUME = os.environ.get("BASE_VOLUME", "/runpod-volume")
if not os.path.exists(BASE_VOLUME):
    os.makedirs(BASE_VOLUME)

logger = runpod.RunPodLogger()


async def handler(job):
    runpod_job_id = job["id"]
    inputs = job["input"]
    run_id = inputs.get("run_id", "default_run_id")
    args = inputs.get("args", {})

    # Set output directory
    output_dir = os.path.join(BASE_VOLUME, get_output_dir(run_id))
    args["output_dir"] = output_dir

    # First save args to a temporary config file
    config_path = "/workspace/test_config.yaml"

    # Add run_name and job_id to args before saving
    args["run_name"] = run_id
    args["runpod_job_id"] = runpod_job_id

    yaml_data = yaml.dump(args, default_flow_style=False)
    with open(config_path, "w", encoding="utf-8") as file:
        file.write(yaml_data)

    # Handle credentials
    credentials = inputs.get("credentials", {})

    if "wandb_api_key" in credentials:
        os.environ["WANDB_API_KEY"] = credentials["wandb_api_key"]
    if "hf_token" in credentials:
        os.environ["HF_TOKEN"] = credentials["hf_token"]

    if os.environ.get("HF_TOKEN"):
        login(token=os.environ["HF_TOKEN"])
    else:
        logger.info("No HF_TOKEN provided. Skipping login.")

    logger.info("Starting Training.")
    async for result in train(config_path):  # Pass the config path instead of args
        logger.info(result)
    logger.info("Training Complete.")

    # Cleanup
    if "WANDB_API_KEY" in os.environ:
        del os.environ["WANDB_API_KEY"]
    if "HF_TOKEN" in os.environ:
        del os.environ["HF_TOKEN"]


runpod.serverless.start({"handler": handler, "return_aggregate_stream": True})


================================================
FILE: .runpod/src/test_input.json
================================================
{
  "input": {
    "user_id": "user",
    "model_id": "llama-test",
    "run_id": "llama-test",
    "credentials": {
      "wandb_api_key": "",
      "hf_token": ""
    },
    "args": {
      "base_model": "NousResearch/Meta-Llama-3-8B",
      "model_type": "LlamaForCausalLM",
      "tokenizer_type": "AutoTokenizer",
      "load_in_8bit": true,
      "load_in_4bit": false,
      "strict": false,
      "datasets": [
        {
          "path": "mhenrichsen/alpaca_2k_test",
          "type": "alpaca"
        }
      ],
      "val_set_size": 0.05,
      "output_dir": "./outputs/lora-out",
      "sequence_len": 4096,
      "sample_packing": true,
      "eval_sample_packing": false,
      "pad_to_sequence_len": true,
      "adapter": "lora",
      "lora_r": 32,
      "lora_alpha": 16,
      "lora_dropout": 0.05,
      "lora_target_linear": true,
      "lora_modules_to_save": [
        "embed_tokens",
        "lm_head"
      ],
      "gradient_accumulation_steps": 4,
      "micro_batch_size": 2,
      "num_epochs": 1,
      "optimizer": "adamw_bnb_8bit",
      "lr_scheduler": "cosine",
      "learning_rate": 0.0002,
      "train_on_inputs": false,
      "group_by_length": false,
      "bf16": "auto",
      "tf32": false,
      "gradient_checkpointing": true,
      "logging_steps": 1,
      "flash_attention": true,
      "warmup_steps": 1,
      "evals_per_epoch": 1,
      "eval_max_new_tokens": 128,
      "saves_per_epoch": 1,
      "weight_decay": 0.0,
      "special_tokens": {
        "pad_token": "<|end_of_text|>"
      }
    }
  }
}


================================================
FILE: .runpod/src/train.py
================================================
"""
Runpod train entrypoint
"""

import asyncio


async def train(config_path: str, gpu_id: str = "0", preprocess: bool = True):
    """
    Run preprocessing (if enabled) and training with the given config file
    :param config_path: Path to the YAML config file
    :param gpu_id: GPU ID to use (default: "0")
    :param preprocess: Whether to run preprocessing (default: True)

    """
    # First check if preprocessing is needed
    if preprocess:
        # Preprocess command
        preprocess_cmd = (
            f"CUDA_VISIBLE_DEVICES={gpu_id} axolotl preprocess {config_path}"
        )
        process = await asyncio.create_subprocess_shell(
            preprocess_cmd,
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.STDOUT,
        )

        if process.stdout is not None:
            async for line in process.stdout:
                yield f"Preprocessing: {line.decode().strip()}"
        await process.wait()
        yield "Preprocessing completed."
    else:
        yield "Skipping preprocessing step."

    # Training command
    train_cmd = f"axolotl train {config_path}"
    process = await asyncio.create_subprocess_shell(
        train_cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.STDOUT
    )

    if process.stdout is not None:
        async for line in process.stdout:
            yield f"Training: {line.decode().strip()}"
    await process.wait()


================================================
FILE: .runpod/src/utils.py
================================================
"""
Runpod launcher utils
"""

import os

import yaml


def get_output_dir(run_id):
    path = f"fine-tuning/{run_id}"
    return path


def make_valid_config(input_args):
    """
    Creates and saves updated config file, returns the path to the new config
    :param input_args: dict of input args
    :return: str, path to the updated config file
    """
    # Load default config
    with open("config/config.yaml", "r", encoding="utf-8") as fin:
        all_args = yaml.safe_load(fin)

    if not input_args:
        print("No args provided, using defaults")
    else:
        all_args.update(input_args)

    # Create updated config path
    updated_config_path = "config/updated_config.yaml"

    # Save updated config to new file
    with open(updated_config_path, "w", encoding="utf-8") as f:
        yaml.dump(all_args, f)

    return updated_config_path


def set_config_env_vars(args: dict):
    """
    Convert API arguments into environment variables.
    Handles nested dictionaries, lists, and special values.

    Args:
        args (dict): The arguments dictionary from the API request
    """

    def process_value(value):
        """Convert Python values to string format for environment variables"""
        if value is None:
            return ""
        if isinstance(value, bool):
            return str(value).lower()
        if isinstance(value, (list, dict)):
            return str(value)
        return str(value)

    def set_env_vars(data, prefix=""):
        """Recursively set environment variables from nested dictionary"""
        for key, value in data.items():
            env_key = prefix + key.upper()

            # Handle special cases
            if isinstance(value, dict):
                # For nested dictionaries (like special_tokens)
                set_env_vars(value, f"{env_key}_")
            elif isinstance(value, list):
                # Handle list of dictionaries (like datasets)
                if value and isinstance(value[0], dict):
                    for i, item in enumerate(value):
                        set_env_vars(item, f"{env_key}_{i}_")
                else:
                    # For simple lists (like lora_target_modules)
                    os.environ[env_key] = process_value(value)
            else:
                # Handle all other cases
                os.environ[env_key] = process_value(value)

    # Clear any existing related environment variables
    # This prevents old values from persisting
    for key in list(os.environ.keys()):
        if key.startswith(
            ("BASE_MODEL", "MODEL_TYPE", "TOKENIZER_TYPE", "DATASET", "LORA_", "WANDB_")
        ):
            del os.environ[key]

    # Set new environment variables
    set_env_vars(args)


================================================
FILE: .runpod/test-input.json
================================================
{
  "input": {
    "name": "quick_smoke_test_sft",
    "user_id": "user",
    "model_id": "llama-test",
    "run_id": "llama-test",
    "credentials": {
      "wandb_api_key": "",
      "hf_token": ""
    },
    "args": {
      "base_model": "HuggingFaceTB/SmolLM2-135M",
      "model_type": "AutoModelForCausalLM",
      "tokenizer_type": "AutoTokenizer",
      "load_in_4bit": true,
      "strict": false,
      "datasets": [
        {
          "path": "mhenrichsen/alpaca_2k_test",
          "type": "alpaca",
          "split": "train[:10%]"
        }
      ],
      "val_set_size": 0.02,
      "output_dir": "./outputs/lora-out",
      "sequence_len": 4096,
      "sample_packing": true,
      "eval_sample_packing": false,
      "pad_to_sequence_len": true,
      "adapter": "qlora",
      "lora_r": 32,
      "lora_alpha": 64,
      "lora_dropout": 0.05,
      "lora_target_linear": true,
      "lora_modules_to_save": [
        "embed_tokens",
        "lm_head"
      ],
      "gradient_accumulation_steps": 2,
      "micro_batch_size": 1,
      "num_epochs": 1,
      "optimizer": "adamw_torch_fused",
      "lr_scheduler": "cosine",
      "learning_rate": 0.0002,
      "train_on_inputs": false,
      "group_by_length": false,
      "bf16": "auto",
      "tf32": true,
      "gradient_checkpointing": true,
      "logging_steps": 1,
      "flash_attention": true,
      "warmup_steps": 1,
      "evals_per_epoch": 1,
      "eval_max_new_tokens": 128,
      "saves_per_epoch": 1,
      "weight_decay": 0.0,
      "special_tokens": {
        "pad_token": "<|endoftext|>"
      },
      "max_steps": 20
    },
    "timeout": 100000
  },
  "config": {
    "gpuTypeId": "NVIDIA GeForce RTX 4090",
    "gpuCount": 1,
    "containerDiskInGb": 200,
    "env": [
      {
        "key": "TOKENIZER",
        "value": ""
      },
      {
        "key": "DISABLE_LOG_STATS",
        "value": "true"
      }
    ],
    "allowedCudaVersions": [
      "12.8",
      "12.7",
      "12.6",
      "12.5",
      "12.4"
    ]
  }
}


================================================
FILE: .runpod/tests.json
================================================
{
  "tests": [
    {
      "name": "quick_smoke_test_sft",
      "input": {
        "user_id": "user",
        "model_id": "llama-test",
        "run_id": "llama-test",
        "credentials": {
          "wandb_api_key": "",
          "hf_token": ""
        },
        "args": {
          "base_model": "HuggingFaceTB/SmolLM2-135M",
          "model_type": "AutoModelForCausalLM",
          "tokenizer_type": "AutoTokenizer",
          "load_in_4bit": true,
          "strict": false,
          "datasets": [
            {
              "path": "mhenrichsen/alpaca_2k_test",
              "type": "alpaca",
              "split": "train[:10%]"
            }
          ],
          "val_set_size": 0.02,
          "output_dir": "./outputs/lora-out",
          "sequence_len": 4096,
          "sample_packing": true,
          "eval_sample_packing": false,
          "pad_to_sequence_len": true,
          "adapter": "qlora",
          "lora_r": 32,
          "lora_alpha": 64,
          "lora_dropout": 0.05,
          "lora_target_linear": true,
          "lora_modules_to_save": [
            "embed_tokens",
            "lm_head"
          ],
          "gradient_accumulation_steps": 2,
          "micro_batch_size": 1,
          "num_epochs": 1,
          "optimizer": "adamw_torch_fused",
          "lr_scheduler": "cosine",
          "learning_rate": 0.0002,
          "train_on_inputs": false,
          "group_by_length": false,
          "bf16": "auto",
          "tf32": true,
          "gradient_checkpointing": true,
          "logging_steps": 1,
          "flash_attention": true,
          "warmup_steps": 1,
          "evals_per_epoch": 1,
          "eval_max_new_tokens": 128,
          "saves_per_epoch": 1,
          "weight_decay": 0.0,
          "special_tokens": {
            "pad_token": "<|endoftext|>"
          },
          "max_steps": 20
        }
      },
      "timeout": 100000
    }
  ],
  "config": {
    "gpuTypeId": "NVIDIA GeForce RTX 4090",
    "gpuCount": 1,
    "containerDiskInGb": 200,
    "env": [
      {
        "key": "TOKENIZER",
        "value": ""
      },
      {
        "key": "DISABLE_LOG_STATS",
        "value": "true"
      }
    ],
    "allowedCudaVersions": [
      "12.8",
      "12.7",
      "12.6",
      "12.5",
      "12.4"
    ]
  }
}


================================================
FILE: CITATION.cff
================================================
cff-version: 1.2.0
type: software
title: "Axolotl: Open Source LLM Post-Training"
message: "If you use this software, please cite it as below."
authors:
  - name: "Axolotl maintainers and contributors"
repository-code: "https://github.com/axolotl-ai-cloud/axolotl"
url: "https://axolotl.ai/"
license: Apache-2.0
date-released: "2023-05-30"


================================================
FILE: CNAME
================================================
docs.axolotl.ai


================================================
FILE: FAQS.md
================================================
# FAQs

- Can you train StableLM with this? Yes, but only with a single GPU atm. Multi GPU support is coming soon! Just waiting on this [PR](https://github.com/huggingface/transformers/pull/22874)
- Will this work with Deepspeed? That's still a WIP, but setting `export ACCELERATE_USE_DEEPSPEED=true` should work in some cases
- `Error invalid argument at line 359 in file /workspace/bitsandbytes/csrc/pythonInterface.c`
`/arrow/cpp/src/arrow/filesystem/s3fs.cc:2598:  arrow::fs::FinalizeS3 was not called even though S3 was initialized.`
This could lead to a segmentation fault at exit. Try reinstalling bitsandbytes and transformers from source.


================================================
FILE: LICENSE
================================================

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: MANIFEST.in
================================================
include requirements.txt
include README.md
include LICENSE
include src/setuptools_axolotl_dynamic_dependencies.py
include src/axolotl/utils/chat_templates/templates/*.jinja
recursive-include axolotl *.py


================================================
FILE: README.md
================================================
<p align="center">
    <picture>
        <source media="(prefers-color-scheme: dark)" srcset="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/887513285d98132142bf5db2a74eb5e0928787f1/image/axolotl_logo_digital_white.svg">
        <source media="(prefers-color-scheme: light)" srcset="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/887513285d98132142bf5db2a74eb5e0928787f1/image/axolotl_logo_digital_black.svg">
        <img alt="Axolotl" src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/887513285d98132142bf5db2a74eb5e0928787f1/image/axolotl_logo_digital_black.svg" width="400" height="104" style="max-width: 100%;">
    </picture>
</p>
  <p align="center">
      <strong>A Free and Open Source LLM Fine-tuning Framework</strong><br>
  </p>

<p align="center">
    <img src="https://img.shields.io/github/license/axolotl-ai-cloud/axolotl.svg?color=blue" alt="GitHub License">
    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests.yml/badge.svg" alt="tests">
    <a href="https://codecov.io/gh/axolotl-ai-cloud/axolotl"><img src="https://codecov.io/gh/axolotl-ai-cloud/axolotl/branch/main/graph/badge.svg" alt="codecov"></a>
    <a href="https://github.com/axolotl-ai-cloud/axolotl/releases"><img src="https://img.shields.io/github/release/axolotl-ai-cloud/axolotl.svg" alt="Releases"></a>
    <br/>
    <a href="https://github.com/axolotl-ai-cloud/axolotl/graphs/contributors"><img src="https://img.shields.io/github/contributors-anon/axolotl-ai-cloud/axolotl?color=yellow&style=flat-square" alt="contributors" style="height: 20px;"></a>
    <img src="https://img.shields.io/github/stars/axolotl-ai-cloud/axolotl" alt="GitHub Repo stars">
    <br/>
    <a href="https://discord.com/invite/HhrNrHJPRb"><img src="https://img.shields.io/badge/discord-7289da.svg?style=flat-square&logo=discord" alt="discord" style="height: 20px;"></a>
    <a href="https://twitter.com/axolotl_ai"><img src="https://img.shields.io/twitter/follow/axolotl_ai?style=social" alt="twitter" style="height: 20px;"></a>
    <a href="https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="google-colab" style="height: 20px;"></a>
    <br/>
    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/tests-nightly.yml/badge.svg" alt="tests-nightly">
    <img src="https://github.com/axolotl-ai-cloud/axolotl/actions/workflows/multi-gpu-e2e.yml/badge.svg" alt="multigpu-semi-weekly tests">
</p>


## 🎉 Latest Updates

- 2026/03:
  - New model support has been added in Axolotl for [Mistral Small 4](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/mistral4), [Qwen3.5, Qwen3.5 MoE](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen3.5), [GLM-4.7-Flash](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm47-flash), [GLM-4.6V](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm46v), and [GLM-4.5-Air](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/glm45).
  - [MoE expert quantization](https://docs.axolotl.ai/docs/expert_quantization.html) support (via `quantize_moe_experts: true`) greatly reduces VRAM when training MoE models (FSDP2 compat).
- 2026/02:
  - [ScatterMoE LoRA](https://github.com/axolotl-ai-cloud/axolotl/pull/3410) support. LoRA fine-tuning directly on MoE expert weights using custom Triton kernels.
  - Axolotl now has support for [SageAttention](https://github.com/axolotl-ai-cloud/axolotl/pull/2823) and [GDPO](https://github.com/axolotl-ai-cloud/axolotl/pull/3353) (Generalized DPO).
- 2026/01:
  - New integration for [EAFT](https://github.com/axolotl-ai-cloud/axolotl/pull/3366) (Entropy-Aware Focal Training), weights loss by entropy of the top-k logit distribution, and [Scalable Softmax](https://github.com/axolotl-ai-cloud/axolotl/pull/3338), improves long context in attention.
- 2025/12:
  - Axolotl now includes support for [Kimi-Linear](https://docs.axolotl.ai/docs/models/kimi-linear.html), [Plano-Orchestrator](https://docs.axolotl.ai/docs/models/plano.html), [MiMo](https://docs.axolotl.ai/docs/models/mimo.html), [InternVL 3.5](https://docs.axolotl.ai/docs/models/internvl3_5.html), [Olmo3](https://docs.axolotl.ai/docs/models/olmo3.html), [Trinity](https://docs.axolotl.ai/docs/models/trinity.html), and [Ministral3](https://docs.axolotl.ai/docs/models/ministral3.html).
  - [Distributed Muon Optimizer](https://github.com/axolotl-ai-cloud/axolotl/pull/3264) support has been added for FSDP2 pretraining.
- 2025/10: New model support has been added in Axolotl for: [Qwen3 Next](https://docs.axolotl.ai/docs/models/qwen3-next.html), [Qwen2.5-vl, Qwen3-vl](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/qwen2_5-vl), [Qwen3, Qwen3MoE](https://docs.axolotl.ai/docs/models/qwen3.html), [Granite 4](https://docs.axolotl.ai/docs/models/granite4.html), [HunYuan](https://docs.axolotl.ai/docs/models/hunyuan.html), [Magistral 2509](https://docs.axolotl.ai/docs/models/magistral/vision.html), [Apertus](https://docs.axolotl.ai/docs/models/apertus.html), and [Seed-OSS](https://docs.axolotl.ai/docs/models/seed-oss.html).

<details>

<summary>Expand older updates</summary>

- 2025/09: Axolotl now has text diffusion training. Read more [here](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations/diffusion).
- 2025/08: QAT has been updated to include NVFP4 support. See [PR](https://github.com/axolotl-ai-cloud/axolotl/pull/3107).
- 2025/07:
  - ND Parallelism support has been added into Axolotl. Compose Context Parallelism (CP), Tensor Parallelism (TP), and Fully Sharded Data Parallelism (FSDP) within a single node and across multiple nodes. Check out the [blog post](https://huggingface.co/blog/accelerate-nd-parallel) for more info.
  - Axolotl adds more models: [GPT-OSS](https://docs.axolotl.ai/docs/models/gpt-oss.html), [Gemma 3n](https://docs.axolotl.ai/docs/models/gemma3n.html), [Liquid Foundation Model 2 (LFM2)](https://docs.axolotl.ai/docs/models/LiquidAI.html), and [Arcee Foundation Models (AFM)](https://docs.axolotl.ai/docs/models/arcee.html).
  - FP8 finetuning with fp8 gather op is now possible in Axolotl via `torchao`. Get started [here](https://docs.axolotl.ai/docs/mixed_precision.html#sec-fp8)!
  - [Voxtral](https://docs.axolotl.ai/docs/models/voxtral.html), [Magistral 1.1](https://docs.axolotl.ai/docs/models/magistral.html), and [Devstral](https://docs.axolotl.ai/docs/models/devstral.html) with mistral-common tokenizer support has been integrated in Axolotl!
  - TiledMLP support for single-GPU to multi-GPU training with DDP, DeepSpeed and FSDP support has been added to support Arctic Long Sequence Training. (ALST). See [examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst) for using ALST with Axolotl!
- 2025/06: Magistral with mistral-common tokenizer support has been added to Axolotl. See [docs](https://docs.axolotl.ai/docs/models/magistral.html) to start training your own Magistral models with Axolotl!
- 2025/05: Quantization Aware Training (QAT) support has been added to Axolotl. Explore the [docs](https://docs.axolotl.ai/docs/qat.html) to learn more!
- 2025/04: Llama 4 support has been added in Axolotl. See [docs](https://docs.axolotl.ai/docs/models/llama-4.html) to start training your own Llama 4 models with Axolotl's linearized version!
- 2025/03: Axolotl has implemented Sequence Parallelism (SP) support. Read the [blog](https://huggingface.co/blog/axolotl-ai-co/long-context-with-sequence-parallelism-in-axolotl) and [docs](https://docs.axolotl.ai/docs/sequence_parallelism.html) to learn how to scale your context length when fine-tuning.
- 2025/03: (Beta) Fine-tuning Multimodal models is now supported in Axolotl. Check out the [docs](https://docs.axolotl.ai/docs/multimodal.html) to fine-tune your own!
- 2025/02: Axolotl has added LoRA optimizations to reduce memory usage and improve training speed for LoRA and QLoRA in single GPU and multi-GPU training (DDP and DeepSpeed). Jump into the [docs](https://docs.axolotl.ai/docs/lora_optims.html) to give it a try.
- 2025/02: Axolotl has added GRPO support. Dive into our [blog](https://huggingface.co/blog/axolotl-ai-co/training-llms-w-interpreter-feedback-wasm) and [GRPO example](https://github.com/axolotl-ai-cloud/grpo_code) and have some fun!
- 2025/01: Axolotl has added Reward Modelling / Process Reward Modelling fine-tuning support. See [docs](https://docs.axolotl.ai/docs/reward_modelling.html).

</details>

## ✨ Overview

Axolotl is a free and open-source tool designed to streamline post-training and fine-tuning for the latest large language models (LLMs).

Features:

- **Multiple Model Support**: Train various models like GPT-OSS, LLaMA, Mistral, Mixtral, Pythia, and many more models available on the Hugging Face Hub.
- **Multimodal Training**: Fine-tune vision-language models (VLMs) including LLaMA-Vision, Qwen2-VL, Pixtral, LLaVA, SmolVLM2, GLM-4.6V, InternVL 3.5, Gemma 3n, and audio models like Voxtral with image, video, and audio support.
- **Training Methods**: Full fine-tuning, LoRA, QLoRA, GPTQ, QAT, Preference Tuning (DPO, IPO, KTO, ORPO), RL (GRPO, GDPO), and Reward Modelling (RM) / Process Reward Modelling (PRM).
- **Easy Configuration**: Re-use a single YAML configuration file across the full fine-tuning pipeline: dataset preprocessing, training, evaluation, quantization, and inference.
- **Performance Optimizations**: [Multipacking](https://docs.axolotl.ai/docs/multipack.html), [Flash Attention 2/3/4](https://docs.axolotl.ai/docs/attention.html#flash-attention), [Xformers](https://docs.axolotl.ai/docs/attention.html#xformers), [Flex Attention](https://docs.axolotl.ai/docs/attention.html#flex-attention), [SageAttention](https://docs.axolotl.ai/docs/attention.html#sageattention), [Liger Kernel](https://docs.axolotl.ai/docs/custom_integrations.html#liger-kernels), [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy), [ScatterMoE](https://docs.axolotl.ai/docs/custom_integrations.html#kernels-integration), [Sequence Parallelism (SP)](https://docs.axolotl.ai/docs/sequence_parallelism.html), [LoRA optimizations](https://docs.axolotl.ai/docs/lora_optims.html), [Multi-GPU training (FSDP1, FSDP2, DeepSpeed)](https://docs.axolotl.ai/docs/multi-gpu.html), [Multi-node training (Torchrun, Ray)](https://docs.axolotl.ai/docs/multi-node.html), and many more!
- **Flexible Dataset Handling**: Load from local, HuggingFace, and cloud (S3, Azure, GCP, OCI) datasets.
- **Cloud Ready**: We ship [Docker images](https://hub.docker.com/u/axolotlai) and also [PyPI packages](https://pypi.org/project/axolotl/) for use on cloud platforms and local hardware.


## 🚀 Quick Start - LLM Fine-tuning in Minutes

**Requirements**:

- NVIDIA GPU (Ampere or newer for `bf16` and Flash Attention) or AMD GPU
- Python 3.11
- PyTorch ≥2.8.0

### Google Colab

[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb#scrollTo=msOCO4NRmRLa)

### Installation

#### Using pip

```bash
pip3 install -U packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]

# Download example axolotl configs, deepspeed configs
axolotl fetch examples
axolotl fetch deepspeed_configs  # OPTIONAL
```

#### Using Docker

Installing with Docker can be less error prone than installing in your own environment.
```bash
docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
```

Other installation approaches are described [here](https://docs.axolotl.ai/docs/installation.html).

#### Cloud Providers

<details>

- [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
- [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=github&utm_medium=developer_community&utm_campaign=template_launch_axolotl&utm_content=readme)
- [PRIME Intellect](https://app.primeintellect.ai/dashboard/create-cluster?image=axolotl&location=Cheapest&security=Cheapest&show_spot=true)
- [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl)
- [Novita](https://novita.ai/gpus-console?templateId=311)
- [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl)
- [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)

</details>

### Your First Fine-tune

```bash
# Fetch axolotl examples
axolotl fetch examples

# Or, specify a custom path
axolotl fetch examples --dest path/to/folder

# Train a model using LoRA
axolotl train examples/llama-3/lora-1b.yml
```

That's it! Check out our [Getting Started Guide](https://docs.axolotl.ai/docs/getting-started.html) for a more detailed walkthrough.


## 📚 Documentation

- [Installation Options](https://docs.axolotl.ai/docs/installation.html) - Detailed setup instructions for different environments
- [Configuration Guide](https://docs.axolotl.ai/docs/config-reference.html) - Full configuration options and examples
- [Dataset Loading](https://docs.axolotl.ai/docs/dataset_loading.html) - Loading datasets from various sources
- [Dataset Guide](https://docs.axolotl.ai/docs/dataset-formats/) - Supported formats and how to use them
- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [Multipacking](https://docs.axolotl.ai/docs/multipack.html)
- [API Reference](https://docs.axolotl.ai/docs/api/) - Auto-generated code documentation
- [FAQ](https://docs.axolotl.ai/docs/faq.html) - Frequently asked questions

## 🤝 Getting Help

- Join our [Discord community](https://discord.gg/HhrNrHJPRb) for support
- Check out our [Examples](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/) directory
- Read our [Debugging Guide](https://docs.axolotl.ai/docs/debugging.html)
- Need dedicated support? Please contact [✉️wing@axolotl.ai](mailto:wing@axolotl.ai) for options

## 🌟 Contributing

Contributions are welcome! Please see our [Contributing Guide](https://github.com/axolotl-ai-cloud/axolotl/blob/main/.github/CONTRIBUTING.md) for details.

## 📈 Telemetry

Axolotl has opt-out telemetry that helps us understand how the project is being used
and prioritize improvements. We collect basic system information, model types, and
error rates—never personal data or file paths. Telemetry is enabled by default. To
disable it, set AXOLOTL_DO_NOT_TRACK=1. For more details, see our [telemetry documentation](https://docs.axolotl.ai/docs/telemetry.html).

## ❤️ Sponsors

Interested in sponsoring? Contact us at [wing@axolotl.ai](mailto:wing@axolotl.ai)

## 📝 Citing Axolotl

If you use Axolotl in your research or projects, please cite it as follows:

```bibtex
@software{axolotl,
  title = {Axolotl: Open Source LLM Post-Training},
  author = {{Axolotl maintainers and contributors}},
  url = {https://github.com/axolotl-ai-cloud/axolotl},
  license = {Apache-2.0},
  year = {2023}
}
```

## 📜 License

This project is licensed under the Apache 2.0 License - see the [LICENSE](LICENSE) file for details.


================================================
FILE: VERSION
================================================
0.16.0.dev0


================================================
FILE: _quarto.yml
================================================
project:
  type: website
  pre-render:
   - docs/scripts/generate_config_docs.py
   - docs/scripts/generate_examples_docs.py

quartodoc:
  dir: docs/api
  package: axolotl
  title: API Reference
  parser: google

  sections:
    - title: Core
      desc: Core functionality for training
      contents:
        - train
        - evaluate
        - datasets
        - convert
        - prompt_tokenizers
        - logging_config
        - core.builders.base
        - core.builders.causal
        - core.builders.rl
        - core.training_args
        - core.chat.messages
        - core.chat.format.chatml
        - core.chat.format.llama3x
        - core.chat.format.shared
        - core.datasets.chat
        - core.datasets.transforms.chat_builder
    - title: CLI
      desc: Command-line interface
      contents:
        - cli.main
        - cli.train
        - cli.evaluate
        - cli.args
        - cli.art
        - cli.checks
        - cli.config
        - cli.delinearize_llama4
        - cli.inference
        - cli.merge_lora
        - cli.merge_sharded_fsdp_weights
        - cli.preprocess
        - cli.quantize
        - cli.vllm_serve
        - cli.cloud.base
        - cli.cloud.modal_
        - cli.utils
        - cli.utils.args
        - cli.utils.fetch
        - cli.utils.load
        - cli.utils.sweeps
        - cli.utils.train
    - title: Trainers
      desc: Training implementations
      contents:
        - core.trainers.base
        - core.trainers.trl
        - core.trainers.mamba
        - core.trainers.dpo.trainer
        - core.trainers.grpo.trainer
        - core.trainers.grpo.sampler
        - core.trainers.utils
    - title: Model Loading
      desc: Functionality for loading and patching models, tokenizers, etc.
      contents:
        - loaders.model
        - loaders.tokenizer
        - loaders.processor
        - loaders.adapter
        - loaders.patch_manager
        - loaders.constants
    - title: Mixins
      desc: Mixin classes for augmenting trainers
      contents:
        - core.trainers.mixins.optimizer
        - core.trainers.mixins.rng_state_loader
        - core.trainers.mixins.scheduler
    - title: Context Managers
      desc: Context managers for altering trainer behaviors
      contents:
        - utils.ctx_managers.sequence_parallel
    - title: Prompt Strategies
      desc: Prompt formatting strategies
      contents:
        - prompt_strategies.base
        - prompt_strategies.chat_template
        - prompt_strategies.alpaca_chat
        - prompt_strategies.alpaca_instruct
        - prompt_strategies.alpaca_w_system
        - prompt_strategies.user_defined
        - prompt_strategies.llama2_chat
        - prompt_strategies.completion
        - prompt_strategies.input_output
        - prompt_strategies.stepwise_supervised
        - prompt_strategies.metharme
        - prompt_strategies.orcamini
        - prompt_strategies.pygmalion
        - prompt_strategies.messages.chat
        - prompt_strategies.dpo.chat_template
        - prompt_strategies.dpo.llama3
        - prompt_strategies.dpo.chatml
        - prompt_strategies.dpo.zephyr
        - prompt_strategies.dpo.user_defined
        - prompt_strategies.dpo.passthrough
        - prompt_strategies.kto.llama3
        - prompt_strategies.kto.chatml
        - prompt_strategies.kto.user_defined
        - prompt_strategies.orpo.chat_template
        - prompt_strategies.bradley_terry.llama3
    - title: Kernels
      desc: Low-level performance optimizations
      contents:
        - kernels.lora
        - kernels.geglu
        - kernels.swiglu
        - kernels.quantize
        - kernels.utils
    - title: Monkey Patches
      desc: Runtime patches for model optimizations
      contents:
        - monkeypatch.llama_attn_hijack_flash
        - monkeypatch.llama_attn_hijack_xformers
        - monkeypatch.mistral_attn_hijack_flash
        - monkeypatch.multipack
        - monkeypatch.relora
        - monkeypatch.lora_kernels
        - monkeypatch.utils
        - monkeypatch.btlm_attn_hijack_flash
        - monkeypatch.stablelm_attn_hijack_flash
        - monkeypatch.trainer_fsdp_optim
        - monkeypatch.transformers_fa_utils
        - monkeypatch.unsloth_
        - monkeypatch.data.batch_dataset_fetcher
        - monkeypatch.mixtral
        - monkeypatch.gradient_checkpointing.offload_cpu
        - monkeypatch.gradient_checkpointing.offload_disk
    - title: Utils
      desc: Utility functions
      contents:
        - utils.tokenization
        - utils.chat_templates
        - utils.lora
        - utils.model_shard_quant
        - utils.bench
        - utils.freeze
        - utils.trainer
        - utils.schedulers
        - utils.distributed
        - utils.dict
        - utils.optimizers.adopt
        - utils.data.streaming
        - utils.data.sft
        - utils.quantization
    - title: Schemas
      desc: Pydantic data models for Axolotl config
      contents:
        - utils.schemas.config
        - utils.schemas.model
        - utils.schemas.training
        - utils.schemas.datasets
        - utils.schemas.peft
        - utils.schemas.trl
        - utils.schemas.multimodal
        - utils.schemas.integrations
        - utils.schemas.enums
        - utils.schemas.utils
    - title: Integrations
      desc: Third-party integrations and extensions
      contents:
        - integrations.base
        - integrations.cut_cross_entropy.args
        - integrations.grokfast.optimizer
        - integrations.kd.trainer
        - integrations.liger.args
        - integrations.lm_eval.args
        - integrations.spectrum.args
    - title: Common
      desc: Common utilities and shared functionality
      contents:
        - common.architectures
        - common.const
        - common.datasets
    - title: Models
      desc: Custom model implementations
      contents:
        - models.mamba.modeling_mamba
    - title: Data Processing
      desc: Data processing utilities
      contents:
        - utils.collators.core
        - utils.collators.batching
        - utils.collators.mamba
        - utils.collators.mm_chat
        - utils.samplers.multipack
    - title: Callbacks
      desc: Training callbacks
      contents:
        - utils.callbacks.perplexity
        - utils.callbacks.profiler
        - utils.callbacks.lisa
        - utils.callbacks.mlflow_
        - utils.callbacks.comet_
        - utils.callbacks.qat
website:
  title: "Axolotl"
  description: "We make fine-tuning accessible, scalable, and fun"
  favicon: favicon.jpg

  google-analytics: "G-9KYCVJBNMQ"

  navbar:
    logo: image/axolotl_logo_digital_white.svg
    title: false
    background: dark
    pinned: false
    collapse: false
    tools:
    - icon: twitter
      href: https://twitter.com/axolotl_ai
    - icon: github
      href: https://github.com/axolotl-ai-cloud/axolotl/
    - icon: discord
      href: https://discord.gg/7m9sfhzaf3

  sidebar:
      pinned: true
      collapse-level: 2
      style: docked
      contents:
        - text: Home
          href: index.qmd

        - section: "Getting Started"
          contents:
            - docs/getting-started.qmd
            - docs/installation.qmd
            - docs/inference.qmd
            - section: "Model Guides"
              contents:
                - docs/models/kimi-linear.qmd
                - docs/models/plano.qmd
                - docs/models/mimo.qmd
                - docs/models/internvl3_5.qmd
                - docs/models/olmo3.qmd
                - docs/models/trinity.qmd
                - docs/models/arcee.qmd
                - section: "Ministral3"
                  contents:
                    - docs/models/ministral3.qmd
                    - docs/models/ministral3/think.qmd
                    - docs/models/ministral3/vision.qmd
                - section: "Magistral"
                  contents:
                    - docs/models/magistral.qmd
                    - docs/models/magistral/think.qmd
                    - docs/models/magistral/vision.qmd
                - docs/models/ministral.qmd
                - docs/models/mistral-small.qmd
                - docs/models/voxtral.qmd
                - docs/models/devstral.qmd
                - docs/models/mistral.qmd
                - docs/models/llama-4.qmd
                - docs/models/llama-2.qmd
                - docs/models/qwen3-next.qmd
                - docs/models/qwen3.qmd
                - docs/models/gemma3n.qmd
                - docs/models/apertus.qmd
                - docs/models/gpt-oss.qmd
                - docs/models/seed-oss.qmd
                - docs/models/phi.qmd
                - docs/models/smolvlm2.qmd
                - docs/models/granite4.qmd
                - docs/models/LiquidAI.qmd
                - docs/models/hunyuan.qmd
                - docs/models/jamba.qmd
                - docs/models/orpheus.qmd

            - docs/cli.qmd
            - docs/telemetry.qmd
            - docs/config-reference.qmd
            - text: "API Reference"
              href: docs/api

        - section: "Dataset Formats"
          contents: docs/dataset-formats/*

        - section: "Deployments"
          contents:
            - docs/docker.qmd
            - docs/multi-gpu.qmd
            - docs/multi-node.qmd
            - docs/ray-integration.qmd
            - docs/amd_hpc.qmd
            - docs/mac.qmd

        - section: "How To Guides"
          contents:
            - docs/multimodal.qmd
            - docs/rlhf.qmd
            - docs/reward_modelling.qmd
            - docs/lr_groups.qmd
            - docs/lora_optims.qmd
            - docs/dataset_loading.qmd
            - docs/qat.qmd
            - docs/quantize.qmd
            - docs/optimizations.qmd

        - section: "Core Concepts"
          contents:
            - docs/batch_vs_grad.qmd
            - docs/dataset_preprocessing.qmd
            - docs/streaming.qmd
            - docs/multipack.qmd
            - docs/mixed_precision.qmd
            - docs/optimizers.qmd
            - docs/attention.qmd

        - section: "Advanced Features"
          contents:
            - docs/fsdp_qlora.qmd
            - docs/unsloth.qmd
            - docs/torchao.qmd
            - docs/custom_integrations.qmd
            - docs/sequence_parallelism.qmd
            - docs/gradient_checkpointing.qmd
            - docs/nd_parallelism.qmd
            - docs/expert_quantization.qmd

        - section: "Troubleshooting"
          contents:
            - docs/faq.qmd
            - docs/debugging.qmd
            - docs/nccl.qmd

format:
  html:
    theme: darkly
    css: styles.css
    toc: true
    # Enable better handling of line breaks in markdown
    preserve-tabs: true
    html-math-method: mathjax
    # Improved markdown processing options
    md-extensions:
      - markdown_it
      - def_list
      - attr_list
      - fenced_divs
      - tables
      - html_admonition
      - lineblocks
      - fancy_lists
    # Control whitespace handling
    whitespace: preserve
    # Process newlines in paragraphs
    wrap: preserve
    # Better line break handling
    preserve-linebreaks: true


================================================
FILE: benchmarks/bench_entropy.py
================================================
"""Benchmark for entropy_from_logits Triton kernel vs original chunked implementation.

Usage: CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_entropy.py
"""

import gc
import statistics

import torch
import torch.nn.functional as F

from axolotl.monkeypatch.trainer.utils import entropy_from_logits

V = 151936  # Qwen vocab
WARMUP = 5
BENCH_ITERS = 20
MEM_ITERS = 10


def entropy_from_logits_original(logits: torch.Tensor, chunk_size: int = 128):
    """Original chunked implementation (reference)."""
    original_shape = logits.shape[:-1]
    num_classes = logits.shape[-1]
    flat_logits = logits.reshape(-1, num_classes)
    entropies = []
    for chunk in flat_logits.split(chunk_size, dim=0):
        logps = F.log_softmax(chunk, dim=-1)
        chunk_entropy = -(torch.exp(logps) * logps).sum(-1)
        entropies.append(chunk_entropy)
    return torch.cat(entropies, dim=0).reshape(original_shape)


def _clean_gpu():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.reset_accumulated_memory_stats()
    torch.cuda.synchronize()


def profile_time(fn, logits, n_iters=BENCH_ITERS):
    for _ in range(WARMUP):
        out = fn(logits, chunk_size=128)
        del out
    torch.cuda.synchronize()

    times = []
    for _ in range(n_iters):
        s = torch.cuda.Event(enable_timing=True)
        e = torch.cuda.Event(enable_timing=True)
        s.record()
        out = fn(logits, chunk_size=128)
        e.record()
        torch.cuda.synchronize()
        times.append(s.elapsed_time(e))
        del out
    return times


def profile_memory(fn, logits, n_iters=MEM_ITERS):
    for _ in range(WARMUP):
        out = fn(logits, chunk_size=128)
        del out
    torch.cuda.synchronize()

    peaks = []
    for _ in range(n_iters):
        _clean_gpu()
        base = torch.cuda.max_memory_allocated()
        out = fn(logits, chunk_size=128)
        torch.cuda.synchronize()
        peaks.append(torch.cuda.max_memory_allocated() - base)
        del out
    return [p / 1e6 for p in peaks]


def fmt(values, unit=""):
    mean = statistics.mean(values)
    std = statistics.stdev(values) if len(values) > 1 else 0.0
    return f"{mean:8.2f} ± {std:5.2f} {unit}  [min={min(values):.2f}, max={max(values):.2f}]"


def benchmark_contiguous():
    print("=" * 60)
    print(
        f"CONTIGUOUS BENCHMARK  (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})"
    )
    print("=" * 60)

    configs = [
        (1, 2048),
        (1, 8192),
        (1, 16384),
        (4, 4096),
        (8, 2048),
        (16, 2048),
        (16, 4096),
    ]

    for B, L in configs:
        mem_gb = B * L * V * 2 / 1e9
        if mem_gb > 28:
            print(f"\n  skip B={B}, L={L} ({mem_gb:.1f} GB)")
            continue

        N = B * L
        print(f"\n{'─' * 60}")
        print(f"B={B:2d}, L={L:5d}  ({N:6d} rows, logits {mem_gb:.2f} GB)")
        print(f"{'─' * 60}")

        torch.manual_seed(42)
        logits = torch.randn(B, L, V, device="cuda", dtype=torch.bfloat16)

        t_orig = profile_time(entropy_from_logits_original, logits)
        t_triton = profile_time(entropy_from_logits, logits)
        orig_mean = statistics.mean(t_orig)
        triton_mean = statistics.mean(t_triton)

        print("  TIME (ms):")
        print(f"    original: {fmt(t_orig, 'ms')}")
        print(f"    triton:   {fmt(t_triton, 'ms')}")
        print(f"    speedup:  {orig_mean / triton_mean:.2f}x")

        m_orig = profile_memory(entropy_from_logits_original, logits)
        m_triton = profile_memory(entropy_from_logits, logits)
        orig_peak = statistics.mean(m_orig)
        triton_peak = statistics.mean(m_triton)

        print("  MEMORY (peak overhead):")
        print(f"    original: {fmt(m_orig, 'MB')}")
        print(f"    triton:   {fmt(m_triton, 'MB')}")
        print(f"    saved:    {orig_peak - triton_peak:.1f} MB")

        del logits
        _clean_gpu()


def benchmark_noncontiguous():
    print("\n" + "=" * 60)
    print(
        f"NON-CONTIGUOUS BENCHMARK  (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})"
    )
    print("=" * 60)

    configs = [
        (4, 2048, "transpose"),
        (4, 8192, "transpose"),
        (8, 2048, "transpose"),
        (4, 4096, "slice_batch"),
    ]

    for B, L, method in configs:
        torch.manual_seed(42)

        if method == "transpose":
            raw = torch.randn(L, B, V, device="cuda", dtype=torch.bfloat16)
            logits_nc = raw.transpose(0, 1)
            raw_gb = L * B * V * 2 / 1e9
        elif method == "slice_batch":
            raw = torch.randn(B * 2, L, V, device="cuda", dtype=torch.bfloat16)
            logits_nc = raw[::2]
            raw_gb = B * 2 * L * V * 2 / 1e9
        else:
            continue

        if raw_gb > 28:
            print(f"\n  skip B={B}, L={L}, {method} ({raw_gb:.1f} GB)")
            del raw, logits_nc
            torch.cuda.empty_cache()
            continue

        N = B * L
        print(f"\n{'─' * 60}")
        print(f"B={B}, L={L}  {method}  ({N} rows, raw {raw_gb:.2f} GB)")
        print(f"{'─' * 60}")

        def original_with_copy(logits, chunk_size=128):
            return entropy_from_logits_original(
                logits.contiguous(), chunk_size=chunk_size
            )

        t_orig = profile_time(original_with_copy, logits_nc)
        t_triton = profile_time(entropy_from_logits, logits_nc)
        orig_mean = statistics.mean(t_orig)
        triton_mean = statistics.mean(t_triton)

        print("  TIME (ms):")
        print(f"    orig+copy:     {fmt(t_orig, 'ms')}")
        print(f"    triton-strided:{fmt(t_triton, 'ms')}")
        print(f"    speedup:       {orig_mean / triton_mean:.2f}x")

        m_orig = profile_memory(original_with_copy, logits_nc)
        m_triton = profile_memory(entropy_from_logits, logits_nc)
        orig_peak = statistics.mean(m_orig)
        triton_peak = statistics.mean(m_triton)

        print("  MEMORY (peak overhead):")
        print(f"    orig+copy:     {fmt(m_orig, 'MB')}")
        print(f"    triton-strided:{fmt(m_triton, 'MB')}")
        print(f"    saved:         {orig_peak - triton_peak:.1f} MB")

        del raw, logits_nc
        _clean_gpu()


if __name__ == "__main__":
    benchmark_contiguous()
    benchmark_noncontiguous()


================================================
FILE: benchmarks/bench_scattermoe_lora.py
================================================
"""Benchmark for ScatterMoE LoRA Triton kernels.

Measures forward, backward dX, and backward dA/dB kernels at common MoE
model shapes. Reports per-kernel timings, LoRA overhead vs base scatter2scatter,
and full fwd+bwd autograd throughput.

Usage:
  CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_scattermoe_lora.py
  CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_scattermoe_lora.py --ranks 16 64
  CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_scattermoe_lora.py --models Qwen/Qwen3.5-35B-A3B
"""

import argparse
import gc
import time
from functools import partial

import torch

from axolotl.integrations.kernels.libs.scattermoe_lora.kernels import (
    lora_ops,
    ops as base_ops,
)
from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_experts import (
    flatten_sort_count,
)
from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_linear_lora import (
    ScatterMoELoRA,
)

DEVICE = "cuda"
DTYPE = torch.bfloat16
WARMUP = 5
ITERS = 20

# ─── Model configs ──────────────────────────────────────────────────────────

BUILTIN_CONFIGS = {
    "Qwen3.5-35B-A3B": (256, 2048, 512, 8),  # E, H, I, k
    "Qwen3-30B-A3B": (128, 2048, 768, 8),
    "OLMoE-1B-7B": (64, 2048, 1024, 8),
    "Mixtral-8x7B": (8, 4096, 14336, 2),
}


def _resolve_config(spec):
    """Resolve a model spec to (E, H, I, k). Accepts builtin names or HF IDs."""
    key = spec.lower().replace("/", "-")
    for name, cfg in BUILTIN_CONFIGS.items():
        if key in name.lower() or name.lower() in key:
            return name, cfg

    from transformers import AutoConfig

    hf_cfg = AutoConfig.from_pretrained(spec, trust_remote_code=True)
    if callable(getattr(hf_cfg, "get_text_config", None)):
        tc = hf_cfg.get_text_config()
        if hasattr(tc, "model_type") and tc.model_type != hf_cfg.model_type:
            hf_cfg = tc
    hidden = hf_cfg.hidden_size
    inter = getattr(hf_cfg, "moe_intermediate_size", None) or hf_cfg.intermediate_size
    experts = (
        getattr(hf_cfg, "num_experts", None)
        or getattr(hf_cfg, "num_local_experts", None)
        or getattr(hf_cfg, "n_routed_experts", None)
    )
    top_k = (
        getattr(hf_cfg, "num_experts_per_tok", None)
        or getattr(hf_cfg, "num_experts_per_token", None)
        or 2
    )
    name = spec.split("/")[-1]
    return name, (experts, hidden, inter, top_k)


# ─── Benchmark helpers ──────────────────────────────────────────────────────


def _clean():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.synchronize()


def _bench(fn, warmup=WARMUP, iters=ITERS):
    for _ in range(warmup):
        fn()
    torch.cuda.synchronize()
    times = []
    for _ in range(iters):
        torch.cuda.synchronize()
        t0 = time.perf_counter()
        fn()
        torch.cuda.synchronize()
        times.append((time.perf_counter() - t0) * 1000)
    times.sort()
    return times[len(times) // 2]


def _setup(num_experts, K, N, T, top_k, R):
    torch.manual_seed(42)
    x = torch.randn(T, K, device=DEVICE, dtype=DTYPE)
    W = torch.randn(num_experts, K, N, device=DEVICE, dtype=DTYPE) * 0.02
    lora_A = torch.randn(R * num_experts, K, device=DEVICE, dtype=DTYPE) * 0.01
    lora_B = torch.randn(N, R * num_experts, device=DEVICE, dtype=DTYPE) * 0.01
    logits = torch.randn(T, num_experts, device=DEVICE)
    _, top_idx = torch.topk(torch.softmax(logits, dim=-1), top_k, dim=-1)
    sei, ssi, eo = flatten_sort_count(top_idx, num_experts)
    gx = base_ops.group(x, ssi, fan_out=top_k)
    dy = torch.randn(gx.size(0), N, device=DEVICE, dtype=DTYPE)
    return x, W, lora_A, lora_B, sei, ssi, eo, gx, dy


# ─── Kernel wrappers (avoid B023 loop-variable capture) ──────────────────────


def _call_fwd(x, W, sei, ssi, top_k, lA, lB):
    return lora_ops.scatter2scatter_lora(
        X=x,
        W=W,
        sorted_expert_idxs=sei,
        sorted_scattered_idxs=ssi,
        k=top_k,
        lora_A=lA,
        lora_B=lB,
        scaling=2.0,
    )


def _call_base(x, W, sei, ssi, top_k):
    return base_ops.scatter2scatter(
        X=x,
        W=W,
        sorted_expert_idxs=sei,
        sorted_scattered_idxs=ssi,
        k=top_k,
    )


def _call_dx(dy, W, sei, ssi, lA, lB):
    return lora_ops.scatter2scatter_lora_dX(
        DY=dy,
        W=W,
        sorted_expert_idxs=sei,
        sorted_scattered_idxs=ssi,
        k=1,
        lora_A=lA,
        lora_B=lB,
        scaling=2.0,
        dy_grouped=True,
        dx_grouped=False,
    )


def _call_bwd(dy, gx, lA, lB, eo, num_experts):
    return lora_ops.group_bwd_lora(
        DY=dy,
        X=gx,
        lora_A=lA,
        lora_B=lB,
        expert_offsets=eo,
        E=num_experts,
        scaling=2.0,
    )


# ─── Main ────────────────────────────────────────────────────────────────────


def main():
    parser = argparse.ArgumentParser(description="ScatterMoE LoRA kernel benchmark")
    parser.add_argument(
        "--models",
        "-m",
        nargs="+",
        help="Model names or HF IDs (default: all builtins)",
    )
    parser.add_argument("--ranks", "-r", nargs="+", type=int, default=[16, 32, 64])
    parser.add_argument("--seq-len", "-T", type=int, default=2048)
    args = parser.parse_args()

    T = args.seq_len
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"T={T}, ranks={args.ranks}\n")

    if args.models:
        configs = [_resolve_config(m) for m in args.models]
    else:
        configs = list(BUILTIN_CONFIGS.items())

    for model_name, (num_experts, hidden, inter, top_k) in configs:
        print(f"{'=' * 70}")
        print(f"  {model_name}: E={num_experts}, H={hidden}, I={inter}, k={top_k}")
        print(f"{'=' * 70}")

        for R in args.ranks:
            for proj, K, N in [("gate_up", hidden, 2 * inter), ("down", inter, hidden)]:
                _clean()
                x, W, lA, lB, sei, ssi, eo, gx, dy = _setup(
                    num_experts, K, N, T, top_k, R
                )

                # Forward with LoRA (auto-dispatched: fused or split)
                dispatch = (
                    "split"
                    if (
                        num_experts <= lora_ops._SPLIT_LORA_FWD_MAX_EXPERTS
                        and K * N >= lora_ops._SPLIT_LORA_FWD_THRESHOLD
                    )
                    else "fused"
                )
                t_fwd = _bench(partial(_call_fwd, x, W, sei, ssi, top_k, lA, lB))
                t_base = _bench(partial(_call_base, x, W, sei, ssi, top_k))
                t_dx = _bench(partial(_call_dx, dy, W, sei, ssi, lA, lB))
                t_bwd = _bench(partial(_call_bwd, dy, gx, lA, lB, eo, num_experts))

                total = t_fwd + t_dx + t_bwd
                overhead = t_fwd / t_base - 1 if t_base > 0 else 0

                print(
                    f"  R={R:>2} {proj:<8}  "
                    f"fwd={t_fwd:>6.2f}ms [{dispatch}]  "
                    f"base={t_base:>6.2f}ms "
                    f"(+{overhead * 100:.0f}%)  "
                    f"dx={t_dx:>6.2f}ms  bwd={t_bwd:>6.2f}ms  "
                    f"total={total:>6.2f}ms"
                )

                # Full autograd fwd+bwd with memory measurement
                x_ag = x.clone().requires_grad_(True)
                lA_ag = lA.clone().requires_grad_(True)
                lB_ag = lB.clone().requires_grad_(True)

                def _run_autograd(
                    _x=x_ag,
                    _W=W,
                    _k=top_k,
                    _sei=sei,
                    _ssi=ssi,
                    _eo=eo,
                    _lA=lA_ag,
                    _lB=lB_ag,
                ):
                    out = ScatterMoELoRA.apply(
                        _x,
                        _W,
                        _k,
                        _sei,
                        _ssi,
                        _eo,
                        _lA,
                        _lB,
                        2.0,
                        None,
                        None,
                        False,
                        False,
                        True,
                        False,
                    )
                    out.sum().backward()
                    _x.grad = None
                    _lA.grad = None
                    _lB.grad = None

                t_full = _bench(_run_autograd)

                _clean()
                torch.cuda.reset_peak_memory_stats()
                mem_before = torch.cuda.memory_allocated()
                _run_autograd()
                torch.cuda.synchronize()
                mem_peak = torch.cuda.max_memory_allocated() - mem_before

                print(
                    f"         full_fwd_bwd={t_full:>6.2f}ms  "
                    f"peak_delta={mem_peak / 1e6:>6.1f}MB"
                )

        print()


if __name__ == "__main__":
    main()


================================================
FILE: benchmarks/bench_selective_logsoftmax.py
================================================
"""Benchmark for selective_log_softmax Triton kernel vs original implementation.

Usage: CUDA_VISIBLE_DEVICES=0 python benchmarks/bench_selective_logsoftmax.py
"""

import gc
import statistics

import torch

from axolotl.monkeypatch.trainer.utils import (
    selective_log_softmax,
    selective_log_softmax_original,
)

V = 151936  # Qwen vocab
WARMUP = 5
BENCH_ITERS = 20
MEM_ITERS = 10


def _clean_gpu():
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_peak_memory_stats()
    torch.cuda.reset_accumulated_memory_stats()
    torch.cuda.synchronize()


def profile_time(fn, args, n_iters=BENCH_ITERS):
    for _ in range(WARMUP):
        fn(*args)
    torch.cuda.synchronize()

    times = []
    for _ in range(n_iters):
        s = torch.cuda.Event(enable_timing=True)
        e = torch.cuda.Event(enable_timing=True)
        s.record()
        fn(*args)
        e.record()
        torch.cuda.synchronize()
        times.append(s.elapsed_time(e))
    return times


def profile_memory(fn, args, n_iters=MEM_ITERS):
    for _ in range(WARMUP):
        out = fn(*args)
        del out
    torch.cuda.synchronize()

    peaks = []
    for _ in range(n_iters):
        _clean_gpu()
        base = torch.cuda.max_memory_allocated()
        out = fn(*args)
        torch.cuda.synchronize()
        peaks.append(torch.cuda.max_memory_allocated() - base)
        del out
    return [p / 1e6 for p in peaks]


def fmt(values, unit=""):
    mean = statistics.mean(values)
    std = statistics.stdev(values) if len(values) > 1 else 0.0
    return f"{mean:8.2f} ± {std:5.2f} {unit}  [min={min(values):.2f}, max={max(values):.2f}]"


def benchmark_forward():
    print("=" * 60)
    print(f"FORWARD BENCHMARK  (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})")
    print("=" * 60)

    configs = [
        (1, 2048),
        (1, 8192),
        (4, 4096),
        (8, 2048),
        (16, 2048),
        (16, 4096),
    ]

    for B, L in configs:
        mem_gb = B * L * V * 2 / 1e9
        if mem_gb > 28:
            print(f"\n  skip B={B}, L={L} ({mem_gb:.1f} GB)")
            continue

        N = B * L
        print(f"\n{'─' * 60}")
        print(f"B={B:2d}, L={L:5d}  ({N:6d} rows, logits {mem_gb:.2f} GB)")
        print(f"{'─' * 60}")

        torch.manual_seed(42)
        logits = torch.randn(B, L, V, device="cuda", dtype=torch.bfloat16)
        index = torch.randint(0, V, (B, L), device="cuda")

        t_orig = profile_time(selective_log_softmax_original, (logits, index))
        t_triton = profile_time(selective_log_softmax, (logits, index))
        orig_mean = statistics.mean(t_orig)
        triton_mean = statistics.mean(t_triton)

        print("  TIME (ms):")
        print(f"    original: {fmt(t_orig, 'ms')}")
        print(f"    triton:   {fmt(t_triton, 'ms')}")
        print(f"    speedup:  {orig_mean / triton_mean:.2f}x")

        m_orig = profile_memory(selective_log_softmax_original, (logits, index))
        m_triton = profile_memory(selective_log_softmax, (logits, index))
        orig_peak = statistics.mean(m_orig)
        triton_peak = statistics.mean(m_triton)

        print("  MEMORY (peak overhead):")
        print(f"    original: {fmt(m_orig, 'MB')}")
        print(f"    triton:   {fmt(m_triton, 'MB')}")
        print(f"    saved:    {orig_peak - triton_peak:.1f} MB")

        del logits, index
        _clean_gpu()


def benchmark_backward():
    print("\n" + "=" * 60)
    print(f"FWD+BWD BENCHMARK  (warmup={WARMUP}, time={BENCH_ITERS}, mem={MEM_ITERS})")
    print("=" * 60)

    configs = [
        (1, 2048),
        (1, 8192),
        (4, 4096),
        (8, 2048),
        (16, 2048),
        (16, 4096),
    ]

    def fwd_bwd_original(logits, index):
        logits.grad = None
        out = selective_log_softmax_original(logits, index)
        out.sum().backward()

    def fwd_bwd_triton(logits, index):
        logits.grad = None
        out = selective_log_softmax(logits, index)
        out.sum().backward()

    for B, L in configs:
        mem_gb = B * L * V * 2 / 1e9
        if mem_gb > 20:
            print(f"\n  skip B={B}, L={L} ({mem_gb:.1f} GB, need room for grads)")
            continue

        N = B * L
        print(f"\n{'─' * 60}")
        print(f"B={B:2d}, L={L:5d}  ({N:6d} rows, logits {mem_gb:.2f} GB)")
        print(f"{'─' * 60}")

        torch.manual_seed(42)
        logits_orig = torch.randn(
            B, L, V, device="cuda", dtype=torch.bfloat16, requires_grad=True
        )
        logits_tri = logits_orig.detach().clone().requires_grad_(True)
        index = torch.randint(0, V, (B, L), device="cuda")

        t_orig = profile_time(fwd_bwd_original, (logits_orig, index))
        t_triton = profile_time(fwd_bwd_triton, (logits_tri, index))
        orig_mean = statistics.mean(t_orig)
        triton_mean = statistics.mean(t_triton)

        print("  FWD+BWD TIME (ms):")
        print(f"    original: {fmt(t_orig, 'ms')}")
        print(f"    triton:   {fmt(t_triton, 'ms')}")
        print(f"    speedup:  {orig_mean / triton_mean:.2f}x")

        m_orig = profile_memory(fwd_bwd_original, (logits_orig, index))
        m_triton = profile_memory(fwd_bwd_triton, (logits_tri, index))
        orig_peak = statistics.mean(m_orig)
        triton_peak = statistics.mean(m_triton)

        print("  FWD+BWD MEMORY (peak overhead):")
        print(f"    original: {fmt(m_orig, 'MB')}")
        print(f"    triton:   {fmt(m_triton, 'MB')}")
        print(f"    saved:    {orig_peak - triton_peak:.1f} MB")

        del logits_orig, logits_tri, index
        _clean_gpu()


if __name__ == "__main__":
    benchmark_forward()
    benchmark_backward()


================================================
FILE: cicd/Dockerfile-uv.jinja
================================================
FROM axolotlai/axolotl-base-uv:{{ BASE_TAG }}

ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"
ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
ENV CUDA="{{ CUDA }}"
ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
ENV GITHUB_REF="{{ GITHUB_REF }}"
ENV GITHUB_SHA="{{ GITHUB_SHA }}"
ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
ENV HF_HOME="{{ HF_HOME }}"

RUN apt-get update && \
    apt-get install -y --allow-change-held-packages vim curl nano zstd libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm

WORKDIR /workspace

RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git

WORKDIR /workspace/axolotl

RUN git fetch origin +$GITHUB_REF && \
    git checkout FETCH_HEAD

# If AXOLOTL_EXTRAS is set, append it in brackets
RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
    fi

RUN uv pip install packaging==26.0 setuptools==78.1.1
RUN uv pip install torchvision
RUN uv pip uninstall causal_conv1d
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
        uv pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
    fi

RUN python scripts/unsloth_install.py --uv | sh
RUN python scripts/cutcrossentropy_install.py --uv | sh

# So we can test the Docker image
RUN uv pip install -r requirements-dev.txt -r requirements-tests.txt

# fix so that git fetch/pull from remote works
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
    git config --get remote.origin.fetch

# helper for huggingface-login cli
RUN git config --global credential.helper store


================================================
FILE: cicd/Dockerfile.jinja
================================================
FROM axolotlai/axolotl-base:{{ BASE_TAG }}

ENV TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 8.7 8.9 9.0+PTX"
ENV AXOLOTL_EXTRAS="{{ AXOLOTL_EXTRAS }}"
ENV AXOLOTL_ARGS="{{ AXOLOTL_ARGS }}"
ENV CUDA="{{ CUDA }}"
ENV PYTORCH_VERSION="{{ PYTORCH_VERSION }}"
ENV GITHUB_REF="{{ GITHUB_REF }}"
ENV GITHUB_SHA="{{ GITHUB_SHA }}"
ENV NIGHTLY_BUILD="{{ NIGHTLY_BUILD }}"
ENV HF_HOME="{{ HF_HOME }}"
ENV AXOLOTL_DATASET_NUM_PROC="8"

RUN apt-get update && \
    apt-get install -y --allow-change-held-packages vim curl nano zstd libnccl2 libnccl-dev ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm

WORKDIR /workspace

RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git

WORKDIR /workspace/axolotl

RUN git fetch origin +$GITHUB_REF && \
    git checkout FETCH_HEAD

# If AXOLOTL_EXTRAS is set, append it in brackets
RUN if [ "$NIGHTLY_BUILD" = "true" ] ; then \
        sed -i 's#^transformers.*#transformers @ git+https://github.com/huggingface/transformers.git@main#' requirements.txt; \
        sed -i 's#^peft.*#peft @ git+https://github.com/huggingface/peft.git@main#' requirements.txt; \
        sed -i 's#^accelerate.*#accelerate @ git+https://github.com/huggingface/accelerate.git@main#' requirements.txt; \
        sed -i 's#^trl.*#trl @ git+https://github.com/huggingface/trl.git@main#' requirements.txt; \
        sed -i 's#^datasets.*#datasets @ git+https://github.com/huggingface/datasets.git@main#' requirements.txt; \
    fi

RUN pip install packaging==26.0 setuptools==78.1.1 psutil
RUN pip uninstall -y causal_conv1d
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
        pip install --no-build-isolation -e .[deepspeed,flash-attn,ring-flash-attn,optimizers,ray] $AXOLOTL_ARGS; \
    fi

RUN python scripts/unsloth_install.py | sh
RUN python scripts/cutcrossentropy_install.py | sh

# So we can test the Docker image
RUN pip install -r requirements-dev.txt -r requirements-tests.txt

# fix so that git fetch/pull from remote works
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
    git config --get remote.origin.fetch

# helper for huggingface-login cli
RUN git config --global credential.helper store


================================================
FILE: cicd/__init__.py
================================================


================================================
FILE: cicd/cicd.sh
================================================
#!/bin/bash
set -e

python -c "import torch; assert '$PYTORCH_VERSION' in torch.__version__"

curl -L https://axolotl-ci.b-cdn.net/hf-cache.tar.zst | tar -xpf - -C "${HF_HOME}/hub/"  --use-compress-program unzstd --strip-components=1
# hf download "NousResearch/Meta-Llama-3-8B"
# hf download "NousResearch/Meta-Llama-3-8B-Instruct"
# hf download "microsoft/Phi-4-reasoning"
# hf download "microsoft/Phi-3.5-mini-instruct"
# hf download "microsoft/Phi-3-medium-128k-instruct"

# Run unit tests with initial coverage report
pytest -v --durations=10 -n8 \
  --ignore=tests/e2e/ \
  --ignore=tests/patched/ \
  --ignore=tests/cli \
  /workspace/axolotl/tests/ \
  --cov=axolotl

# Run lora kernels tests with coverage append
pytest -v --durations=10 \
  /workspace/axolotl/tests/e2e/patched/lora_kernels \
  --cov=axolotl \
  --cov-append

# Run patched tests excluding lora kernels with coverage append
pytest --full-trace -vvv --durations=10 \
  --ignore=tests/e2e/patched/lora_kernels \
  /workspace/axolotl/tests/e2e/patched \
  --cov=axolotl \
  --cov-append

# Run solo tests with coverage append
pytest -v --durations=10 -n1 \
  /workspace/axolotl/tests/e2e/solo/ \
  --cov=axolotl \
  --cov-append

# Run integration tests with coverage append
pytest -v --durations=10 \
  /workspace/axolotl/tests/e2e/integrations/ \
  --cov=axolotl \
  --cov-append

pytest -v --durations=10 /workspace/axolotl/tests/cli \
  --cov=axolotl \
  --cov-append

# Run remaining e2e tests with coverage append and final report
pytest -v --durations=10 \
  --ignore=tests/e2e/solo/ \
  --ignore=tests/e2e/patched/ \
  --ignore=tests/e2e/multigpu/ \
  --ignore=tests/e2e/integrations/ \
  --ignore=tests/cli \
  /workspace/axolotl/tests/e2e/ \
  --cov=axolotl \
  --cov-append \
  --cov-report=xml:e2e-coverage.xml

codecov upload-process -t $CODECOV_TOKEN -f e2e-coverage.xml -F e2e,pytorch-${PYTORCH_VERSION} || true


================================================
FILE: cicd/cleanup.py
================================================
"""Modal app to run axolotl GPU cleanup"""

from .single_gpu import VOLUME_CONFIG, app, cicd_image, run_cmd


@app.function(
    image=cicd_image,
    timeout=60 * 60,
    cpu=8.0,
    memory=131072,
    volumes=VOLUME_CONFIG,
)
def cleanup():
    run_cmd("./cicd/cleanup.sh", "/workspace/axolotl")


@app.local_entrypoint()
def main():
    cleanup.remote()


================================================
FILE: cicd/cleanup.sh
================================================
#!/bin/bash
set -e

# cleanup old cache files for datasets processing and intermediate mappings
find /workspace/data/huggingface-cache/hub/datasets -name "cache-*" -type f -mtime +1 -exec rm {} \;
find /workspace/data/huggingface-cache/hub/datasets -name "*.lock" -type f -mtime +1 -exec rm {} \;


================================================
FILE: cicd/e2e_tests.py
================================================
"""Modal app to run axolotl GPU tests"""

from .single_gpu import GPU_CONFIG, VOLUME_CONFIG, app, cicd_image, run_cmd


@app.function(
    image=cicd_image,
    gpu=GPU_CONFIG,
    timeout=120 * 60,  # 90 min
    cpu=8.0,
    memory=131072,
    volumes=VOLUME_CONFIG,
)
def cicd_pytest():
    run_cmd("./cicd/cicd.sh", "/workspace/axolotl")


@app.local_entrypoint()
def main():
    cicd_pytest.remote()


================================================
FILE: cicd/multigpu.py
================================================
"""
modal application to run axolotl gpu tests in Modal
"""

import os
import pathlib
import tempfile

import jinja2
import modal
from jinja2 import select_autoescape
from modal import App, Image

cicd_path = pathlib.Path(__file__).parent.resolve()

template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
template_env = jinja2.Environment(
    loader=template_loader, autoescape=select_autoescape()
)
dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
df_template = template_env.get_template(dockerfile)

df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
    "CUDA": os.environ.get("CUDA", "126"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
    "HF_HOME": "/workspace/data/huggingface-cache/hub",
    "PYTHONUNBUFFERED": os.environ.get("PYTHONUNBUFFERED", "1"),
    "DEEPSPEED_LOG_LEVEL": os.environ.get("DEEPSPEED_LOG_LEVEL", "WARNING"),
}

dockerfile_contents = df_template.render(**df_args)

temp_dir = tempfile.mkdtemp()
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
    f.write(dockerfile_contents)

cicd_image = Image.from_dockerfile(
    pathlib.Path(temp_dir) / "Dockerfile",
    force_build=True,
    gpu="A10G",
).env(df_args)

app = App("Axolotl CI/CD", secrets=[])

hf_cache_volume = modal.Volume.from_name(
    "axolotl-ci-hf-hub-cache", create_if_missing=True
)
VOLUME_CONFIG = {
    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
}

N_GPUS = int(os.environ.get("N_GPUS", 2))
GPU_CONFIG = f"H100:{N_GPUS}"


def run_cmd(cmd: str, run_folder: str):
    import subprocess  # nosec

    # Propagate errors from subprocess.
    if exit_code := subprocess.call(cmd.split(), cwd=run_folder):  # nosec
        exit(exit_code)


@app.function(
    image=cicd_image,
    gpu=GPU_CONFIG,
    timeout=120 * 60,
    cpu=16.0,
    memory=131072 * N_GPUS,
    volumes=VOLUME_CONFIG,
)
def cicd_pytest():
    run_cmd("./cicd/multigpu.sh", "/workspace/axolotl")


@app.local_entrypoint()
def main():
    cicd_pytest.remote()


================================================
FILE: cicd/multigpu.sh
================================================
#!/bin/bash
set -e

# Only run two tests at a time to avoid OOM on GPU (with coverage collection)
pytest -v --durations=10 -n2 --maxfail=3 \
  --ignore=/workspace/axolotl/tests/e2e/multigpu/solo/ \
  --ignore=/workspace/axolotl/tests/e2e/multigpu/patched/ \
  /workspace/axolotl/tests/e2e/multigpu/ \
  --cov=axolotl

# Run solo tests with coverage append
pytest -v --durations=10 -n1 \
  /workspace/axolotl/tests/e2e/multigpu/solo/ \
  --cov=axolotl \
  --cov-append

pytest -v  --durations=10 -n1 /workspace/axolotl/tests/e2e/multigpu/patched/ \
  --cov=axolotl \
  --cov-append \
  --cov-report=xml:multigpu-coverage.xml

# Upload coverage to Codecov if CODECOV_TOKEN is available
if [ -n "$CODECOV_TOKEN" ]; then
  codecov upload-process -t "${CODECOV_TOKEN}" -f multigpu-coverage.xml -F multigpu,docker-tests,pytorch-${PYTORCH_VERSION} || true
fi


================================================
FILE: cicd/single_gpu.py
================================================
"""Modal app to run axolotl GPU tests"""

import os
import pathlib
import tempfile

import jinja2
import modal
import modal.experimental
from jinja2 import select_autoescape
from modal import App

cicd_path = pathlib.Path(__file__).parent.resolve()

template_loader = jinja2.FileSystemLoader(searchpath=cicd_path)
template_env = jinja2.Environment(
    loader=template_loader, autoescape=select_autoescape()
)
dockerfile = os.environ.get("E2E_DOCKERFILE", "Dockerfile.jinja")
df_template = template_env.get_template(dockerfile)

df_args = {
    "AXOLOTL_EXTRAS": os.environ.get("AXOLOTL_EXTRAS", ""),
    "AXOLOTL_ARGS": os.environ.get("AXOLOTL_ARGS", ""),
    "PYTORCH_VERSION": os.environ.get("PYTORCH_VERSION", "2.6.0"),
    "BASE_TAG": os.environ.get("BASE_TAG", "main-base-py3.11-cu126-2.6.0"),
    "CUDA": os.environ.get("CUDA", "126"),
    "GITHUB_REF": os.environ.get("GITHUB_REF", "refs/heads/main"),
    "GITHUB_SHA": os.environ.get("GITHUB_SHA", ""),
    "NIGHTLY_BUILD": os.environ.get("NIGHTLY_BUILD", ""),
    "CODECOV_TOKEN": os.environ.get("CODECOV_TOKEN", ""),
    "HF_HOME": "/workspace/data/huggingface-cache/hub",
    "PYTHONUNBUFFERED": os.environ.get("PYTHONUNBUFFERED", "1"),
    "DEEPSPEED_LOG_LEVEL": os.environ.get("DEEPSPEED_LOG_LEVEL", "WARNING"),
}

dockerfile_contents = df_template.render(**df_args)

temp_dir = tempfile.mkdtemp()
with open(pathlib.Path(temp_dir) / "Dockerfile", "w", encoding="utf-8") as f:
    f.write(dockerfile_contents)

cicd_image = modal.experimental.raw_dockerfile_image(
    pathlib.Path(temp_dir) / "Dockerfile",
    # context_mount=None,
    force_build=True,
    # gpu="A10G",
).env(df_args)

app = App("Axolotl CI/CD", secrets=[])

hf_cache_volume = modal.Volume.from_name(
    "axolotl-ci-hf-hub-cache", create_if_missing=True
)
VOLUME_CONFIG = {
    "/workspace/data/huggingface-cache/hub": hf_cache_volume,
}

N_GPUS = int(os.environ.get("N_GPUS", 1))
GPU_TYPE = os.environ.get("GPU_TYPE", "L40S")
GPU_CONFIG = f"{GPU_TYPE}:{N_GPUS}"


def run_cmd(cmd: str, run_folder: str):
    import subprocess  # nosec

    sp_env = os.environ.copy()
    sp_env["AXOLOTL_DATASET_NUM_PROC"] = "8"

    # Propagate errors from subprocess.
    exit_code = subprocess.call(cmd.split(), cwd=run_folder, env=sp_env)  # nosec
    if exit_code:
        raise RuntimeError(f"Command '{cmd}' failed with exit code {exit_code}")


================================================
FILE: codecov.yml
================================================
codecov:
  require_ci_to_pass: yes
  notify:
    wait_for_ci: true

coverage:
  precision: 2
  round: down
  range: "70...100"
  status:
    project:
      default:
        # basic
        target: auto
        threshold: 1%
        base: auto
        # advanced
        branches: null
        if_no_uploads: error
        if_not_found: success
        if_ci_failed: error
        only_pulls: true
        flags: null
        paths: null
        informational: true
    patch:
      default:
        # basic
        target: auto
        threshold: 1%
        base: auto
        # advanced
        branches: null
        if_no_uploads: error
        if_not_found: success
        if_ci_failed: error
        only_pulls: false
        flags: null
        paths: null

parsers:
  gcov:
    branch_detection:
      conditional: yes
      loop: yes
      method: no
      macro: no

comment:
  layout: "reach,diff,flags,files,footer"
  behavior: default
  require_changes: no
  require_base: no
  require_head: yes

github_checks:
  annotations: false


================================================
FILE: deepspeed_configs/zero1.json
================================================
{
  "zero_optimization": {
    "stage": 1,
    "overlap_comm": true
  },
  "bf16": {
    "enabled": "auto"
  },
  "fp16": {
    "enabled": "auto",
    "auto_cast": false,
    "loss_scale": 0,
    "initial_scale_power": 32,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}


================================================
FILE: deepspeed_configs/zero1_torch_compile.json
================================================
{
  "zero_optimization": {
    "stage": 1,
    "overlap_comm": true
  },
  "bf16": {
    "enabled": "auto"
  },
  "fp16": {
    "enabled": "auto",
    "auto_cast": false,
    "loss_scale": 0,
    "initial_scale_power": 32,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "compile": {
    "disable": false,
    "backend": "inductor"
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}


================================================
FILE: deepspeed_configs/zero2.json
================================================
{
  "zero_optimization": {
    "stage": 2,
    "offload_optimizer": {
      "device": "cpu"
    },
    "contiguous_gradients": true,
    "overlap_comm": true
  },
  "bf16": {
    "enabled": "auto"
  },
  "fp16": {
    "enabled": "auto",
    "auto_cast": false,
    "loss_scale": 0,
    "initial_scale_power": 32,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}


================================================
FILE: deepspeed_configs/zero2_torch_compile.json
================================================
{
  "compile": {
    "disable": false,
    "backend": "inductor"
  },
  "zero_optimization": {
    "stage": 2,
    "offload_optimizer": {
      "device": "cpu"
    },
    "contiguous_gradients": true,
    "overlap_comm": true
  },
  "bf16": {
    "enabled": "auto"
  },
  "fp16": {
    "enabled": "auto",
    "auto_cast": false,
    "loss_scale": 0,
    "initial_scale_power": 32,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}


================================================
FILE: deepspeed_configs/zero3.json
================================================
{
  "zero_optimization": {
    "stage": 3,
    "overlap_comm": true,
    "contiguous_gradients": true,
    "sub_group_size": 0,
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
    "max_live_parameters": 0,
    "max_reuse_distance": 0,
    "gather_16bit_weights_on_model_save": true
  },
  "bf16": {
    "enabled": "auto"
  },
  "fp16": {
    "enabled": "auto",
    "auto_cast": false,
    "loss_scale": 0,
    "initial_scale_power": 32,
    "loss_scale_window": 1000,
    "hysteresis": 2,
    "min_loss_scale": 1
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}


================================================
FILE: deepspeed_configs/zero3_bf16.json
================================================
{
  "zero_optimization": {
    "stage": 3,
    "overlap_comm": true,
    "contiguous_gradients": true,
    "sub_group_size": 0,
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
    "max_live_parameters": 0,
    "max_reuse_distance": 0,
    "gather_16bit_weights_on_model_save": true
  },
  "bf16": {
    "enabled": true
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}


================================================
FILE: deepspeed_configs/zero3_bf16_cpuoffload_all.json
================================================
{
  "zero_force_ds_cpu_optimizer": false,
  "zero_allow_untested_optimizer": true,
  "zero_optimization": {
    "stage": 3,
    "offload_optimizer": {
      "device": "cpu",
      "pin_memory": true
    },
    "offload_param": {
      "device": "cpu",
      "pin_memory": true
    },
    "overlap_comm": true,
    "contiguous_gradients": true,
    "sub_group_size": 0,
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
    "max_live_parameters": 0,
    "max_reuse_distance": 0,
    "gather_16bit_weights_on_model_save": true
  },
  "bf16": {
    "enabled": true
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}


================================================
FILE: deepspeed_configs/zero3_bf16_cpuoffload_params.json
================================================
{
  "zero_force_ds_cpu_optimizer": false,
  "zero_allow_untested_optimizer": true,
  "zero_optimization": {
    "stage": 3,
    "offload_param": {
      "device": "cpu",
      "pin_memory": true
    },
    "overlap_comm": true,
    "contiguous_gradients": true,
    "sub_group_size": 0,
    "reduce_bucket_size": "auto",
    "stage3_prefetch_bucket_size": "auto",
    "stage3_param_persistence_threshold": "auto",
    "max_live_parameters": 0,
    "max_reuse_distance": 0,
    "gather_16bit_weights_on_model_save": true
  },
  "bf16": {
    "enabled": true
  },
  "gradient_accumulation_steps": "auto",
  "gradient_clipping": "auto",
  "train_batch_size": "auto",
  "train_micro_batch_size_per_gpu": "auto",
  "wall_clock_breakdown": false
}


================================================
FILE: devtools/README.md
================================================
This directory contains example config files that might be useful for debugging. Please see [docs/debugging.qmd](../docs/debugging.qmd) for more information.


================================================
FILE: devtools/dev_chat_template.yml
================================================
# Example config for debugging the chat_template prompt format
base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer

load_in_8bit: true
load_in_4bit: false

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
    shards: 10
val_set_size: 0
output_dir: temp_debug/axolotl_outputs/model
dataset_prepared_path: temp_debug/axolotl_outputs/data
dataset_num_proc: 1

sequence_len: 4096
sample_packing: false
pad_to_sequence_len: true

adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_fan_in_fan_out:

micro_batch_size: 1
num_epochs: 1
max_steps: 10
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: false
fp16: true
tf32: false

gradient_checkpointing: true
logging_steps: 1
flash_attention: true

warmup_steps: 10
weight_decay: 0.0


================================================
FILE: docker/Dockerfile
================================================
ARG BASE_TAG=main-base
FROM axolotlai/axolotl-base:$BASE_TAG

ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
ARG AXOLOTL_EXTRAS=""
ARG AXOLOTL_ARGS=""
ARG CUDA="118"
ARG PYTORCH_VERSION="2.1.2"
ARG TARGETARCH

ENV PYTORCH_VERSION=$PYTORCH_VERSION

RUN apt-get update && \
    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs && \
    rm -rf /var/cache/apt/archives && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git

WORKDIR /workspace/axolotl

# If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
RUN pip uninstall -y causal_conv1d
RUN if [ "$TARGETARCH" = "arm64" ]; then \
        BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
    else \
        BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
    fi && \
    if [ "$AXOLOTL_EXTRAS" != "" ]; then \
        pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
        pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
    fi && \    python scripts/unsloth_install.py | sh && \
    python scripts/cutcrossentropy_install.py | sh && \
    pip install pytest && \
    pip cache purge

# fix so that git fetch/pull from remote works with shallow clone
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
    git config --get remote.origin.fetch && \
    git config --global credential.helper store

COPY .axolotl-complete.bash /root/.axolotl-complete.bash
RUN chmod +x /root/.axolotl-complete.bash && \
    echo 'source /root/.axolotl-complete.bash' >> ~/.bashrc


================================================
FILE: docker/Dockerfile-base
================================================
ARG CUDA_VERSION="11.8.0"
ARG CUDNN_VERSION="8"
ARG UBUNTU_VERSION="22.04"
ARG MAX_JOBS=4
ARG TARGETARCH

FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder

ENV PATH="/root/miniconda3/bin:${PATH}"

ARG TARGETARCH
ARG PYTHON_VERSION="3.11"
ARG PYTORCH_VERSION="2.1.2"
ARG CUDA="128"
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

ENV PYTHON_VERSION=$PYTHON_VERSION
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        wget git build-essential ninja-build git-lfs libaio-dev pkg-config \
        ibverbs-providers ibverbs-utils infiniband-diags  \
        librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm \
    && rm -rf /var/cache/apt/archives \
    && rm -rf /var/lib/apt/lists/* \
    && if [ "$TARGETARCH" = "amd64" ]; then \
        MINICONDA_ARCH="x86_64"; \
    elif [ "$TARGETARCH" = "arm64" ]; then \
        MINICONDA_ARCH="aarch64"; \
    else \
        echo "Unsupported architecture: $TARGETARCH"; exit 1; \
    fi \
    && wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh \
    && mkdir /root/.conda \
    && bash Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh -b \
    && rm -f Miniconda3-latest-Linux-${MINICONDA_ARCH}.sh \
    && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
    && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \
    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"

ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

WORKDIR /workspace

RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==26.0 setuptools==75.8.0 wheel psutil && \
    python3 -m pip install --no-cache-dir -U torch==${PYTORCH_VERSION}+cu${CUDA} torchvision --extra-index-url https://download.pytorch.org/whl/cu$CUDA && \
    python3 -m pip cache purge

RUN if [ "$CUDA" != "130" ] ; then \
        CAUSAL_CONV1D_FORCE_CXX11_ABI=TRUE CAUSAL_CONV1D_FORCE_BUILD=TRUE python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@v1.5.4"; \
        python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"; \
        python3 -m pip cache purge; \
    fi

RUN git lfs install --skip-repo && \
    pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
    pip3 install -U --no-cache-dir pydantic==1.10.10 && \
    pip3 cache purge

# Map Python version (e.g., 3.12 -> cp312)
RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \
    # Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10)
    TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \
    # Map architecture
    case "$TARGETARCH" in \
        amd64) ARCH_TAG="x86_64" ;; \
        arm64) ARCH_TAG="aarch64" ;; \
        *) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \
    esac && \
    WHL_VERSION="v0.7.16" && \
    WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-linux_${ARCH_TAG}.whl" && \
    wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \
    pip3 install --no-cache-dir "${WHL_FILE}" && \
    rm "${WHL_FILE}"


================================================
FILE: docker/Dockerfile-base-next
================================================
ARG CUDA_VERSION="12.8.1"
ARG CUDNN_VERSION="8"
ARG UBUNTU_VERSION="22.04"
ARG MAX_JOBS=4

FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder

ENV PATH="/root/miniconda3/bin:${PATH}"

ARG PYTHON_VERSION="3.11"
ARG PYTORCH_VERSION="next"
ARG CUDA="128"
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

ENV PYTHON_VERSION=$PYTHON_VERSION
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

RUN apt-get update \
    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
    && wget \
    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
    && mkdir /root/.conda \
    && bash Miniconda3-latest-Linux-x86_64.sh -b \
    && rm -f Miniconda3-latest-Linux-x86_64.sh \
    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"

ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

WORKDIR /workspace

RUN python3 -m pip install --upgrade pip && pip3 install packaging && \
    python3 -m pip install --no-cache-dir -U torch==2.7.1 --extra-index-url https://download.pytorch.org/whl/test/cu$CUDA && \
    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"

RUN git lfs install --skip-repo && \
    pip3 install awscli && \
    pip3 install -U --no-cache-dir pydantic==2.10.6


================================================
FILE: docker/Dockerfile-base-nightly
================================================
ARG CUDA_VERSION="12.8.1"
ARG CUDNN_VERSION="8"
ARG UBUNTU_VERSION="22.04"
ARG MAX_JOBS=4

FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder

ENV PATH="/root/miniconda3/bin:${PATH}"

ARG PYTHON_VERSION="3.11"
ARG PYTORCH_VERSION="nightly"
ARG CUDA="128"
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

ENV PYTHON_VERSION=$PYTHON_VERSION
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST

RUN apt-get update \
    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config && rm -rf /var/lib/apt/lists/* \
    && wget \
    https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh \
    && mkdir /root/.conda \
    && bash Miniconda3-latest-Linux-x86_64.sh -b \
    && rm -f Miniconda3-latest-Linux-x86_64.sh \
    && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/main \
    && conda tos accept --override-channels --channel https://repo.anaconda.com/pkgs/r \
    && conda create -n "py${PYTHON_VERSION}" python="${PYTHON_VERSION}"

ENV PATH="/root/miniconda3/envs/py${PYTHON_VERSION}/bin:${PATH}"

WORKDIR /workspace

RUN python3 -m pip install --upgrade pip && pip3 install -U packaging==26.0 setuptools==75.8.0 wheel && \
    python3 -m pip install --no-cache-dir -U torch --extra-index-url https://download.pytorch.org/whl/nightly/cu$CUDA && \
    python3 -m pip install --no-cache-dir "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main" && \
    python3 -m pip install --no-cache-dir "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main" && \
    python3 -m pip cache purge

RUN git lfs install --skip-repo && \
    pip3 install awscli && \
    # The base image ships with `pydantic==1.8.2` which is not working
    pip3 install -U --no-cache-dir pydantic==1.10.10 && \
    pip3 cache purge


================================================
FILE: docker/Dockerfile-cloud
================================================
ARG BASE_TAG=main
FROM axolotlai/axolotl:$BASE_TAG

ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
ENV HF_HUB_ENABLE_HF_TRANSFER="1"

EXPOSE 8888
EXPOSE 22

COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
COPY scripts/motd /etc/motd

RUN pip install jupyterlab notebook ipywidgets && \
    jupyter lab clean
RUN apt update && \
    apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \
    rm -rf /var/cache/apt/archives && \
    rm -rf /var/lib/apt/lists/* && \
    mkdir -p ~/.ssh && \
    chmod 700 ~/.ssh && \
    printf "\n[[ -z \"\$TMUX\"  ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
    printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
    chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
    chmod +x /root/cloud-entrypoint.sh && \
    echo 'set-option -g history-limit 5000' >> ~/.tmux.conf

ENTRYPOINT ["/root/cloud-entrypoint.sh"]
CMD ["sleep", "infinity"]


================================================
FILE: docker/Dockerfile-cloud-no-tmux
================================================
ARG BASE_TAG=main
FROM axolotlai/axolotl:$BASE_TAG

ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
ENV HF_HUB_ENABLE_HF_TRANSFER="1"

EXPOSE 8888
EXPOSE 22

COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
COPY scripts/motd /etc/motd

RUN pip install jupyterlab notebook ipywidgets && \
    jupyter lab clean
RUN apt update && \
    apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop ibverbs-providers ibverbs-utils infiniband-diags librdmacm-dev librdmacm1 rdmacm-utils slurm-wlm && \
    rm -rf /var/cache/apt/archives && \
    rm -rf /var/lib/apt/lists/* && \
    mkdir -p ~/.ssh && \
    chmod 700 ~/.ssh && \
    printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
    chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
    chmod +x /root/cloud-entrypoint.sh

ENTRYPOINT ["/root/cloud-entrypoint.sh"]
CMD ["sleep", "infinity"]


================================================
FILE: docker/Dockerfile-cloud-uv
================================================
ARG BASE_TAG=main
FROM axolotlai/axolotl-uv:$BASE_TAG

ENV HF_DATASETS_CACHE="/workspace/data/huggingface-cache/datasets"
ENV HF_HUB_CACHE="/workspace/data/huggingface-cache/hub"
ENV HF_HOME="/workspace/data/huggingface-cache/hub"
ENV HF_HUB_ENABLE_HF_TRANSFER="1"

EXPOSE 8888
EXPOSE 22

COPY scripts/cloud-entrypoint.sh /root/cloud-entrypoint.sh
COPY scripts/motd /etc/motd

RUN uv pip install jupyterlab notebook ipywidgets && \
    jupyter lab clean
RUN apt update && \
    apt install --yes --no-install-recommends openssh-server tmux iproute2 nvtop && \
    rm -rf /var/cache/apt/archives && \
    rm -rf /var/lib/apt/lists/* && \
    mkdir -p ~/.ssh && \
    chmod 700 ~/.ssh && \
    printf "\n[[ -z \"\$TMUX\"  ]] && { tmux attach-session -t ssh_tmux || tmux new-session -s ssh_tmux; exit; }\n" >> ~/.bashrc && \
    printf "[ ! -z \"\$TERM\" -a -r /etc/motd ] && cat /etc/motd\n" >> ~/.bashrc && \
    chmod +x /workspace/axolotl/scripts/cloud-entrypoint.sh && \
    chmod +x /root/cloud-entrypoint.sh && \
    echo 'set-option -g history-limit 5000' >> ~/.tmux.conf

ENTRYPOINT ["/root/cloud-entrypoint.sh"]
CMD ["sleep", "infinity"]


================================================
FILE: docker/Dockerfile-tests
================================================
ARG BASE_TAG=main-base
FROM axolotlai/axolotl-base:$BASE_TAG

ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
ARG AXOLOTL_EXTRAS=""
ARG AXOLOTL_ARGS=""
ARG CUDA="118"
ARG PYTORCH_VERSION="2.1.2"
ARG GITHUB_REF="main"

ENV PYTORCH_VERSION=$PYTORCH_VERSION

RUN apt-get update && \
    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev

WORKDIR /workspace

RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git

WORKDIR /workspace/axolotl

RUN git fetch origin +$GITHUB_REF && \
    git checkout FETCH_HEAD

# If AXOLOTL_EXTRAS is set, append it in brackets
RUN if [ "$AXOLOTL_EXTRAS" != "" ] ; then \
        pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
        pip install --no-build-isolation -e .[deepspeed,flash-attn,mamba-ssm] $AXOLOTL_ARGS; \
    fi

# So we can test the Docker image
RUN pip install pytest

# fix so that git fetch/pull from remote works
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
    git config --get remote.origin.fetch

# helper for huggingface-login cli
RUN git config --global credential.helper store


================================================
FILE: docker/Dockerfile-uv
================================================
ARG BASE_TAG=main-base
FROM axolotlai/axolotl-base-uv:$BASE_TAG

ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6+PTX"
ARG AXOLOTL_EXTRAS=""
ARG AXOLOTL_ARGS=""
ARG CUDA="118"
ARG PYTORCH_VERSION="2.1.2"
ARG TARGETARCH

ENV PYTORCH_VERSION=$PYTORCH_VERSION

RUN apt-get update && \
    apt-get install -y --allow-change-held-packages vim curl nano libnccl2 libnccl-dev rsync s3fs && \
    rm -rf /var/cache/apt/archives && \
    rm -rf /var/lib/apt/lists/*

WORKDIR /workspace

RUN git clone --depth=1 https://github.com/axolotl-ai-cloud/axolotl.git

WORKDIR /workspace/axolotl

# If AXOLOTL_EXTRAS is set, append it in brackets; don't install deepspeed with arm64
RUN uv pip uninstall causal_conv1d
RUN if [ "$TARGETARCH" = "arm64" ]; then \
        BASE_EXTRAS="flash-attn,ring-flash-attn,optimizers,ray"; \
    else \
        BASE_EXTRAS="deepspeed,flash-attn,ring-flash-attn,optimizers,ray"; \
    fi && \
    if [ "$AXOLOTL_EXTRAS" != "" ]; then \
        uv pip install --no-build-isolation -e .[$BASE_EXTRAS,$AXOLOTL_EXTRAS] $AXOLOTL_ARGS; \
    else \
        uv pip install --no-build-isolation -e .[$BASE_EXTRAS] $AXOLOTL_ARGS; \
    fi && \
    python scripts/unsloth_install.py --uv | sh && \
    python scripts/cutcrossentropy_install.py --uv | sh && \
    uv pip install pytest && \
    uv cache clean

# fix so that git fetch/pull from remote works with shallow clone
RUN git config remote.origin.fetch "+refs/heads/*:refs/remotes/origin/*" && \
    git config --get remote.origin.fetch && \
    git config --global credential.helper store

COPY .axolotl-complete.bash /root/.axolotl-complete.bash
RUN chmod +x /root/.axolotl-complete.bash && \
    echo 'source /root/.axolotl-complete.bash' >> ~/.bashrc


================================================
FILE: docker/Dockerfile-uv-base
================================================
ARG CUDA_VERSION="12.6.3"
ARG CUDNN_VERSION=""
ARG UBUNTU_VERSION="22.04"
ARG MAX_JOBS=4
ARG TARGETARCH

FROM nvidia/cuda:$CUDA_VERSION-cudnn$CUDNN_VERSION-devel-ubuntu$UBUNTU_VERSION AS base-builder

ARG TARGETARCH
ARG PYTHON_VERSION="3.11"
ARG PYTORCH_VERSION="2.6.0"
ARG CUDA="126"
ARG TORCH_CUDA_ARCH_LIST="7.0 7.5 8.0 8.6 9.0+PTX"

ENV PYTHON_VERSION=$PYTHON_VERSION
ENV TORCH_CUDA_ARCH_LIST=$TORCH_CUDA_ARCH_LIST
ENV UV_TORCH_BACKEND="cu${CUDA}"

RUN apt-get update \
    && apt-get install -y wget git build-essential ninja-build git-lfs libaio-dev pkg-config curl && rm -rf /var/lib/apt/lists/* \
    && git lfs install --skip-repo \
    && curl -LsSf https://astral.sh/uv/install.sh | sh

ENV PATH="/root/.local/bin:${PATH}"

RUN uv python install ${PYTHON_VERSION}

WORKDIR /workspace

RUN uv venv --no-project --relocatable axolotl-venv

ENV PATH="/workspace/axolotl-venv/bin:${PATH}"

RUN uv pip install packaging setuptools wheel psutil \
    && uv pip install torch==${PYTORCH_VERSION} torchvision \
    && uv pip install awscli pydantic

RUN if [ "$TARGETARCH" = "amd64" ]; then \
        uv pip install --no-build-isolation "causal_conv1d @ git+https://github.com/Dao-AILab/causal-conv1d.git@main"; \
        uv pip install "mamba_ssm @ git+https://github.com/state-spaces/mamba.git@main"; \
    fi

# Map Python version (e.g., 3.12 -> cp312)
RUN PYTHON_CP="cp$(echo $PYTHON_VERSION | tr -d '.')" && \
    # Map PyTorch version (e.g., 2.9.1 -> torch2.9, 2.10.0 -> torch2.10)
    TORCH_TAG="torch$(echo $PYTORCH_VERSION | grep -oP '^\d+\.\d+')" && \
    # Map architecture
    case "$TARGETARCH" in \
        amd64) ARCH_TAG="x86_64" ;; \
        arm64) ARCH_TAG="aarch64" ;; \
        *) echo "Unsupported architecture: $TARGETARCH"; exit 1 ;; \
    esac && \
    WHL_VERSION="v0.7.16" && \
    WHL_FILE="flash_attn-2.8.3+cu${CUDA}${TORCH_TAG}-${PYTHON_CP}-${PYTHON_CP}-linux_${ARCH_TAG}.whl" && \
    wget -nv "https://github.com/mjun0812/flash-attention-prebuild-wheels/releases/download/${WHL_VERSION}/${WHL_FILE}" && \
    uv pip install --no-cache-dir "${WHL_FILE}" && \
    rm "${WHL_FILE}"


================================================
FILE: docker-compose.yaml
================================================
# version: '3.8'
services:
  axolotl:
    build:
      context: .
      dockerfile: ./docker/Dockerfile
    volumes:
      - .:/workspace/axolotl
      - ~/.cache/huggingface/:/root/.cache/huggingface/
    # set environment variables
    environment:
      # Set environment variables
      - GIT_AUTHOR_NAME=${GIT_AUTHOR_NAME}
      - GIT_AUTHOR_EMAIL=${GIT_AUTHOR_EMAIL}
      - GIT_COMMITTER_NAME=${GIT_COMMITTER_NAME}
      - GIT_COMMITTER_EMAIL=${GIT_COMMITTER_EMAIL}
      - WANDB_API_KEY=${WANDB_API_KEY}
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              # count: 1
              capabilities: [gpu]
    command: tail -f /dev/null


================================================
FILE: docs/.gitignore
================================================
/.quarto/
_site/
/api/*.qmd
/api/*.html
config-reference.qmd
models/**/*.qmd
models/**/*.html


================================================
FILE: docs/amd_hpc.qmd
================================================
---
title: AMD GPUs on HPC Systems
description: A comprehensive guide for using Axolotl on distributed systems with AMD GPUs
---

This guide provides step-by-step instructions for installing and configuring Axolotl on a High-Performance Computing (HPC) environment equipped with AMD GPUs.

## Setup

### 1. Install Python

We recommend using Miniforge, a minimal conda-based Python distribution:

```bash
curl -L -O "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-$(uname)-$(uname -m).sh"
bash Miniforge3-$(uname)-$(uname -m).sh
```

### 2. Configure Python Environment
Add Python to your PATH and ensure it's available at login:

```bash
echo 'export PATH=~/miniforge3/bin:$PATH' >> ~/.bashrc
echo 'if [ -f ~/.bashrc ]; then . ~/.bashrc; fi' >> ~/.bash_profile
```

### 3. Load AMD GPU Software

Load the ROCm module:

```bash
module load rocm/5.7.1
```

Note: The specific module name and version may vary depending on your HPC system. Consult your system documentation for the correct module name.

### 4. Install PyTorch

Install PyTorch with ROCm support:

```bash
pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/rocm5.7 --force-reinstall
```

### 5. Install Flash Attention

Clone and install the Flash Attention repository:

```bash
git clone --recursive https://github.com/ROCmSoftwarePlatform/flash-attention.git
export GPU_ARCHS="gfx90a"
cd flash-attention
export PYTHON_SITE_PACKAGES=$(python -c 'import site; print(site.getsitepackages()[0])')
patch "${PYTHON_SITE_PACKAGES}/torch/utils/hipify/hipify_python.py" hipify_patch.patch
pip install --no-build-isolation .
```

### 6. Install Axolotl

Clone and install Axolotl:

```bash
git clone https://github.com/axolotl-ai-cloud/axolotl
cd axolotl
pip install packaging ninja
pip install --no-build-isolation -e .
```

### 7. Apply xformers Workaround

xformers appears to be incompatible with ROCm. Apply the following workarounds:
 - Edit $HOME/packages/axolotl/src/axolotl/monkeypatch/llama_attn_hijack_flash.py modifying the code to always return `False` for SwiGLU availability from xformers.
 - Edit $HOME/miniforge3/lib/python3.10/site-packages/xformers/ops/swiglu_op.py replacing the "SwiGLU" function with a pass statement.

### 8. Prepare Job Submission Script

Create a script for job submission using your HPC's particular software (e.g. Slurm, PBS). Include necessary environment setup and the command to run Axolotl training. If the compute node(s) do(es) not have internet access, it is recommended to include

```bash
export TRANSFORMERS_OFFLINE=1
export HF_DATASETS_OFFLINE=1
```

### 9. Download Base Model

Download a base model using the Hugging Face CLI:

```bash
hf download meta-llama/Meta-Llama-3.1-8B --local-dir ~/hfdata/llama3.1-8B
```

### 10. Create Axolotl Configuration

Create an Axolotl configuration file (YAML format) tailored to your specific training requirements and dataset. Use FSDP for multi-node training.

Note: Deepspeed did not work at the time of testing. However, if anyone managed to get it working, please let us know.

### 11. Preprocess Data

Run preprocessing on the login node:

```bash
CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess /path/to/your/config.yaml
```

### 12. Train

You are now ready to submit your previously prepared job script. 🚂


================================================
FILE: docs/attention.qmd
================================================
---
title: Attention
description: Supported attention modules in Axolotl
---

## SDP Attention

This is the default built-in attention in PyTorch.

```yaml
sdp_attention: true
```

For more details: [PyTorch docs](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)

## Flash Attention

Axolotl supports Flash Attention 2, 3, and 4. The best available version is used automatically
based on your installed packages and GPU.

```yaml
flash_attention: true
```

For more details: [Flash Attention](https://github.com/Dao-AILab/flash-attention/)

### Flash Attention 2

Requirements: Ampere, Ada, or Hopper GPUs (Turing or lower not supported)

```bash
pip install flash-attn --no-build-isolation
```

::: {.callout-tip}

If you get `undefined symbol` while training, ensure you installed PyTorch prior to Axolotl.
Alternatively, try reinstall or downgrade a version.

:::

### Flash Attention 3

Requirements: Hopper only and CUDA 12.8 (recommended)

```bash
git clone https://github.com/Dao-AILab/flash-attention.git
cd flash-attention/hopper

python setup.py install
```

### Flash Attention 4

Requirements: Hopper or Blackwell GPUs

```bash
pip install flash-attn-4
```

Or from source:

```bash
git clone https://github.com/Dao-AILab/flash-attention.git
cd flash-attention/flash_attn/cute

pip install -e .

# FA2's flash_attn package includes a cute/ stub that shadows FA4.
# Remove it so Python can find the real FA4 module:
rm -r $(python -c "import flash_attn; print(flash_attn.__path__[0])")/cute
```

::: {.callout-note}

**Hopper (SM90) users**: The backward kernel is not yet included in the pip package. To use FA4
for training on Hopper, install from source using the instructions above.

:::

::: {.callout-warning}

FA4 only supports head dimensions up to 128 (`d ≤ 128`). The DeepSeek shape `(192, 128)` is
also supported but only on Blackwell. Axolotl automatically detects incompatible head dimensions
and falls back to FA2/3.

:::

For more details: [flash-attention/flash_attn/cute](https://github.com/Dao-AILab/flash-attention/tree/main/flash_attn/cute)

### AMD

Requirements: ROCm 6.0 and above.

See [Flash Attention AMD docs](https://github.com/Dao-AILab/flash-attention/tree/main?tab=readme-ov-file#amd-rocm-support).

## Flex Attention

A flexible PyTorch API for attention used in combination with `torch.compile`.

```yaml
flex_attention: true

# recommended
torch_compile: true
```

::: {.callout-note}

We recommend using latest stable version of PyTorch for best performance.

:::

For more details: [PyTorch docs](https://pytorch.org/blog/flexattention/)

## SageAttention

Attention kernels with QK Int8 and PV FP16 accumulator.

```yaml
sage_attention: true
```

Requirements: Ampere, Ada, or Hopper GPUs

```bash
pip install sageattention==2.2.0 --no-build-isolation
```

::: {.callout-warning}

Only LoRA/QLoRA recommended at the moment. We found loss drop to 0 for full finetuning. See [GitHub Issue](https://github.com/thu-ml/SageAttention/issues/198).

:::

For more details: [Sage Attention](https://github.com/thu-ml/SageAttention)

::: {.callout-note}

We do not support SageAttention 3 at the moment. If you are interested on adding this or improving SageAttention implementation, please make an Issue.

:::


## xFormers

```yaml
xformers_attention: true
```

::: {.callout-tip}

We recommend using with Turing GPUs or below (such as on Colab).

:::

For more details: [xFormers](https://github.com/facebookresearch/xformers)

## Shifted Sparse Attention

::: {.callout-warning}

We plan to deprecate this! If you use this feature, we recommend switching to methods above.

:::

Requirements: LLaMA model architecture

```yaml
flash_attention: true
s2_attention: true
```

::: {.callout-tip}

No sample packing support!

:::


================================================
FILE: docs/batch_vs_grad.qmd
================================================
---
title: Batch size vs Gradient accumulation
description: Understanding of batch size and gradient accumulation steps
---

Gradient accumulation means accumulating gradients over several mini-batches and updating the model weights afterward. When the samples in each batch are diverse, this technique doesn't significantly impact learning.

This method allows for effective training with larger effective batch sizes without needing proportionally larger memory. Here's why:

1. **Memory Consumption with Batch Size**: The primary reason increasing the batch size impacts memory is due to the storage requirements for intermediate activations. When you forward propagate a batch through a network, you have to store the activations at each layer for each sample in the batch, because these activations are used during backpropagation to compute gradients. Therefore, larger batches mean more activations, leading to greater GPU memory consumption.

2. **Gradient Accumulation**: With gradient accumulation, you're effectively simulating a larger batch size by accumulating gradients over several smaller batches (or micro-batches). However, at any given time, you're only forward and backward propagating a micro-batch. This means you only store activations for the micro-batch, not the full accumulated batch. As a result, you can simulate the effect of a larger batch size without the memory cost of storing activations for a large batch.

**Example 1:**
Micro batch size: 3
Gradient accumulation steps: 2
Number of GPUs: 3
Total batch size = 3 * 2 * 3 = 18

```
| GPU 1          | GPU 2          | GPU 3          |
|----------------|----------------|----------------|
| S1, S2, S3     | S4, S5, S6     | S7, S8, S9     |
| e1, e2, e3     | e4, e5, e6     | e7, e8, e9     |
|----------------|----------------|----------------|
| → (accumulate) | → (accumulate) | → (accumulate) |
|----------------|----------------|----------------|
| S10, S11, S12  | S13, S14, S15  | S16, S17, S18  |
| e10, e11, e12  | e13, e14, e15  | e16, e17, e18  |
|----------------|----------------|----------------|
| → (apply)      | → (apply)      | → (apply)      |

Accumulated gradient for the weight w1 after the second iteration (considering all GPUs):
Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6 + e7 + e8 + e9 + e10 + e11 + e12 + e13 + e14 + e15 + e16 + e17 + e18

Weight update for w1:
w1_new = w1_old - learning rate x (Total gradient for w1 / 18)
```

**Example 2:**
Micro batch size: 2
Gradient accumulation steps: 1
Number of GPUs: 3
Total batch size = 2 * 1 * 3 = 6

```
| GPU 1     | GPU 2     | GPU 3     |
|-----------|-----------|-----------|
| S1, S2    | S3, S4    | S5, S6    |
| e1, e2    | e3, e4    | e5, e6    |
|-----------|-----------|-----------|
| → (apply) | → (apply) | → (apply) |

Accumulated gradient for the weight w1 (considering all GPUs):
Total gradient for w1 = e1 + e2 + e3 + e4 + e5 + e6

Weight update for w1:
w1_new = w1_old - learning rate × (Total gradient for w1 / 6)
```


================================================
FILE: docs/checkpoint_saving.qmd
================================================
---
title: "Checkpoint Saving"
format:
  html:
    toc: true
    toc-depth: 2
    number-sections: true
execute:
  enabled: false
---

## Overview

Axolotl supports on-demand checkpoint saving during training. You can trigger checkpoints via file-based triggers (for programmatic control) or Control+C (for interactive use).

## File-Based Checkpoint Trigger

### Configuration

Enable in your config:

```yaml
dynamic_checkpoint:
  enabled: true
  check_interval: 100  # Optional: check every N steps (default: 100)
  trigger_file_path: "axolotl_checkpoint.save"  # Optional: custom filename
```

**Options:**
- `enabled`: `true` to enable (required)
- `check_interval`: Steps between file checks. Default: 100. Lower = faster response, higher I/O overhead.
- `trigger_file_path`: Custom trigger filename. Default: `axolotl_checkpoint.save`

### How It Works

1. Rank 0 checks for trigger file every `check_interval` steps in `output_dir`
2. When detected, file is deleted and checkpoint is saved
3. In distributed training, rank 0 broadcasts to synchronize all ranks

### Usage

**Command line:**
```bash
touch /path/to/output_dir/axolotl_checkpoint.save
```

**Programmatic:**
```python
from pathlib import Path
Path("/path/to/output_dir/axolotl_checkpoint.save").touch()
```

Checkpoint saves within the next `check_interval` steps. The trigger file is auto-deleted after detection, so you can create it multiple times.

**Custom filename:**
```yaml
dynamic_checkpoint:
  enabled: true
  trigger_file_path: "my_trigger.save"
```
```bash
touch /path/to/output_dir/my_trigger.save
```

## Control+C (SIGINT) Checkpoint

Pressing `Ctrl+C` during training saves the model state and exits gracefully. **Note:** This saves only the model weights, not optimizer state. For resumable checkpoints, use the file-based trigger.

## Best Practices

- **Check interval**: Lower values (10-50) for fast training, default 100 for slower training
- **Distributed training**: Create trigger file once; rank 0 handles synchronization
- **Resume**: Dynamic checkpoints can be resumed like regular checkpoints via `resume_from_checkpoint`

## Example

```yaml
output_dir: ./outputs/lora-out
save_steps: 500  # Scheduled checkpoints

dynamic_checkpoint:
  enabled: true
  check_interval: 50
```

This enables scheduled checkpoints every 500 steps plus on-demand saves via file trigger (checked every 50 steps).


================================================
FILE: docs/cli.qmd
================================================
---
title: "Command Line Interface (CLI)"
format:
  html:
    toc: true
    toc-expand: 2
    toc-depth: 3
execute:
  enabled: false
---

The Axolotl CLI provides a streamlined interface for training and fine-tuning large language models. This guide covers
the CLI commands, their usage, and common examples.


## Basic Commands

All Axolotl commands follow this general structure:

```bash
axolotl <command> [config.yml] [options]
```

The config file can be local or a URL to a raw YAML file.

### Launcher Arguments

For commands that support multi-GPU (`train`, `evaluate`, ...), you can pass launcher-specific arguments using the `--` separator:

```bash
# Pass torchrun arguments
axolotl train config.yml --launcher torchrun -- --nproc_per_node=2 --nnodes=1

# Pass accelerate arguments
axolotl train config.yml --launcher accelerate -- --config_file=accelerate_config.yml --num_processes=4
```

Arguments after `--` are passed directly to the launcher (torchrun, accelerate launch, etc.).

## Command Reference

### fetch

Downloads example configurations and deepspeed configs to your local machine.

```bash
# Get example YAML files
axolotl fetch examples

# Get deepspeed config files
axolotl fetch deepspeed_configs

# Specify custom destination
axolotl fetch examples --dest path/to/folder
```

### preprocess

Preprocesses and tokenizes your dataset before training. This is recommended for large datasets.

```bash
# Basic preprocessing
axolotl preprocess config.yml

# Preprocessing with one GPU
CUDA_VISIBLE_DEVICES="0" axolotl preprocess config.yml

# Debug mode to see processed examples
axolotl preprocess config.yml --debug

# Debug with limited examples
axolotl preprocess config.yml --debug --debug-num-examples 5
```

Configuration options:

```yaml
dataset_prepared_path: Local folder for saving preprocessed data
push_dataset_to_hub: HuggingFace repo to push preprocessed data (optional)
```

### train

Trains or fine-tunes a model using the configuration specified in your YAML file.

```bash
# Basic training
axolotl train config.yml

# Train and set/override specific options
axolotl train config.yml \
    --learning-rate 1e-4 \
    --micro-batch-size 2 \
    --num-epochs 3

# Training without accelerate
axolotl train config.yml --launcher python

# Pass launcher-specific arguments using -- separator
axolotl train config.yml --launcher torchrun -- --nproc_per_node=2 --nnodes=1
axolotl train config.yml --launcher accelerate -- --config_file=accelerate_config.yml

# Resume training from checkpoint
axolotl train config.yml --resume-from-checkpoint path/to/checkpoint
```

It is possible to run sweeps over multiple hyperparameters by passing in a sweeps config.

```bash
# Basic training with sweeps
axolotl train config.yml --sweep path/to/sweep.yaml
```

Example sweep config:
```yaml
_:
  # This section is for dependent variables we need to fix
  - load_in_8bit: false
    load_in_4bit: false
    adapter: lora
  - load_in_8bit: true
    load_in_4bit: false
    adapter: lora

# These are independent variables
learning_rate: [0.0003, 0.0006]
lora_r:
  - 16
  - 32
lora_alpha:
  - 16
  - 32
  - 64
```


### inference

Runs inference using your trained model in either CLI or Gradio interface mode.

```bash
# CLI inference with LoRA
axolotl inference config.yml --lora-model-dir="./outputs/lora-out"

# CLI inference with full model
axolotl inference config.yml --base-model="./completed-model"

# Gradio web interface
axolotl inference config.yml --gradio \
    --lora-model-dir="./outputs/lora-out"

# Inference with input from file
cat prompt.txt | axolotl inference config.yml \
    --base-model="./completed-model"
```

### merge-lora

Merges trained LoRA adapters into the base model.

```bash
# Basic merge
axolotl merge-lora config.yml

# Specify LoRA directory (usually used with checkpoints)
axolotl merge-lora config.yml --lora-model-dir="./lora-output/checkpoint-100"

# Merge using CPU (if out of GPU memory)
CUDA_VISIBLE_DEVICES="" axolotl merge-lora config.yml
```

Configuration options:

```yaml
gpu_memory_limit: Limit GPU memory usage
lora_on_cpu: Load LoRA weights on CPU
```

### merge-sharded-fsdp-weights

Merges sharded FSDP model checkpoints into a single combined checkpoint.

```bash
# Basic merge
axolotl merge-sharded-fsdp-weights config.yml
```

### evaluate

Evaluates a model's performance (loss etc) on the train and eval datasets.

```bash
# Basic evaluation
axolotl evaluate config.yml

# Evaluation with launcher arguments
axolotl evaluate config.yml --launcher torchrun -- --nproc_per_node=2
```

### lm-eval

Runs LM Evaluation Harness on your model.

```bash
# Basic evaluation
axolotl lm-eval config.yml
```

Configuration options:

```yaml
lm_eval_model: # model to evaluate (local or hf path)

# List of tasks to evaluate
lm_eval_tasks:
  - arc_challenge
  - hellaswag
lm_eval_batch_size: # Batch size for evaluation
output_dir: # Directory to save evaluation results
```

See [LM Eval Harness integration docs](https://docs.axolotl.ai/docs/custom_integrations.html#language-model-evaluation-harness-lm-eval) for full configuration details.

### delinearize-llama4

Delinearizes a Llama 4 linearized model into a regular HuggingFace Llama 4 model. This only works with the non-quantized linearized model.

```bash
axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
```

This would be necessary to use with other frameworks. If you have an adapter, merge it with the non-quantized linearized model before delinearizing.

### quantize

Quantizes a model using the quantization configuration specified in your YAML file.

```bash
axolotl quantize config.yml
```

See [Quantization](./quantize.qmd) for more details.


## Legacy CLI Usage

While the new Click-based CLI is preferred, Axolotl still supports the legacy module-based CLI:

```bash
# Preprocess
python -m axolotl.cli.preprocess config.yml

# Train
accelerate launch -m axolotl.cli.train config.yml

# Inference
accelerate launch -m axolotl.cli.inference config.yml \
    --lora_model_dir="./outputs/lora-out"

# Gradio interface
accelerate launch -m axolotl.cli.inference config.yml \
    --lora_model_dir="./outputs/lora-out" --gradio
```

::: {.callout-important}
When overriding CLI parameters in the legacy CLI, use same notation as in yaml file (e.g., `--lora_model_dir`).

**Note:** This differs from the new Click-based CLI, which uses dash notation (e.g., `--lora-model-dir`). Keep this in mind if you're referencing newer documentation or switching between CLI versions.
:::

## Remote Compute with Modal Cloud

Axolotl supports running training and inference workloads on Modal cloud infrastructure. This is configured using a
cloud YAML file alongside your regular Axolotl config.

### Cloud Configuration

Create a cloud config YAML with your Modal settings:

```yaml
# cloud_config.yml
provider: modal
gpu: a100       # Supported: l40s, a100-40gb, a100-80gb, a10g, h100, t4, l4
gpu_count: 1    # Number of GPUs to use
timeout: 86400  # Maximum runtime in seconds (24 hours)
branch: main    # Git branch to use (optional)

volumes:        # Persistent storage volumes
  - name: axolotl-cache
    mount: /workspace/cache
  - name: axolotl-data
    mount: /workspace/data
  - name: axolotl-artifacts
    mount: /workspace/artifacts

secrets:        # Secrets to inject
  - WANDB_API_KEY
  - HF_TOKEN
```

### Running on Modal Cloud

Commands that support the --cloud flag:

```bash
# Preprocess on cloud
axolotl preprocess config.yml --cloud cloud_config.yml

# Train on cloud
axolotl train config.yml --cloud cloud_config.yml

# Run lm-eval on cloud
axolotl lm-eval config.yml --cloud cloud_config.yml
```

### Cloud Configuration Options

```yaml
provider:    # compute provider, currently only `modal` is supported
gpu:         # GPU type to use
gpu_count:   # Number of GPUs (default: 1)
memory:      # RAM in GB (default: 128)
timeout:     # Maximum runtime in seconds
timeout_preprocess: # Preprocessing timeout
branch:      # Git branch to use
docker_tag:  # Custom Docker image tag
volumes:     # List of persistent storage volumes

# Environment variables to pass. Can be specified in two ways:
# 1. As a string: Will load the value from the host computer's environment variables
# 2. As a key-value pair: Will use the specified value directly
# Example:
# env:
#   - CUSTOM_VAR  # Loads from host's $CUSTOM_VAR
#   - {CUSTOM_VAR: "value"}  # Uses "value" directly
env:

# Secrets to inject. Same input format as `env` but for sensitive data.
secrets:
  # - HF_TOKEN
  # - WANDB_API_KEY
```


================================================
FILE: docs/custom_integrations.qmd
================================================
---
title: Custom Integrations
toc: true
toc-depth: 3
---

```{python}
#| echo: false

import os
import re

def process_readme(integration_name):
    try:
        path = f'../src/axolotl/integrations/{integration_name}/README.md'
        with open(path, 'r') as f:
            txt = f.read()
            # Remove h1 headings
            txt = re.sub(r'^# .*\n?', '', txt, flags=re.MULTILINE)
            # Convert h2 to h3
            txt = re.sub(r'^## ', '### ', txt, flags=re.MULTILINE)
            return txt
    except FileNotFoundError:
        return None

def print_section(name, folder_name):
    output = f"\n## {name}\n"
    content = process_readme(folder_name)
    if content:
        output += content
    output += f"\nPlease see reference [here](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations/{folder_name})\n"
    return output
```

```{python}
#| output: asis
#| echo: false

# Introduction text
print("""
Axolotl adds custom features through `integrations`. They are located within the `src/axolotl/integrations` directory.

To enable them, please check the respective documentations.
""")

# Sections
sections = [
    ("Cut Cross Entropy", "cut_cross_entropy"),
    ("Grokfast", "grokfast"),
    ("Knowledge Distillation (KD)", "kd"),
    ("Liger Kernels", "liger"),
    ("Language Model Evaluation Harness (LM Eval)", "lm_eval"),
    ("Spectrum", "spectrum"),
    ("LLMCompressor", "llm_compressor")
]

for folder_name in os.listdir("../src/axolotl/integrations/"):
    if folder_name in [path for name, path in sections]:
        # skip if already in sections
        continue
    if os.path.exists(f"../src/axolotl/integrations/{folder_name}/README.md"):
        # grab the first heading in README.md as the section name
        with open(f"../src/axolotl/integrations/{folder_name}/README.md", "r") as f:
            txt = f.read()
            matches = re.search(r'^# (.*)\n?', txt, flags=re.MULTILINE)
            if matches:
                name = matches.group(1)
            else:
                continue
            sections.append((name, folder_name))

# sort sections by name
sections = sorted(sections, key=lambda x: x[0])

for section_name, folder_name in sections:
    print(print_section(section_name, folder_name))
```

## Adding a new integration

Plugins can be used to customize the behavior of the training pipeline through [hooks](https://en.wikipedia.org/wiki/Hooking). See [`axolotl.integrations.BasePlugin`](https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/integrations/base.py) for the possible hooks.

To add a new integration, please follow these steps:

1. Create a new folder in the `src/axolotl/integrations` directory.
2. Add any relevant files (`LICENSE`, `README.md`, `ACKNOWLEDGEMENTS.md`, etc.) to the new folder.
3. Add `__init__.py` and `args.py` files to the new folder.
  - `__init__.py` should import the integration and hook into the appropriate functions.
  - `args.py` should define the arguments for the integration.
4. (If applicable) Add CPU tests under `tests/integrations` or GPU tests under `tests/e2e/integrations`.

::: {.callout-tip}

See [src/axolotl/integrations/cut_cross_entropy](https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations/cut_cross_entropy) for a minimal integration example.

:::

::: {.callout-warning}

If you could not load your integration, please ensure you are pip installing in editable mode.

```bash
pip install -e .
```

and correctly spelled the integration name in the config file.

```yaml
plugins:
  - axolotl.integrations.your_integration_name.YourIntegrationPlugin
```

:::

::: {.callout-note}

It is not necessary to place your integration in the `integrations` folder. It can be in any location, so long as it's installed in a package in your python env.

See this repo for an example: [https://github.com/axolotl-ai-cloud/diff-transformer](https://github.com/axolotl-ai-cloud/diff-transformer)

:::


================================================
FILE: docs/dataset-formats/conversation.qmd
================================================
---
title: Conversation
description: Conversation format for supervised fine-tuning.
order: 3
---

## chat_template

Chat Template strategy uses a jinja2 template that converts a list of messages into a prompt. Support using tokenizer's template, a supported template, or custom jinja2.

```{.json filename="data.jsonl"}
{"messages": [{"role": "...", "content": "..."}, {"role": "...", "content": "..."}, ...]}
```

See [configs](../config-reference.qmd) for full configs and supported templates.

### Migrating from sharegpt

Most configs can be adapted as follows:

```yaml
# old
chat_template: chatml
datasets:
  - path: ...
    type: sharegpt
    conversation: chatml

# new (if using tokenizer's chat_template)
datasets:
  - path: ...
    type: chat_template

    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

# new (if setting a new chat_template like chatml, gemma, etc)
chat_template: chatml
datasets:
  - path: ...
    type: chat_template

    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
```

We recommend checking the below examples for other usecases.

### Examples

#### Training on last message

(Legacy) Using the default chat template in the tokenizer_config.json on OpenAI messages format, training on only last message.

```yaml
datasets:
  - path: ...
    type: chat_template
    roles_to_train:
    train_on_eos:
```

::: {.callout-tip}
If you receive an error like "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null.", it means the tokenizer does not have a default `chat_template`. Follow the examples below instead to set a custom `chat_template`.
:::

#### Overriding default chat template

Using the `gemma` chat template to override the tokenizer_config.json's chat template on OpenAI messages format, training on all assistant messages.

```yaml
chat_template: gemma # this overwrites the tokenizer's chat_template
datasets:
  - path: ...
    type: chat_template
    roles_to_train: ["assistant"]  # default value
```

::: {.callout-note}
If you want to use built-in chat_template, use `chat_template: tokenizer_default` (this is set by default).
:::

#### Using default chat template with fallback

Using the tokenizer_config.json's chat template or `chatml` as fallback if the former's chat template does not exist, on OpenAI messages format, training on all assistant messages.

```yaml
chat_template: tokenizer_default_fallback_chatml # this overwrites the tokenizer's chat_template
datasets:
  - path: ...
    type: chat_template
```

#### Custom Jinja template

Using a custom jinja template on OpenAI messages format, training on all assistant messages.

```yaml
# chat_template: jinja # `jinja` will be implied if the `chat_template_jinja` is set and this field is empty
chat_template_jinja: "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}"

datasets:
  - path: ...
    type: chat_template
```

::: {.callout-important}
Please make sure that your `tokenizer.eos_token` is same as EOS (End-of-Sequence) token in template. Otherwise, set `eos_token` under `special_tokens: `.
:::

#### Using template with different token for EOT and EOS

- If you are using a template that has a different EOT (End-of-Turn) token from EOS token or multiple EOT tokens (like Mistral V7 Tekken), set the `eot_tokens: ` config. The handling of EOT tokens follows `train_on_eos: ` which defaults to turn.

```yaml
eot_tokens:
  - "[/INST]"
  # - "[/SYSTEM_PROMPT]"

datasets:
  - path: ...
    type: chat_template

    # optional
    train_on_eot: turn  # defaults read from train_on_eos (which defaults to turn)
```

::: {.callout-tip}
See [config documentation](../config-reference.qmd) for detailed explanations of "turn", "last", and "all" options for training on tokens.
:::

::: {.callout-note}
Using `eot_tokens` requires each token that exists in `chat_template` to be a single token in the tokenizer. Otherwise, the tokenizer will split the token and cause unexpected behavior.

You can add those tokens as new tokens under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `. See [config](../config-reference.qmd) for more details.
:::

- Continuing from the previous example, if you want to train on all EOT token trainable turns but only last EOS token, set `train_on_eos: last`.

```yaml
eot_tokens:
  - "[/INST]"
  # ...

datasets:
  - path: ...
    type: chat_template

    train_on_eos: last
    train_on_eot: turn
```

::: {.callout-tip}
If EOS token only appears at the end of a prompt, `train_on_eos: last` is equivalent to `train_on_eos: turn`. Therefore, generally, you can leave them to their defaults and omit them.
:::


#### Using tool use

Instead of passing `tools` via the system prompt, an alternative method would be to have the `tools` in a separate column and loaded via `chat_template` to let the template dynamically build it.

```json
{
    "tools": [
        {
            "type": "...",
            "function": {
                "name": "...",
                "description": "...",
                "parameters": {
                    "type": "...",
                    "properties": {
                        // ...
                    },
                    "required": ["..."],
                },
            },
        },
    ],
    "messages": [
        // ...
        {
            "role": "assistant", // call the function via assistant
            "tool_calls": [
                {
                    "id": "...",  // required only for mistral
                    "type": "function",
                    "function": {
                        "name": "...",
                        "arguments": {
                            "...": "...",
                        }
                    }
                }
            ]
        },
        {
            "role": "tool",
            "tool_call_id": "...",  // required only for mistral
            "name": "...",
            "content": "..."
        },
    ],
}
```

::: {.callout-note}
Tools need to follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).
:::

::: {.callout-warning}
If you have tool arguments with same name but different dtypes (like `"time": string` and `"time": number`), please save `arguments: ` as JSON string to prevent `datasets` from having casting issues.

```
"arguments": "{\"...\": \"...\"}"
```

The same is applicable for tool parameters.

```
"parameters": "{\"...\": \"...\"}"
```

:::

Example config for Llama4:
```yaml
chat_template: llama4
datasets:
  - path: Nanobit/text-tools-2k-test
    type: chat_template
    # field_tools: tools # default is `tools`
```

::: {.callout-tip}
Look into the `chat_template` you are using to see if it supports `tools` and what the expected role is for the tool answer. In the example above, the tool answer is expected to be in the `tool` or `ipython` role for `llama4` template.
:::


#### Using fine-grained control over token masking

(Advanced) Using fine-grained control over tokens and turns to train in a conversation

For a data sample that looks like:

```{.json filename="data.jsonl"}
{
  "conversations": [
    {"from": "system", "value": "You are an AI assistant.", "train": false},
    {"from": "human", "value": "Hello", "train": false},
    {"from": "assistant", "value": "Hello", "train": true},
    {"from": "human", "value": "How are you?", "train": true},
    {
      "from": "assistant",
      "value": "I'm doing very well, thank you!",
      "train_detail": [
        {"begin_offset": 0, "end_offset": 8, "train": false},
        {"begin_offset": 9, "end_offset": 18, "train": true},
        {"begin_offset": 19, "end_offset": 30, "train": false},
      ],
    },
    {
        "from": "human",
        "value": "I'm doing very well, thank you!",
        "train": true,
    },
    {"from": "assistant", "value": "Hi there!", "train": true}
  ]
}
```

The configuration would look like:

```yaml
datasets:
  - path: ...
    type: chat_template
    chat_template: tokenizer_default
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
    roles_to_train: []
    train_on_eos: turn
    message_field_training: train
    message_field_training_detail: train_detail
```

::: {.callout-tip}
It is not necessary to set both `message_field_training` and `message_field_training_detail` at once.
:::

#### Reasoning split

(For Qwen3 template only) Enable reasoning split, where the reasoning is split from the content and passed as a separate field into the template.

```yaml
datasets:
  - path: ...
    type: chat_template
    chat_template: qwen3
    split_thinking: true
```

For example, a content can look like:

```json
{
  "content": "<think>Some thinking outputs</think>Output after thinking."
}
```

After split, it will look like:

```json
{
  "reasoning_content": "Some thinking outputs",
  "content": "Output after thinking..."
}
```


## sharegpt

::: {.callout-important}
ShareGPT is deprecated!. Please see [chat_template](#chat_template) section.
:::

## pygmalion

```{.json filename="data.jsonl"}
{"conversations": [{"role": "...", "value": "..."}]}
```


================================================
FILE: docs/dataset-formats/index.qmd
================================================
---
title: Dataset Formats
description: Guide to Dataset Formats in Axolotl
back-to-top-navigation: true
toc: true
toc-depth: 5
---


Axolotl is a training framework that aims to make the process convenient yet flexible to users by simply passing a config yaml file.

As there are a lot of available options in Axolotl, this guide aims to provide an simplify the user experience to choosing the proper choice.

Axolotl supports 3 kinds of training methods: pre-training, supervised fine-tuning, and preference-based post-training (e.g. DPO, ORPO, PRMs). Each method has their own dataset format which are described below.

::: {.callout-tip}

This guide will mainly use JSONL as an introduction. Please refer to the [dataset loading docs](../dataset_loading.qmd) to understand how to load datasets from other sources.

For `pretraining_dataset:` specifically, please refer to the [Pre-training section](#pre-training).
:::

## Pre-training

When aiming to train on large corpora of text datasets, pre-training is your go-to choice. Due to the size of these datasets, downloading the entire-datasets before beginning training would be prohibitively time-consuming. Axolotl supports [streaming](https://huggingface.co/docs/datasets/en/stream) to only load batches into memory at a time.

A sample format for a pre-training dataset is as follows:

```json
{"text": "first row"}
{"text": "second row"}
...
```

It is typically recommended to save your dataset as `.jsonl` due to its flexibility and simplicity.

Axolotl supports loading from a Hugging Face hub repo or from local files.

### Pre-training from Hugging Face hub datasets

As an example, to train using a Hugging Face dataset `hf_org/name`, you can pass the following config:

```yaml
pretraining_dataset: hf_org/name
```

### Pre-training from local dataset files

Given a few corpus files: `A.jsonl`, `B.jsonl`, and `C.jsonl`, your config will look like the below:

```yaml
pretraining_dataset:
  - path: json
    data_files:
      - A.jsonl
      - B.jsonl
      - C.jsonl
```

While we recommend `.jsonl`, you can also use the other formats (`csv`, `parquet`, `arrow`, `SQL`, `Webdataset`) that are supported by [`Dataset.load_dataset`](https://huggingface.co/docs/datasets/loading#local-and-remote-files)

### Pre-training without streaming

In the case that the dataset is small and can be loaded entirely into memory, another approach to running pre-training is to use the `completion` format. This would mean that the entire dataset is pre-tokenized instead of on-demand in streaming.

One benefit of this is that the tokenization can be performed separately on a CPU-only machine, and then transferred to a GPU machine for training to save costs.

From Hugging Face:

```yaml
datasets:
  - path: hf_org/name
    type: completion
```

From local files:

```yaml
datasets:
  - path: A.jsonl
    type: completion

  - path: B.jsonl
    type: completion
```

::: {.callout-important}
For `completion` only, Axolotl would split texts if it exceeds the context length into multiple smaller prompts. If you are interested in having this for `pretraining_dataset` too, please let us know or help make a PR!
:::

### Pre-training dataset configuration tips

#### Setting max_steps

When using streaming for large datasets, Axolotl does not know in advance how large the dataset is and does not know when to stop.

Therefore, it is necessary to set `max_steps: int` in your config for pre-training to run, so that Axolotl knows when to stop training.

One step is equal to `sequence_len * micro_batch_size * gradient_accumulation_steps * total_num_gpus` tokens.

#### Group_by_length

It is recommended to leave this off if downloading from Hugging Face hub as it would download the entire dataset which can be very large.

### Reference

Please see docs [here](pretraining.qmd).

## Supervised fine-tuning (SFT)

Supervised fine-tuning is the process of training models to respond to an instruction or chat input.

As there are a wide variety of dataset formats, Axolotl tries to support a majority of the formats available in public datasets.

Axolotl provides four approaches for loading datasets, however, it's easier to work backwards from the dataset you have available to figure out which approach to use.

A flow chart is as follows:

1. Do you already have the dataset tokenized? If yes, check [Pre-Tokenized Dataset](#pre-tokenized-dataset).

2. Do you want to format the dataset yourself and manually choose each section to mask? If yes, check [Template Free Dataset](#template-free-dataset)

3. Is your dataset in a "conversation" format, containing a `list[messages]`? If yes, check [Conversation Dataset](#conversation-dataset)

4. Is your dataset in an "instruct" format, containing `{ instruction, response }`? If yes, check [Instruction Dataset](#instruction-dataset)

If you went through the flow chart and did not find one that matches, it is recommended to preprocess your dataset into one of the above or create a thread on Github Discussion.

::: {.callout-tip}
You can mix and match within each approach or across approaches to train a model on a variety of datasets.
:::

### Pre-Tokenized Dataset

We suggest this approach when you want to bring your own tokenized dataset.

Axolotl expects the dataset to have three keys:

- `input_ids`: from tokenizing formatted prompt
- `attention_mask`: for masking padding. If you don't add padding, it would be equal to `len(input_ids) * [1]`
- `labels`: this is the same as `input_ids`, however, if you want to mask certain tokens, you would set those indices to `-100`.

::: {.callout-tip}
Make sure to add BOS/EOS tokens to your prompt and mask it appropriately.
:::

A config for this would look like:

```yaml
datasets:
  - path: A.jsonl
    type:
```

::: {.callout-note}
`type: ` is empty!
:::

Reference: [Pre-Tokenized Dataset Documentation](tokenized.qmd).

### Template Free Dataset

We reccomend this approach when you want granular control over the prompt formatting, special tokens, and masking, whilst letting Axolotl handle the tokenization. This is very useful if your dataset has unique prompts that differ across samples and where one single general template wouldn't suffice.

In the example below, you could see that there is no proper structure. At the same time, it's very flexible as there are no constraints on how your prompt can look.

```json
{
    "segments": [
        {
            "label": true,
            "text": "<s>Hello\n"
        },
        {
            "label": true,
            "text": "hi there!. "
        },
        {
            "label": false,
            "text": "goodbye "
        },
        {
            "label": true,
            "text": "farewell</s>"
        }
    ]
}
```

Each prompt must be have a key called `segments` which is a list of `{ text, label }`.

```yaml
datasets:
  - path: A.jsonl
    type: input_output
```

Reference: [Template Free Documentation](template_free.qmd).

### Conversation Dataset

`conversation` messages are a list of messages which usually contain a `role` and `content` key.

::: {.callout-tip}
Fun fact: Axolotl synonymously refers to "chat" messages as `conversation` messages due to how FastChat initially used this term to build a widely used [fastchat conversation](https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py) method for formatting chat messages prior to the creation of `chat_templates`.
:::

#### What are `chat_templates`?

The current most popular and convenient method for inference is to use `chat_templates` for formatting prompts. Axolotl supports using `chat_templates` for training to ensure that the model performs in the same environment as in inference.

Here's a quick rundown on `chat_template`: A `chat_template` is a Jinja2 template which formats a list of messages into a prompt.

An example of a prompt formatted into a popular template called ChatML can be seen below:

Single prompt (pretty-printed):
```json
{
    "messages": [
        {
            "role": "user",
            "content": "Hi"
        },
        {
            "role": "assistant",
            "content": "How can I help you?"
        },
        {
            "role": "user",
            "content": "Can you add 3+5?"
        },
        {
            "role": "assistant",
            "content": "The answer is 8."
        }
    ]
}
```

The ChatML template is as follows:
```jinja2
{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}
```

The above prompt formatted into this template will result in:

```
<|im_start|>user
Hi<|im_end|>
<|im_start|>assistant
How can I help you?<|im_end|>
<|im_start|>user
Can you add 3+5?<|im_end|>
<|im_start|>assistant
The answer is 8.<|im_end|>
```

By using delimiters (`<|im_start|>` and `<|im_end|>`), a prompt separates different speakers which helps the model identify which portion belongs to whom.

#### Common Conversation Dataset formats

Older conversation datasets with the following format are colloquially called `sharegpt` datasets.

```json
{"conversations": [{"from": "...", "value": "..."}]}
```

Newer conversation datasets usually follow the OpenAI format.

```json
{"messages": [{"role": "...", "content": "..."}]}
```

Axolotl supports both as well as allowing customization of any kind of key.

#### Chat Template Usage

To properly use this method, it is important to identify three things:

1. Which `chat_template` would you use?

2. What are the keys in your dataset, and what are the possible roles? For example, in OpenAI format, the keys would be `messages`, `role`, and `content`, respectively, whereas the possible roles are `system`, `user`, and `assistant`.

3. What do you want to mask? For instance, only assistant messages, only last message, or nothing.

##### Choosing a `chat_template`

There are a lot of `chat_templates` out there. Axolotl supports the common ones: [supported chat templates](https://github.com/axolotl-ai-cloud/axolotl/blob/860609392184cf62a7e0ca676658b170e059ce6c/src/axolotl/utils/chat_templates.py#L17). For example, to use ChatML, it would be `chat_template: chatml`.

However, it is also possible to use the already configured template within the tokenizer by specifying `chat_template: tokenizer_default`. If you want a fallback (in case some tokenizer does not have it pre-configured), you can do `chat_template: tokenizer_default_fallback_chatml` to fallback to the ChatML template if a tokenizer template was not found.

One last but powerful approach is to bring your own template. This can be set via:

```yaml
chat_template_jinja: # your template
```

##### Setting `chat_template` dataset keys

We currently default to OpenAI format for dataset keys, so if that's your current dataset format, there's nothing to do here.

If your dataset format is different, here are the keys you should check (with their defaults):

```yaml
datasets:
    ...
    field_messages: messages  # this should point to the key containing the list of conversations
    message_property_mappings:  # this is a mapping from keys in your dataset to keys in chat_template
      role: role
      content: content
```

In some `chat_templates` (e.g. [Gemma](https://huggingface.co/google/gemma-2b-it/blob/main/tokenizer_config.json#L1507)), the roles are hardcoded to `user` and `assistant`. Consequently, you may find it necessary to map the roles in your dataset to these above. We currently have some defaults that should work for common datasets, but if you get a `KeyError`, it would be necessary to add mapping for your roles. Here is an example of how it would look like:

```yaml
datasets:
    ...
    roles:
      assistant:
        - gpt
        - model
      user:
        - human
```

In the example above, all `gpt` and `model` values are converted to `assistant`. All `human` values are converted to `user.`

##### Handling masking

The common use case for `chat_template` is for chat messages, therefore, it is common to mask all non-assistant messages. Assistant messages refer to the bot messages that you want the model to learn on.

To train on all `assistant` messages, you would set the following configs.

```yaml
datasets:
    ...
    roles_to_train: ["assistant"]
    train_on_eos: "turn"
```

The `train_on_eos` config means that it would mask all EOS tokens for turns that aren't assistant-turns. The other options are: `all` and `last` to choose which EOS to train on.

Perhaps, you want to train on `assistant` and `narrator` roles, you can simply add `narrator` to the list of `roles_to_train`. You would also need to add it to the mapping of `roles` above.

```yaml
datasets:
    ...
    roles_to_train: ["assistant", "narrator"]
    roles:
      assistant:
        - gpt
        - model
      user:
        - human
      narrator: ["narrator"]
```

::: {.callout-tip}
As chat_templates may use hardcoded EOS/EOT tokens that are different from the tokenizer's EOS, it is highly recommended to set them. For example, `ChatML` uses `<|im_end|>` to end turns.

```yaml
special_tokens:
  eos_token: <|im_end|>
```

:::

##### Applying `chat_template`

Once all the above steps are completed, you could combine all these configs together to form a bespoke configuration for your custom dataset.

```yaml
datasets:
  - path: A.jsonl
    type: chat_template

    # step 1
    chat_template: chatml

    # step 2
    field_messages: messages
    message_property_mappings:
      role: role
      content: content

    roles:
      assistant:
        - gpt
        - model
        - assistant
      user:
        - human
        - user

    # step 3
    roles_to_train: ["assistant"]
    train_on_eos: "turn"

special_tokens:
  eos_token: <|im_end|>
```

If this config were to be applied to the sample dataset above, the output would look as such (which can be retrieved via `axolotl preprocess config.yaml --debug`):

```
<|im_start|>(-100, 128256) user(-100, 882)
(-100, 198) Hi(-100, 13347) <|im_end|>(-100, 128257)
(-100, 198) <|im_start|>(-100, 128256) assistant(-100, 78191)
(-100, 198) How(4438, 4438)  can(649, 649)  I(358, 358)  help(1520, 1520)  you(499, 499) ?(30, 30) <|im_end|>(128257, 128257)
(-100, 198) <|im_start|>(-100, 128256) user(-100, 882)
(-100, 198) Can(-100, 6854)  you(-100, 499)  add(-100, 923)  (-100, 220) 3(-100, 18) +(-100, 10) 5(-100, 20) ?(-100, 30) <|im_end|>(-100, 128257)
(-100, 198) <|im_start|>(-100, 128256) assistant(-100, 78191)
(-100, 198) The(791, 791)  answer(4320, 4320)  is(374, 374)  (220, 220) 8(23, 23) .(13, 13) <|im_end|>(128257, 128257)
(-100, 198)
```

The first number refers to the label, the second refers to the `token_id`. For example, `-100` labels appear on non-assistant portions, meaning that they are masked during. For assistant portions, the label is the same as the `token_id`.

::: {.callout-note}

If during `preprocess`, there are a lot of warnings of `Could not find content __ boundary`, please check the FAQ section for [chat_templates](../faq.qmd#chat-templates).

:::

#### Reference

Please see docs [here](conversation.qmd).

### Instruction Dataset

Instruction datasets are used to train instruction-following models and comprise a prompt, containing an instruction, and a single response. In contrast to chat datasets which may be multi-turn, instruct datasets are typically single-turn.

An example is of a common format called Alpaca:
```json
{"instruction": "...", "input": "...", "output": "..."}
```

Using those keys, a prompt can be built based on it.
```
Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:
{output}
```

This can be configured as such:
```yaml
datasets:
  - path: A.jsonl
    type: alpaca
```

Axolotl supports many kinds of instruction dataset. All of them can be found in the [Instruction Dataset Documentation](inst_tune.qmd) with their respective type and sample row format.

#### Custom Instruct Prompt Format

Due to the myriad possibilities of instruction formats, Axolotl allows customizing your own instruction format without having to dive into the code directly.

In the example below, a sample row is used to output in `mistral_v1` format.
```json
{"input": "...", "output": "..."}
```

```yaml
datasets:
  - path: repo
    type:
      system_prompt: ""

      field_system:
      field_instruction: input
      field_input:
      field_output: output

      # multi-line example with input
      format: |-
        [INST] {instruction} {input} [/INST]

      # single-line example without input
      no_input_format: "[INST] {instruction} [/INST]"
```

The config sets that the `field_instruction` is actually named `input`, and the `field_input` is empty as we don't have an `input` in this sample. Generally, `instruction` can be thought as the question to the model, and `input` as the additional information with `output` being the response. It is not necessary to have an `input` nor `system`. In the end, the most important part is to understand what format you want it to look like and how you can customize this to your use case.

Reference: [Custom Instruct Prompt Format Documentation](inst_tune.qmd#how-to-add-custom-prompt-format).

## Reinforcement Learning from Human Feedback (RLHF)

As there are multiple RLHF methods with their own dataset requirements. Please see [RLHF documentation](../rlhf.qmd) for more detail.


================================================
FILE: docs/dataset-formats/inst_tune.qmd
================================================
---
title: Instruction Tuning
description: Instruction tuning formats for supervised fine-tuning.
order: 2
---

## alpaca

instruction; input(optional)

```{.json filename="data.jsonl"}
{"instruction": "...", "input": "...", "output": "..."}
```

## jeopardy

question and answer

```{.json filename="data.jsonl"}
{"question": "...", "category": "...", "answer": "..."}
```

## oasst

instruction

```{.json filename="data.jsonl"}
{"INSTRUCTION": "...", "RESPONSE": "..."}
```

## gpteacher

instruction; input(optional)

```{.json filename="data.jsonl"}
{"instruction": "...", "input": "...", "response": "..."}
```

## reflection

instruction with reflect; input(optional)

```{.json filename="data.jsonl"}
{"instruction": "...", "input": "...", "output": "...", "reflection": "...", "corrected": "..."}
```

## explainchoice

question, choices, (solution OR explanation)

```{.json filename="data.jsonl"}
{"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
```

## concisechoice

question, choices, (solution OR explanation)

```{.json filename="data.jsonl"}
{"question": "...", "choices": ["..."], "solution": "...", "explanation": "..."}
```

## summarizetldr

article and summary

```{.json filename="data.jsonl"}
{"article": "...", "summary": "..."}
```

## alpaca_chat

basic instruct for alpaca chat

```{.json filename="data.jsonl"}
{"instruction": "...", "input": "...", "response": "..."}
```

## alpaca_chat.load_qa

question and answer for alpaca chat

```{.json filename="data.jsonl"}
{"question": "...", "answer": "..."}
```

## alpaca_chat.load_concise

question and answer for alpaca chat, for concise answers

```{.json filename="data.jsonl"}
{"instruction": "...", "input": "...", "response": "..."}
```

## alpaca_chat.load_camel_ai

question and answer for alpaca chat, for load_camel_ai

```{.json filename="data.jsonl"}
{"message_1": "...", "message_2": "..."}
```

## alpaca_w_system.load_open_orca

support for open orca datasets with included system prompts, instruct

```{.json filename="data.jsonl"}
{"system_prompt": "...", "question": "...", "response": "..."}
```

## context_qa

in context question answering from an article

```{.json filename="data.jsonl"}
{"article": "...", "question": "...", "answer": "..."}
```

## context_qa.load_v2

in context question answering (alternate)

```{.json filename="data.jsonl"}
{"context": "...", "question": "...", "answer": "..."}
```

## context_qa.load_404

in context question answering from an article, with default response for no answer from context

```{.json filename="data.jsonl"}
{"article": "...", "unanswerable_question": "..."}
```

## creative_acr.load_answer

instruction and revision

```{.json filename="data.jsonl"}
{"instruction": "...", "revision": "..."}
```

## creative_acr.load_critique

critique

```{.json filename="data.jsonl"}
{"scores": "...", "critiques": "...", "instruction": "...", "answer": "..."}
```

## creative_acr.load_revise

critique and revise

```{.json filename="data.jsonl"}
{"scores": "...", "critiques": "...", "instruction": "...", "answer": "...", "revision": "..."}
```

## metharme

instruction, adds additional eos tokens

```{.json filename="data.jsonl"}
{"prompt": "...", "generation": "..."}
```

## How to add custom prompt format

For a dataset that is preprocessed for instruction purposes:

```{.json filename="data.jsonl"}
{"input": "...", "output": "..."}
```

You can use this example in your YAML config:

```{.yaml filename="config.yaml"}
datasets:
  - path: repo
    type:
      system_prompt: ""
      field_system: system
      field_instruction: input
      field_output: output
      format: "[INST] {instruction} [/INST]"
      no_input_format: "[INST] {instruction} [/INST]"
```

See full config options under [here](../config-reference.qmd).


================================================
FILE: docs/dataset-formats/pretraining.qmd
================================================
---
title: Pre-training
description: Data format for a pre-training completion task.
order: 1
---

For pretraining, there is no prompt template or roles.  The only required field is `text`:

```{.json filename="data.jsonl"}
{"text": "first row"}
{"text": "second row"}
...
```

:::{.callout-note}

### Streaming is recommended for large datasets

Axolotl usually loads the entire dataset into memory. This will be challenging for large datasets. Use the following config to enable streaming:

```{.yaml filename="config.yaml"}
pretraining_dataset:
  - name:
    path:
    split:
    text_column: # column in dataset with the data, usually `text`
    type: pretrain
    trust_remote_code:
    skip: # number of rows of data to skip over from the beginning
```

:::


================================================
FILE: docs/dataset-formats/stepwise_supervised.qmd
================================================
---
title: Stepwise Supervised Format
description: Format for datasets with stepwise completions and labels
order: 3
---

## Stepwise Supervised

The stepwise supervised format is designed for chain-of-thought (COT) reasoning
datasets where each example contains multiple completion steps and a preference label
for each step.

### Example

Here's a simple example of a stepwise supervised dataset entry:

```json
{
  "prompt": "Which number is larger, 9.8 or 9.11?",
  "completions": [
    "The fractional part of 9.8 is 0.8, while the fractional part of 9.11 is 0.11.",
    "Since 0.11 is greater than 0.8, the number 9.11 is larger than 9.8."
  ],
  "labels": [true, false]
}
```


================================================
FILE: docs/dataset-formats/template_free.qmd
================================================
---
title: Template-Free
description: Construct prompts without a template.
toc: true
toc-depth: 3
order: 4
---

## Background {#sec-background}

### Masking Inputs {#masking-inputs}

One of the most popular features of
[axolotl](https://github.com/axolotl-ai-cloud/axolotl) is
setting the following configuration value:


```yaml
train_on_inputs: false
```

If you declare a [dataset formats](https://github.com/axolotl-ai-cloud/axolotl?tab=readme-ov-file#dataset)
such as `alpaca` or `chatml`, axolotl knows what is an input
(i.e. human) vs. an output (i.e. the assistant) and masks the input
labels so that your model can focus on predicting the outputs only.

### You may not want prompt templates {#sec-you-may-not-want-prompt-templates}

However, there are many situations where you don't want to use one of
these formats or templates. This is because they can:

-   Add unnecessary boilerplate to your prompts.
-   Create artifacts like special delimiters `<|im_start|>` that can
    quickly become footguns if you don't include them correctly at
    inference time.
-   Enforce a *chat* interface when you do not want one. Sometimes you
    just want to fine-tune a model to a very specific task and do NOT
    want multi-turn conversations, roles, etc.
-   Limit you to only certain roles that the template allows.

### The `input_output` format {#sec-the-inputoutput-format}

You can construct your prompts without a template by using the
`input_output` format, by setting `type: input_output` in your
configuration file like this:

**config.yml**

```yaml
train_on_inputs: false # Mask segments of your data
datasets:
  - path: output.jsonl
    type: input_output  # use template free prompt construction
```

Unlike `type: completion`, which is also template-free,
`type: input_output` allows you to mask segments of your text. More
details on how this works are described below.

## Usage {#sec-usage}

This is how you can use the `input_output` format:

### 1. Prepare Data {#sec-1-prepare-data}

To use the `input_output` format, collect your data in the following
format into a jsonl file (below is the first row from the file
`output`.jsonl` pretty printed):

```bash
$ head -n1 output.jsonl | python -m json.tool
```

:::{.cell-output .cell-output-stdout}
    {
        "segments": [
            {
                "label": true,
                "text": "<s>Hello\n"
            },
            {
                "label": true,
                "text": "hi there!. "
            },
            {
                "label": false,
                "text": "goodbye "
            },
            {
                "label": true,
                "text": "farewell</s>"
            }
        ]
    }
:::

Set `label:false` when you want to mask a segment of text so that the
model isn't trained on it. Some things to keep in mind:

> [!IMPORTANT]
> 1.  **EOS, BOS, spaces, newlines etc. are entirely up to you. Axolotl
    concatenates all the segments as-is.** The tokenizer doesn't add
    anything additional. Notice how I added spaces, newlines, `<s>`
    (BOS), and `</s>` (EOS) myself.
> 2.  Make sure you check the materialized output to validate that the
    prompt is getting assembled how you like.

### 2. Use `type: input_output` {#sec-2-use-type-inputoutput}

Let's materialize data with our `output.jsonl` file by setting
`type: input_output` in our axolotl config:

```yaml
# training_config.yaml
base_model: mistralai/Mistral-7B-v0.1
data_seed: 49
seed: 49

datasets:
  - path: output.jsonl
    type: input_output
val_set_size: 0.1

sequence_len: 896
sample_packing: false

micro_batch_size: 2
gradient_accumulation_steps: 3
eval_batch_size: 2
num_epochs: 1
learning_rate: 0.0002

train_on_inputs: false
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
```

You can use the following command to materialize your data. The
`--debug` flag will print the tokens, along with the labels so you can
verify that the correct items are being ignored:

```bash
axolotl preprocess training_config.yaml --debug

...
[2024-03-05 23:36:46,969] [INFO] [axolotl.check_example_labels:35] [PID:607731] [RANK:0] <s>(1, 1) Hello(22557, 22557)
(13, 13) hi(12014, 12014) there(736, 736) !(28808, 28808) .(28723, 28723) (28705, 28705) good(-100, 1179) bye(-100, 17664) (-100, 28705) fare(19111, 19111) well(5458, 5458) </s>(2, 2)

```

The format is `decoded_token`(`label`, `token_id`), for example,
`<s>(1, 1)` means that the token is `<s>`, the label is `1` and the
token_id is `1`. When the label is `-100` then that token is ignored for
training.

### 3. Check the prompts {#sec-3-check-the-prompts}

Here is another way to check the materialized output:

```python
from transformers import AutoTokenizer
from datasets import load_from_disk
import yaml

directory = !ls last_run_prepared/
with open('training_config.yaml', 'r') as f:
    cfg = yaml.safe_load(f)
model_id = cfg['base_model']
tok = AutoTokenizer.from_pretrained(model_id)
ds = load_from_disk(f'last_run_prepared/{directory[0]}/')
```

```python
>>> row = ds[0]
>>> print(tok.decode(row['input_ids']))
<s> Hello
    hi there!.  goodbye  farewell</s>
```

We can check that the right tokens are ignored by comparing the labels
to each token:

```python
import pandas as pd
pd.DataFrame([{'token': tok.decode(i), 'label': l, 'id':i} for i,l in
              zip(row['input_ids'], row['labels'])])
```

| token | label | id    |
|-------|-------|-------|
| 0     | \<s\> | 1     |
| 1     | Hello | 22557 |
| 2     | \\n   | 13    |
| 3     | hi    | 12014 |
| 4     | there | 736   |
| 5     | !     | 28808 |
| 6     | .     | 28723 |
| 7     |       | 28705 |
| 8     | good  | -100  |
| 9     | bye   | -100  |
| 10    |       | -100  |
| 11    | fare  | 19111 |
| 12    | well  | 5458  |
| 13    | \</s\>| 2     |


If we look at the input data, the above table seems correct! (The jsonl
version is repeated below for reference):


```bash
$ head -n1 output.jsonl | python -m json.tool
```

:::{.cell-output .cell-output-stdout}
    {
        "segments": [
            {
                "label": true,
                "text": "<s>Hello\n"
            },
            {
                "label": true,
                "text": "hi there!. "
            },
            {
                "label": false,
                "text": "goodbye "
            },
            {
                "label": true,
                "text": "farewell</s>"
            }
        ]
    }
:::


================================================
FILE: docs/dataset-formats/tokenized.qmd
================================================
---
title: Custom Pre-Tokenized Dataset
description: How to use a custom pre-tokenized dataset.
order: 5
---

- Pass an empty `type:` in your axolotl config.
- Columns in Dataset must be exactly `input_ids`, `attention_mask`, `labels`
- To indicate that a token should be ignored during training, set its corresponding label to `-100`.
- You must add BOS and EOS, and make sure that you are training on EOS by not setting its label to -100.
- For pretraining, do not truncate/pad documents to the context window length.
- For instruction training, documents must be truncated/padded as desired.

Sample config:

```{.yaml filename="config.yml"}
datasets:
  - path: /path/to/your/file.jsonl
    ds_type: json
    type:
```

Sample jsonl:

```jsonl
{"input_ids":[271,299,99],"attention_mask":[1,1,1],"labels":[271,-100,99]}
{"input_ids":[87,227,8383,12],"attention_mask":[1,1,1,1],"labels":[87,227,8383,12]}
```


================================================
FILE: docs/dataset_loading.qmd
================================================
---
title: Dataset Loading
description: Understanding how to load datasets from different sources
back-to-top-navigation: true
toc: true
toc-depth: 5
---

## Overview

Datasets can be loaded in a number of different ways depending on the how it is saved (the extension of the file) and where it is stored.

## Loading Datasets

We use the `datasets` library to load datasets and a mix of `load_dataset` and `load_from_disk` to load them.

You may recognize the similar named configs between `load_dataset` and the `datasets` section of the config file.

```yaml
datasets:
  - path:
    name:
    data_files:
    split:
    revision:
    trust_remote_code:
```

::: {.callout-tip}

Do not feel overwhelmed by the number of options here. A lot of them are optional. In fact, the most common config to use would be `path` and sometimes `data_files`.

:::

This matches the API of [`datasets.load_dataset`](https://github.com/huggingface/datasets/blob/0b5998ac62f08e358f8dcc17ec6e2f2a5e9450b6/src/datasets/load.py#L1838-L1858), so if you're familiar with that, you will feel right at home.

For HuggingFace's guide to load different dataset types, see [here](https://huggingface.co/docs/datasets/loading).

For full details on the config, see [config-reference.qmd](config-reference.qmd).

::: {.callout-note}

You can set multiple datasets in the config file by more than one entry under `datasets`.

```yaml
datasets:
  - path: /path/to/your/dataset
  - path: /path/to/your/other/dataset
```

:::

### Local dataset

#### Files

To load a JSON file, you would do something like this:

```python
from datasets import load_dataset

dataset = load_dataset("json", data_files="data.json")
```

Which translates to the following config:

```yaml
datasets:
  - path: data.json
    ds_type: json
```

In the example above, it can be seen that we can just point the `path` to the file or directory along with the `ds_type` to load the dataset.

This works for CSV, JSON, Parquet, and Arrow files.

::: {.callout-tip}

If `path` points to a file and `ds_type` is not specified, we will automatically infer the dataset type from the file extension, so you could omit `ds_type` if you'd like.

:::

#### Directory

If you're loading a directory, you can point the `path` to the directory.

Then, you have two options:

##### Loading entire directory

You do not need any additional configs.

We will attempt to load in the following order:
- datasets saved with `datasets.save_to_disk`
- loading entire directory of files (such as with parquet/arrow files)

```yaml
datasets:
  - path: /path/to/your/directory
```

##### Loading specific files in directory

Provide `data_files` with a list of files to load.

```yaml
datasets:
    # single file
  - path: /path/to/your/directory
    ds_type: csv
    data_files: file1.csv

    # multiple files
  - path: /path/to/your/directory
    ds_type: json
    data_files:
      - file1.jsonl
      - file2.jsonl

    # multiple files for parquet
  - path: /path/to/your/directory
    ds_type: parquet
    data_files:
      - file1.parquet
      - file2.parquet

```

### HuggingFace Hub

The method you use to load the dataset depends on how the dataset was created, whether a folder was uploaded directly or a HuggingFace Dataset was pushed.

::: {.callout-note}

If you're using a private dataset, you will need to enable the `hf_use_auth_token` flag in the root-level of the config file.

:::

#### Folder uploaded

This would mean that the dataset is a single file or file(s) uploaded to the Hub.

```yaml
datasets:
  - path: org/dataset-name
    data_files:
      - file1.jsonl
      - file2.jsonl
```

#### HuggingFace Dataset

This means that the dataset is created as a HuggingFace Dataset and pushed to the Hub via `datasets.push_to_hub`.

```yaml
datasets:
  - path: org/dataset-name
```

::: {.callout-note}

There are some other configs which may be required like `name`, `split`, `revision`, `trust_remote_code`, etc depending on the dataset.

:::

### Remote Filesystems

Via the `storage_options` config under `load_dataset`, you can load datasets from remote filesystems like S3, GCS, Azure, and OCI.

::: {.callout-warning}

This is currently experimental. Please let us know if you run into any issues!

:::

The only difference between the providers is that you need to prepend the path with the respective protocols.

```yaml
datasets:
    # Single file
  - path: s3://bucket-name/path/to/your/file.jsonl

    # Directory
  - path: s3://bucket-name/path/to/your/directory
```

For directory, we load via `load_from_disk`.

#### S3

Prepend the path with `s3://`.

The credentials are pulled in the following order:

- `AWS_ACCESS_KEY_ID`, `AWS_SECRET_ACCESS_KEY`, and `AWS_SESSION_TOKEN` environment variables
- from the `~/.aws/credentials` file
- for nodes on EC2, the IAM metadata provider

::: {.callout-note}

We assume you have credentials setup and not using anonymous access. If you want to use anonymous access, let us know! We may have to open a config option for this.

:::

Other environment variables that can be set can be found in [boto3 docs](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/configuration.html#using-environment-variables)

#### GCS

Prepend the path with `gs://` or `gcs://`.

The credentials are loaded in the following order:

- gcloud credentials
- for nodes on GCP, the google metadata service
- anonymous access

#### Azure

##### Gen 1

Prepend the path with `adl://`.

Ensure you have the following environment variables set:

- `AZURE_STORAGE_TENANT_ID`
- `AZURE_STORAGE_CLIENT_ID`
- `AZURE_STORAGE_CLIENT_SECRET`

##### Gen 2

Prepend the path with `abfs://` or `az://`.

Ensure you have the following environment variables set:

- `AZURE_STORAGE_ACCOUNT_NAME`
- `AZURE_STORAGE_ACCOUNT_KEY`

Other environment variables that can be set can be found in [adlfs docs](https://github.com/fsspec/adlfs?tab=readme-ov-file#setting-credentials)

#### OCI

Prepend the path with `oci://`.

It would attempt to read in the following order:

- `OCIFS_IAM_TYPE`, `OCIFS_CONFIG_LOCATION`, and `OCIFS_CONFIG_PROFILE` environment variables
- when on OCI resource, resource principal

Other environment variables:

- `OCI_REGION_METADATA`

Please see the [ocifs docs](https://ocifs.readthedocs.io/en/latest/getting-connected.html#Using-Environment-Variables).

### HTTPS

The path should start with `https://`.

```yaml
datasets:
  - path: https://path/to/your/dataset/file.jsonl
```

This must be publically accessible.

## Next steps

Now that you know how to load datasets, you can learn more on how to load your specific dataset format into your target output format [dataset formats docs](dataset-formats).


================================================
FILE: docs/dataset_preprocessing.qmd
================================================
---
title: Dataset Preprocessing
description: How datasets are processed
---

## Overview

Dataset pre-processing is the step where Axolotl takes each dataset you've configured alongside
the [dataset format](dataset-formats) and prompt strategies to:

 - parse the dataset based on the *dataset format*
 - transform the dataset to how you would interact with the model based on the *prompt strategy*
 - tokenize the dataset based on the configured model & tokenizer
 - shuffle and merge multiple datasets together if using more than one

The processing of the datasets can happen one of two ways:

1. Before kicking off training by calling `axolotl preprocess config.yaml --debug`
2. When training is started

### What are the benefits of pre-processing?

When training interactively or for sweeps
(e.g. you are restarting the trainer often), processing the datasets can oftentimes be frustratingly
slow. Pre-processing will cache the tokenized/formatted datasets according to a hash of dependent
training parameters so that it will intelligently pull from its cache when possible.

The path of the cache is controlled by `dataset_prepared_path:` and is often left blank in example
YAMLs as this leads to a more robust solution that prevents unexpectedly reusing cached data.

If `dataset_prepared_path:` is left empty, when training, the processed dataset will be cached in a
default path of `./last_run_prepared/`, but will ignore anything already cached there. By explicitly
setting `dataset_prepared_path: ./last_run_prepared`, the trainer will use whatever pre-processed
data is in the cache.

### What are the edge cases?

Let's say you are writing a custom prompt strategy or using a user-defined
prompt template. Because the trainer cannot readily detect these changes, we cannot change the
calculated hash value for the pre-processed dataset.

If you have `dataset_prepared_path: ...` set
and change your prompt templating logic, it may not pick up the changes you made and you will be
training over the old prompt.


================================================
FILE: docs/debugging.qmd
================================================
---
title: Debugging
description: How to debug Axolotl
---


This document provides some tips and tricks for debugging Axolotl.  It also provides an example configuration for debugging with VSCode.  A good debugging setup is essential to understanding how Axolotl code works behind the scenes.

## Table of Contents

- [General Tips](#general-tips)
- [Debugging with VSCode](#debugging-with-vscode)
    - [Background](#background)
    - [Configuration](#configuration)
    - [Customizing your debugger](#customizing-your-debugger)
    - [Video Tutorial](#video-tutorial)
- [Debugging With Docker](#debugging-with-docker)
    - [Setup](#setup)
    - [Attach To Container](#attach-to-container)
    - [Video - Attaching To Docker On Remote Host](#video---attaching-to-docker-on-remote-host)

## General Tips

While debugging it's helpful to simplify your test scenario as much as possible.  Here are some tips for doing so:

> [!Important]
> All of these tips are incorporated into the [example configuration](#configuration) for debugging with VSCode below.

1. **Make sure you are using the latest version of axolotl**:  This project changes often and bugs get fixed fast.  Check your git branch and make sure you have pulled the latest changes from `main`.
1. **Eliminate concurrency**: Restrict the number of processes to 1 for both training and data preprocessing:
    - Set `CUDA_VISIBLE_DEVICES` to a single GPU, ex: `export CUDA_VISIBLE_DEVICES=0`.
    - Set `dataset_num_proc: 1` in your axolotl config or run the training command with `--dataset_num_proc=1`.
2. **Use a small dataset**: Construct or use a small dataset from HF Hub. When using a small dataset, you will often have to make sure `sample_packing: False` and `eval_sample_packing: False` to avoid errors.  If you are in a pinch and don't have time to construct a small dataset but want to use from the HF Hub, you can shard the data (this will still tokenize the entire dataset, but will only use a fraction of the data for training.  For example, to shard the dataset into 20 pieces, add the following to your axolotl config):

    ```yaml
    datasets:
        ...
        shards: 20
    ```

3. **Use a small model**: A good example of a small model is [TinyLlama/TinyLlama-1.1B-Chat-v1.0](https://huggingface.co/TinyLlama/TinyLlama-1.1B-Chat-v1.0).
4. **Minimize iteration time**: Make sure the training loop finishes as fast as possible, with these settings.
    - `micro_batch_size: 1`
    - `max_steps: 1`
    - `val_set_size: 0`
5. **Clear Caches:** Axolotl caches certain steps and so does the underlying HuggingFace trainer.  You may want to clear some of these caches when debugging.
    - Data preprocessing: When debugging data preprocessing, which includes prompt template formation, you may want to delete the directory set in `dataset_prepared_path:` in your axolotl config.  If you didn't set this value, the default is `last_run_prepared`.
    - HF Hub: If you are debugging data preprocessing, you should clear the relevant HF cache [HuggingFace cache](https://huggingface.co/docs/datasets/cache), by deleting the appropriate `~/.cache/huggingface/datasets/...` folder(s).
    - **The recommended approach is to redirect all outputs and caches to a temporary folder and delete selected subfolders before each run.  This is demonstrated in the example configuration below.**


## Debugging with VSCode

### Background

The below example shows how to configure VSCode to debug data preprocessing of the `chat_template` format.  This is the format used when you have the following in your axolotl config:

```yaml
datasets:
  - path: <path to your chat_template formatted dataset> # example on HF Hub: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
```

>[!Important]
> If you are already familiar with advanced VSCode debugging, you can skip the below explanation and look at the files [.vscode/launch.json](../.vscode/launch.json) and [.vscode/tasks.json](../.vscode/tasks.json) for an example configuration.

>[!Tip]
> If you prefer to watch a video, rather than read, you can skip to the [video tutorial](#video-tutorial) below (but doing both is recommended).

### Setup

Make sure you have an [editable install](https://setuptools.pypa.io/en/latest/userguide/development_mode.html) of Axolotl, which ensures that changes you make to the code are reflected at runtime.  Run the following commands from the root of this project:

```bash
pip3 install packaging
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
```

#### Remote Hosts

If you developing on a remote host, you can easily use VSCode to debug remotely.  To do so, you will need to follow this [remote - SSH guide](https://code.visualstudio.com/docs/remote/ssh).  You can also see the video below on [Docker and Remote SSH debugging](#video---attaching-to-docker-on-remote-host).


### Configuration

The easiest way to get started is to modify the [.vscode/launch.json](../.vscode/launch.json) file in this project.  This is just an example configuration, so you may need to modify or copy it to suit your needs.

For example, to mimic the command `cd devtools && CUDA_VISIBLE_DEVICES=0 accelerate launch -m axolotl.cli.train dev_chat_template.yml`, you would use the below configuration[^1].  Note that we add additional flags that override the axolotl config and incorporate the tips above (see the comments). We also set the working directory to `devtools` and set the `env` variable `HF_HOME` to a temporary folder that is later partially deleted.  This is because we want to delete the HF dataset cache before each run in order to ensure that the data preprocessing code is run from scratch.

```json
// .vscode/launch.json
{
    "version": "0.2.0",
    "configurations": [
        {
            "name": "Debug axolotl prompt - chat_template",
            "type": "python",
            "module": "accelerate.commands.launch",
            "request": "launch",
            "args": [
                "-m", "axolotl.cli.train", "dev_chat_template.yml",
                // The flags below simplify debugging by overriding the axolotl config
                // with the debugging tips above.  Modify as needed.
                "--dataset_num_proc=1",      // limits data preprocessing to one process
                "--max_steps=1",              // limits training to just one step
                "--batch_size=1",             // minimizes batch size
                "--micro_batch_size=1",       // minimizes batch size
                "--val_set_size=0",           // disables validation
                "--sample_packing=False",     // disables sample packing which is necessary for small datasets
                "--eval_sample_packing=False",// disables sample packing on eval set
                "--dataset_prepared_path=temp_debug/axolotl_outputs/data", // send data outputs to a temp folder
                "--output_dir=temp_debug/axolotl_outputs/model" // send model outputs to a temp folder
                ],
            "console": "integratedTerminal",      // show output in the integrated terminal
            "cwd": "${workspaceFolder}/devtools", // set working directory to devtools from the root of the project
            "justMyCode": true,                   // step through only axolotl code
            "env": {"CUDA_VISIBLE_DEVICES": "0",  // Since we aren't doing distributed training, we need to limit to one GPU
                    "HF_HOME": "${workspaceFolder}/devtools/temp_debug/.hf-cache"}, // send HF cache to a temp folder
            "preLaunchTask": "cleanup-for-dataprep", // delete temp folders (see below)
        }
    ]
}
```

**Additional notes about this configuration:**

- The argument `justMyCode` is set to `true` such that you step through only the axolotl code.  If you want to step into dependencies, set this to `false`.
- The `preLaunchTask`: `cleanup-for-dataprep` is defined in [.vscode/tasks.json](../.vscode/tasks.json) and is used to delete the following folders before debugging, which is essential to ensure that the data pre-processing code is run from scratch:
    -  `./devtools/temp_debug/axolotl_outputs`
    - `./devtools/temp_debug/.hf-cache/datasets`

>[!Tip]
> You may not want to delete these folders. For example, if you are debugging model training instead of data pre-processing, you may NOT want to delete the cache or output folders. You may also need to add additional tasks to the `tasks.json` file depending on your use case.

Below is the [./vscode/tasks.json](../.vscode/tasks.json) file that defines the `cleanup-for-dataprep` task.  This task is run before each debugging session when you use the above configuration.  Note how there are two tasks that delete the two folders mentioned above.  The third task `cleanup-for-dataprep` is a composite task that combines the two tasks.  A composite task is necessary because VSCode does not allow you to specify multiple tasks in the `preLaunchTask` argument of the `launch.json` file.

```json
// .vscode/tasks.json
// this file is used by launch.json
{
    "version": "2.0.0",
    "tasks": [
      // this task changes into the devtools directory and deletes the temp_debug/axolotl_outputs folder
      {
        "label": "delete-outputs",
        "type": "shell",
        "command": "rm -rf temp_debug/axolotl_outputs",
        "options":{ "cwd": "${workspaceFolder}/devtools"},
        "problemMatcher": []
      },
      // this task changes into the devtools directory and deletes the `temp_debug/.hf-cache/datasets` folder
      {
        "label": "delete-temp-hf-dataset-cache",
        "type": "shell",
        "command": "rm -rf temp_debug/.hf-cache/datasets",
        "options":{ "cwd": "${workspaceFolder}/devtools"},
        "problemMatcher": []
      },
        // this task combines the two tasks above
      {
       "label": "cleanup-for-dataprep",
       "dependsOn": ["delete-outputs", "delete-temp-hf-dataset-cache"],
      }
    ]
}
```

### Customizing your debugger

Your debugging use case may differ from the example above.  The easiest thing to do is to put your own axolotl config in the `devtools` folder and modify the `launch.json` file to use your config.  You may also want to modify the `preLaunchTask` to delete different folders or not delete anything at all.

### Video Tutorial

The following video tutorial walks through the above configuration and demonstrates how to debug with VSCode, (click the image below to watch):

<div style="text-align: center; line-height: 0;">

<a href="https://youtu.be/xUUB11yeMmc" target="_blank"
title="How to debug Axolotl (for fine tuning LLMs)"><img
src="https://i.ytimg.com/vi/xUUB11yeMmc/maxresdefault.jpg"
style="border-radius: 10px; display: block; margin: auto;" width="560" height="315" /></a>

<figcaption style="font-size: smaller;"><a href="https://hamel.dev">Hamel Husain's</a> tutorial: <a href="https://www.youtube.com/watch?v=xUUB11yeMmc">Debugging Axolotl w/VSCode</a></figcaption>

</div>
<br>

## Debugging With Docker

Using [official Axolotl Docker images](https://hub.docker.com/r/axolotlai/axolotl/tags) is a great way to debug your code, and is a very popular way to use Axolotl.  Attaching VSCode to Docker takes a few more steps.

### Setup

On the host that is running axolotl (ex: if you are using a remote host), clone the axolotl repo and change your current directory to the root:

```bash
git clone https://github.com/axolotl-ai-cloud/axolotl
cd axolotl
```

>[!Tip]
> If you already have axolotl cloned on your host, make sure you have the latest changes and change into the root of the project.

Next, run the desired docker image and mount the current directory. Below is a docker command you can run to do this:[^2]

```bash
docker run --privileged --gpus '"all"' --shm-size 10g --rm -it --name axolotl --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 --mount type=bind,src="${PWD}",target=/workspace/axolotl -v ${HOME}/.cache/huggingface:/root/.cache/huggingface axolotlai/axolotl:main-py3.10-cu118-2.0.1
```

>[!Tip]
> To understand which containers are available, see the [Docker section of the README](../README.md#docker) and the [DockerHub repo](https://hub.docker.com/r/axolotlai/axolotl/tags).  For details of how the Docker containers are built, see axolotl's [Docker CI builds](../.github/workflows/main.yml).

You will now be in the container.  Next, perform an editable install of Axolotl:

```bash
pip3 install packaging
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
```

### Attach To Container

Next, if you are using a remote host, [Remote into this host with VSCode](https://code.visualstudio.com/docs/remote/ssh).  If you are using a local host, you can skip this step.

Next, select `Dev Containers: Attach to Running Container...` using the command palette (`CMD + SHIFT + P`) in VSCode.  You will be prompted to select a container to attach to.  Select the container you just created.  You will now be in the container with a working directory that is at the root of the project.  Any changes you make to the code will be reflected both in the container and on the host.

Now you are ready to debug as described above (see [Debugging with VSCode](#debugging-with-vscode)).

### Video - Attaching To Docker On Remote Host

Here is a short video that demonstrates how to attach to a Docker container on a remote host:

<div style="text-align: center; line-height: 0;">

<a href="https://youtu.be/0AuoR7QnHR0" target="_blank"
title="Debugging Axolotl Part 2: Attaching to Docker on a Remote Host"><img
src="https://i.ytimg.com/vi/0AuoR7QnHR0/hqdefault.jpg"
style="border-radius: 10px; display: block; margin: auto;" width="560" height="315" /></a>

<figcaption style="font-size: smaller;"><a href="https://hamel.dev">Hamel Husain's</a> tutorial: <a href="https://youtu.be/0AuoR7QnHR0">Debugging Axolotl Part 2: Attaching to Docker on a Remote Host
</a></figcaption>

</div>
<br>

[^1]: The config actually mimics the command `CUDA_VISIBLE_DEVICES=0 python -m accelerate.commands.launch -m axolotl.cli.train devtools/chat_template.yml`, but this is the same thing.

[^2]: Many of the below flags are recommended best practices by Nvidia when using nvidia-container-toolkit.  You can read more about these flags [here](https://docs.nvidia.com/deeplearning/frameworks/user-guide/index.html).


================================================
FILE: docs/docker.qmd
================================================
---
title: "Docker"
format:
  html:
    toc: true
    toc-depth: 4
---

This section describes the different Docker images that are released by AxolotlAI at [Docker Hub](https://hub.docker.com/u/axolotlai).

::: {.callout-important}
For Blackwell GPUs, please use the tags with PyTorch 2.7.1 and CUDA 12.8.
:::

## Base

The base image is the most minimal image that can install Axolotl. It is based on the `nvidia/cuda` image. It includes python, torch, git, git-lfs, awscli, pydantic, and more.

#### Image

```
axolotlai/axolotl-base
```

Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-base)

#### Tags format

```bash
main-base-py{python_version}-cu{cuda_version}-{pytorch_version}
```

Tags examples:

- `main-base-py3.11-cu128-2.8.0`
- `main-base-py3.11-cu128-2.9.1`

## Main

The main image is the image that is used to run Axolotl. It is based on the `axolotlai/axolotl-base` image and includes the Axolotl codebase, dependencies, and more.

#### Image

```
axolotlai/axolotl
```

Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl)

#### Tags format {#sec-main-tags}

```bash
# on push to main
main-py{python_version}-cu{cuda_version}-{pytorch_version}

# latest main (currently torch 2.6.0, python 3.11, cuda 12.4)
main-latest

# nightly build
{branch}-{date_in_YYYYMMDD}-py{python_version}-cu{cuda_version}-{pytorch_version}

# tagged release
{version}
```

:::{.callout-tip}

There may be some extra tags appended to the image, like `-vllm` which installs those packages.

:::

Tags examples:

- `main-py3.11-cu128-2.8.0`
- `main-py3.11-cu128-2.9.1`
- `main-latest`
- `main-20250303-py3.11-cu124-2.6.0`
- `main-20250303-py3.11-cu126-2.6.0`
- `0.12.0`

## Cloud

The cloud image is the image that is used to run Axolotl in the cloud. It is based on the `axolotlai/axolotl` image and sets ENV variables like HuggingFace cache directories for volume mounts, tmux, and more for different cloud providers.

:::{.callout-tip}

Jupyter lab is run by default. Set `JUPYTER_DISABLE=1` in the environment variables to disable it.

:::

#### Image

```
axolotlai/axolotl-cloud
```

Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-cloud)

#### Tags format

This uses the same tags as the [`main` image](#sec-main-tags).

#### Environment variables

- `JUPYTER_DISABLE`: Disable Jupyter lab.
- `JUPYTER_PASSWORD`: Set a password for the Jupyter lab.
- `PUBLIC_KEY` / `SSH_KEY`: Add a public key for the SSH service.

#### Volume mounts

:::{.callout-tip}

We recommend mounting volumes to `/workspace/data` for data persistence. `/workspace/axolotl` contains the source code and is ephemeral.

:::

- `/workspace/data/axolotl-artifacts`: Directory to store Axolotl artifacts.
- `/workspace/data/huggingface-cache`: Directory to store HuggingFace cache.

## Cloud-no-tmux

This is the same as the [`cloud` image](#sec-cloud) but without tmux.

#### Image

```
axolotlai/axolotl-cloud-term
```

Link: [Docker Hub](https://hub.docker.com/r/axolotlai/axolotl-cloud-term)

:::{.callout-note}

The naming may be a bit confusing as it has `-term` appended to the end.

:::

#### Tags format

This uses the same tags as the [`cloud` image](#sec-cloud-tags).


================================================
FILE: docs/expert_quantization.qmd
================================================
---
title: "MoE Expert Quantization"
description: "Reduce VRAM usage when training MoE model adapters by quantizing expert weights on load"
---

Transformers v5 changed MoE expert layers from `nn.Linear` to fused `nn.Parameter` (3D+ tensors).
This means `bitsandbytes` can no longer quantize them during model loading, resulting in all expert
weights being loaded in full bf16 precision and causing massive VRAM usage.

`quantize_moe_experts` solves this by quantizing expert weights during model loading.
It intercepts the weight loading process, quantizes each expert tensor on the fly, and
immediately frees the original bf16 tensor from VRAM. This dramatically reduces peak memory.
For example, GLM-4.7-Flash QLoRA drops from ~127GiB to ~23GiB reserved memory.

## Usage

Enable expert quantization in your Axolotl config:

```yaml
quantize_moe_experts: true
```

This works with both 4-bit (QLoRA) and 8-bit (LoRA) quantization.

### Expert LoRA targeting

You can optionally apply LoRA adapters directly to expert weights using `lora_target_parameters`:

```yaml
lora_target_parameters:
  - mlp.experts.gate_up_proj
  - mlp.experts.down_proj
  # - mlp.gate.weight  # router
```

::: {.callout-note}
`lora_dropout` must be `0` when using `lora_target_parameters`.
:::

## Requirements

- Requires (`adapter: lora` and `load_in_8bit: true`) or (`adapter: qlora` and `load_in_4bit: true`)
- CUDA GPUs only (not tested with ROCm or other backends)
- FSDP2 compatible for distributed training

## Limitations

- `lora_target_linear` is not compatible with `quantize_moe_experts`. See [Expert LoRA targeting](#expert-lora-targeting) instead.
- `cpu_ram_efficient_loading` hangs / takes long time with FSDP2 + QLoRA.
- Total model parameter count may display incorrectly (trainable param count is correct).
- FSDP LoRA (8-bit) may have a large initial VRAM spike at the first 1-2 steps, which then drops. QLoRA does not exhibit this.
- FSDP2 may use more VRAM per GPU than single GPU training due to not all layers being properly sharded across ranks.
- Model loading takes longer due to on-demand quantization, even on consecutive runs.
- DeepSpeed has not been tested.

## Implementation details

The quantization is applied by patching transformers to intercept weight loading.
When a 3D+ CUDA tensor with "expert" in its name is detected:

- **4-bit mode:** Uses bitsandbytes NF4 parametrization (configurable via `bnb_4bit_quant_type`).
- **8-bit mode:** Uses a custom row-wise int8 parametrization with bitsandbytes dequantization.

The original bf16 tensor is freed immediately after quantization. Multiple sub-patches are applied to
transformers, PEFT and accelerate FSDP2 to support these parametrized expert modules.

For full implementation details, see [PR #3439](https://github.com/axolotl-ai-cloud/axolotl/pull/3439).


================================================
FILE: docs/faq.qmd
================================================
---
title: FAQ
description: Frequently asked questions
---

### General

**Q: The trainer stopped and hasn't progressed in several minutes.**

> A: Usually an issue with the GPUs communicating with each other. See the [NCCL doc](nccl.qmd)

**Q: exitcode: -9**

> A: This usually happens when you run out of system RAM.

**Q: exitcode: -7 while using deepspeed**

> A: Try upgrading deepspeed w: `pip install -U deepspeed`

**Q: AttributeError: 'DummyOptim' object has no attribute 'step'**

**Q: ModuleNotFoundError: No module named 'mpi4py' using single GPU with deepspeed**

> A: You may be using deepspeed with single gpu. Please remove the `deepspeed:` section in the yaml file or `--deepspeed` CLI flag.

**Q: The codes is stuck on saving preprocessed datasets.**

> A: This is usually an issue with the GPU. This can be resolved through setting the os environment variable `CUDA_VISIBLE_DEVICES=0`. If you are on runpod, this is usually a pod issue. Starting a new pod should take care of it.

**Q: Received mismatch error on merge adapters / loading adapters between torch.Size of checkpoint and model.**

> A: This is likely due to vocab size mismatch. By default, Axolotl expands the model's embeddings if the tokenizer has more tokens than the model. Please use the `axolotl merge-lora` command to merge the adapters instead of using your own scripts.

> On the other hand, if the model has more tokens than the tokenizer, Axolotl does not shrink the model's embeddings unless `shrink_embeddings: true` is set in the config.

**Q: How to call Axolotl via custom python scripts?**

> A: Since Axolotl is just Python, please see `src/axolotl/cli/main.py` on how each command is called.

**Q: How to know the value to use for `fsdp_transformer_layer_cls_to_wrap`?**

> A: This is the class name of the transformer layer to wrap with FSDP. For example, for `LlamaForCausalLM`, the value is `LlamaDecoderLayer`. To find this for a specific model, check the model's `PreTrainedModel` definition and look for `_no_split_modules` variable in the `modeling_<model_name>.py` file within `transformers` library.

**Q: ValueError: Asking to pad but the tokenizer does not have a padding token. Please select a token to use as pad_token**

> A: This is because the tokenizer does not have a padding token. Please add a padding token to the tokenizer via:

> ```yaml
> special_tokens:
>   # str. If you're not sure, set to same as `eos_token`.
>   pad_token: "..."
> ```

**Q: `IterableDataset error` or `KeyError: 'input_ids'` when using `preprocess` CLI**

> A: This is because you may be using `preprocess` CLI with `pretraining_dataset:` or `skip_prepare_dataset: true` respectively. Please use `axolotl train` CLI directly instead as these datasets are prepared on demand.

**Q: vLLM is not working with Axolotl**

> A: We currently recommend torch 2.6.0 for use with `vllm`. Please ensure you use the right version. For Docker, please use the `main-py3.11-cu124-2.6.0` tag.

**Q: FA2 2.8.0 `undefined symbol` runtime error on CUDA 12.4**

> A: There seems to be a wheel issue with FA2 2.8.0 on CUDA 12.4. Try CUDA 12.6 instead or downgrade to FA2 2.7.4. Please refer to the upstream issue: https://github.com/Dao-AILab/flash-attention/issues/1717.

**Q: Can we mix text and text+image datasets for VLM training?**

> A: Yes, you can for newer VLM arch. The ones that would not work are LLaVA / Pixtral arch. If you notice one not working, please let us know!

**Q: Why is `memory/max_*` different from `nvidia-smi`?**

> A: We use `torch` APIs to retrieve this information. You can see https://docs.pytorch.org/docs/stable/notes/cuda.html#cuda-memory-management for more information.

### Chat templates

**Q: `jinja2.exceptions.UndefinedError: 'dict object' has no attribute 'content' / 'role' / ____`**

> A: This means that the property mapping for the stated attribute does not exist when building `chat_template` prompt. For example, if `no attribute 'content'`, please check you have added the correct mapping for `content` under `message_property_mappings`.

**Q: `Empty template generated for turn ___`**

> A: The `content` is empty for that turn.

**Q: `Could not find content start/end boundary for turn __`**

> A: The specific turn's start/end could not be detected. Please ensure you have set the `eos_token` following your `chat_template`. Otherwise, this could be a `chat_template` which doesn't use proper boundaries for each turn (like system). On the rare occurrence, make sure your content is not `[[dummy_message]]`. Please let us know about this.

**Q: `Content end boundary is before start boundary for turn ___`**

> A: This is an edge case which should not occur. Please create an Issue if this happens.

**Q: `Content end boundary is the same as start boundary for turn ___. This is likely an empty turn.`**

> A: This is likely an empty turn.

**Q: The EOS token is incorrectly being masked or not being masked / `EOS token __ not found in chat template`.**

> A: There can be two reasons:

> 1. This is because of the mismatch between `tokenizer.eos_token` and EOS token in template. Please make sure to set `eos_token: ` under `special_tokens: ` to the same EOS token as in template.

> 2. The EOS token is not in the template. Please check if your template is correct. As an example, `phi_35` template does not use its dedicated EOS token `<|endoftext|>` at the end.

**Q: "`chat_template` choice is `tokenizer_default` but tokenizer's `chat_template` is null. Please add a `chat_template` in tokenizer config"**

> A: This is because the tokenizer does not have a chat template. Please add a chat template in the tokenizer config. See [chat_template](dataset-formats/conversation.qmd#chat-template) for more details.

**Q: The EOT token(s) are incorrectly being masked or not being masked / `EOT token __ not found in chat template`.**

> A: There can be two reasons:

> 1. The EOT token is different from the EOS token and was not specified under `eot_tokens: `. Please set `eot_tokens: ` to the same EOT token(s) as in template.

> 2. There is more than one EOT token per turn in the template. Please raise an issue with examples as we recognize this as an edge case.

**Q: `EOT token encoding failed. Please check if the token is valid and can be encoded.`**

> A: There could be some issue with the tokenizer or unicode encoding. Please raise an issue with examples with the EOT token & tokenizer causing the issue.

**Q: `EOT token __ is encoded as multiple tokens.`**

> A: This is because the EOT token is encoded as multiple tokens which can cause unexpected behavior. Please add it under `tokens: ` or (recommended) override unused added_tokens via `added_tokens_overrides: `.

**Q: `Conflict between train_on_eos and train_on_eot. eos_token is in eot_tokens and train_on_eos != train_on_eot`**

> A: This is because the EOS token is in the `eot_tokens: ` while mismatch between `train_on_eos: ` and `train_on_eot: `. This will cause one to override the other. Please ensure that `train_on_eos: ` and `train_on_eot: ` are the same or remove the EOS token from `eot_tokens: `.

**Q: If `eot_tokens: ` is not provided, what happens?**

> A: If `eot_tokens: ` is not provided, the default behavior is the same as before. EOS tokens used to delimit turns are masked/unmasked depending on whether the turn is trainable.

> Internally, `eot_tokens: tokenizer.eos_token` and `train_on_eot: train_on_eos` (which defaults to `turn`). This transition helps clarify the naming and behavior of EOT/EOS tokens.

**Q: `Data processing error: CAS service error`**

> A: Try disabling XET with `export HF_HUB_DISABLE_XET=1`

**Q: `torch._inductor.exc.LoweringException: NoValidChoicesError: No choices to select, please consider adding ATEN into max_autotune_gemm_backends config (defined in torch/_inductor/config.py) to allow at least one choice. `**

> A: Depending on the version of torch, you may need to include this in your YAML:

> ```yaml
> flex_attn_compile_kwargs:
>   dynamic: false
>   mode: max-autotune-no-cudagraphs
> ```

**Q: `ValueError("Backward pass should have cleared tracker of all tensors")`

> A: This may happen due to edge cases in using the modern OffloadActivations context manager for CUDA streams. If you encounter this error, you may have success using the naive implementation with `offload_activations: legacy` in your YAML.

**Q: `Error parsing tool_calls arguments as JSON.`

> A: There is an error parsing string arguments to a dict. Please check your dataset and the error message for more details.


================================================
FILE: docs/fsdp_qlora.qmd
================================================
---
title: "FSDP + QLoRA"
description: Use FSDP with QLoRA to fine-tune large LLMs on consumer GPUs.
format:
  html:
    toc: true
---

## Background

Using FSDP with QLoRA is essential for **fine-tuning larger (70b+ parameter) LLMs on consumer GPUs.**  For example, you can use FSDP + QLoRA to train a 70b model on two 24GB GPUs[^1].

Below, we describe how to use this feature in Axolotl.

## Usage

To enable `QLoRA` with `FSDP`, you need to perform the following steps:

> ![Tip]
> See the [example config](#example-config) file in addition to reading these instructions.

1. Set `adapter: qlora` in your axolotl config file.
2. Enable FSDP in your axolotl config, as [described here](multi-gpu.qmd#sec-fsdp).
3. Use one of the supported model types: `llama`, `mistral` or `mixtral`.

## Enabling Swap for FSDP2

If available memory is insufficient even after FSDP's CPU offloading, you can enable swap memory usage by setting `cpu_offload_pin_memory: false` alongside `offload_params: true` in FSDP config.

This disables memory pinning, allowing FSDP to use disk swap space as fallback. Disabling memory pinning itself incurs performance overhead, and actually having to use swap adds more, but it may enable training larger models that would otherwise cause OOM errors on resource constrained systems.

## Example Config

[examples/llama-2/qlora-fsdp.yml](../examples/llama-2/qlora-fsdp.yml) contains an example of how to enable QLoRA + FSDP in axolotl.

## References

- [PR #1378](https://github.com/axolotl-ai-cloud/axolotl/pull/1378) enabling QLoRA in FSDP in Axolotl.
- [Blog Post](https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html) from the [Answer.AI](https://www.answer.ai/) team describing the work that enabled QLoRA in FSDP.
- Related HuggingFace PRs Enabling FDSP + QLoRA:
    - Accelerate [PR#2544](https://github.com/huggingface/accelerate/pull/2544 )
    - Transformers [PR#29587](https://github.com/huggingface/transformers/pull/29587)
    - TRL [PR#1416](https://github.com/huggingface/trl/pull/1416)
    - PEFT [PR#1550](https://github.com/huggingface/peft/pull/1550)


[^1]: This was enabled by [this work](https://www.answer.ai/posts/2024-03-06-fsdp-qlora.html) from the Answer.AI team.


================================================
FILE: docs/getting-started.qmd
================================================
---
title: "Quickstart"
format:
  html:
    toc: true
    toc-depth: 3
    number-sections: true
execute:
  enabled: false
---

This guide will walk you through your first model fine-tuning project with Axolotl.

## Quick Example {#sec-quick-example}

Let's start by fine-tuning a small language model using LoRA. This example uses a 1B parameter model to ensure it runs on most GPUs.
Assuming `axolotl` is installed (if not, see our [Installation Guide](installation.qmd))

1. Download example configs:
```bash
axolotl fetch examples
```

2. Run the training:
```bash
axolotl train examples/llama-3/lora-1b.yml
```

That's it! Let's understand what just happened.

## Understanding the Process {#sec-understanding}

### The Configuration File {#sec-config}

The YAML configuration file controls everything about your training. Here's what (part of) our example config looks like:

```yaml
base_model: NousResearch/Llama-3.2-1B

load_in_8bit: true
adapter: lora

datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out
```

::: {.callout-tip}
`load_in_8bit: true` and `adapter: lora` enables LoRA adapter finetuning.

- To perform Full finetuning, remove these two lines.
- To perform QLoRA finetuning, replace with `load_in_4bit: true` and `adapter: qlora`.
:::

See our [config options](config-reference.qmd) for more details.

### Training {#sec-training}

When you run `axolotl train`, Axolotl:

1. Downloads the base model
2. (If specified) applies QLoRA/LoRA adapter layers
3. Loads and processes the dataset
4. Runs the training loop
5. Saves the trained model and / or LoRA weights

## Your First Custom Training {#sec-custom}

Let's modify the example for your own data:

1. Create a new config file `my_training.yml`:

```yaml
base_model: NousResearch/Nous-Hermes-llama-1b-v1

load_in_8bit: true
adapter: lora

# Training settings
micro_batch_size: 2
num_epochs: 3
learning_rate: 0.0003

# Your dataset
datasets:
  - path: my_data.jsonl        # Your local data file
    type: alpaca               # Or other format
```

This specific config is for LoRA fine-tuning a model with instruction tuning data using
the `alpaca` dataset format, which has the following format:

```json
{
    "instruction": "Write a description of alpacas.",
    "input": "",
    "output": "Alpacas are domesticated South American camelids..."
}
```

Please see our [Dataset Formats](dataset-formats) for more dataset formats and how to
format them.

2. Prepare your JSONL data in the specified format (in this case, the expected `alpaca`
format):

```json
{"instruction": "Classify this text", "input": "I love this!", "output": "positive"}
{"instruction": "Classify this text", "input": "Not good at all", "output": "negative"}
```

3. Run the training:

```bash
axolotl train my_training.yml
```

## Common Tasks {#sec-common-tasks}

::: {.callout-tip}

The same yaml file is used for training, inference, and merging.

:::

### Testing Your Model {#sec-testing}

After training, test your model:

```bash
axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out"
```

More details can be found in [Inference](inference.qmd).

### Using a UI {#sec-ui}

Launch a Gradio interface:

```bash
axolotl inference my_training.yml --lora-model-dir="./outputs/lora-out" --gradio
```

### Preprocessing Data {#sec-preprocessing}

For large datasets, preprocess first:

```bash
axolotl preprocess my_training.yml
```

Please make sure to set `dataset_prepared_path: ` in your config to set the path to save the prepared dataset.

More details can be found in [Dataset Preprocessing](dataset_preprocessing.qmd).

### Merging LoRA weights {#sec-merging-lora}

To merge the LoRA weights back into the base model, run:

```bash
axolotl merge-lora my_training.yml --lora-model-dir="./outputs/lora-out"
```

The merged model will be saved in the `{output_dir}/merged` directory.

More details can be found in [Merging LoRA weights](inference.qmd#sec-merging).

## Next Steps {#sec-next-steps}

Now that you have the basics, you might want to:

- Try different model architectures
- Experiment with hyperparameters
- Use more advanced training methods
- Scale up to larger models

Check our other guides for details on these topics:

- [Configuration Guide](config-reference.qmd) - Full configuration options
- [Dataset Loading](dataset_loading.qmd) - Loading datasets from various sources
- [Dataset Formats](dataset-formats) - Working with different data formats
- [Multi-GPU Training](multi-gpu.qmd)
- [Multi-Node Training](multi-node.qmd)


================================================
FILE: docs/gradient_checkpointing.qmd
================================================
---
title: Gradient Checkpointing and Activation Offloading
---

Gradient checkpointing and activation offloading are techniques used to optimize the performance of deep learning
models by reducing the memory footprint and improving computational efficiency.

### Enabling Gradient Checkpointing

```yaml
gradient_checkpointing: true
```

### Enabling Activation Offloading

```yaml
gradient_checkpointing: true  # required for activation offloading
activation_offloading: true
```

Activation offloading variants:

The default `activation_offloading: true` offloads activations to CPU and uses CUDA streams
to overlap the communications and computations when offloading.

The `activation_offloading: legacy` naively offloads activations to CPU and without additional optimizations.

For resource constrained environments with limited CPU memory, `activation_offloading: disk` offloads
activations to disk instead of CPU RAM so that much larger context lengths can be trained with minimal memory.


================================================
FILE: docs/inference.qmd
================================================
---
title: "Inference and Merging"
format:
  html:
    toc: true
    toc-depth: 3
    number-sections: true
execute:
  enabled: false
---

This guide covers how to use your trained models for inference, including model loading, interactive testing, merging adapters, and common troubleshooting steps.

## Quick Start {#sec-quickstart}

::: {.callout-tip}
Use the same config used for training on inference/merging.
:::

### Basic Inference {#sec-basic}

::: {.panel-tabset}

## LoRA Models

```{.bash}
axolotl inference your_config.yml --lora-model-dir="./lora-output-dir"
```

## Full Fine-tuned Models

```{.bash}
axolotl inference your_config.yml --base-model="./completed-model"
```

:::

## Advanced Usage {#sec-advanced}

### Gradio Interface {#sec-gradio}

Launch an interactive web interface:

```{.bash}
axolotl inference your_config.yml --gradio
```

### File-based Prompts {#sec-file-prompts}

Process prompts from a text file:

```{.bash}
cat /tmp/prompt.txt | axolotl inference your_config.yml \
  --base-model="./completed-model" --prompter=None
```

### Memory Optimization {#sec-memory}

For large models or limited memory:

```{.bash}
axolotl inference your_config.yml --load-in-8bit=True
```

## Merging LoRA Weights {#sec-merging}

Merge LoRA adapters with the base model:

```{.bash}
axolotl merge-lora your_config.yml --lora-model-dir="./completed-model"
```

### Memory Management for Merging {#sec-memory-management}

::: {.panel-tabset}

## Configuration Options

```{.yaml}
gpu_memory_limit: 20GiB  # Adjust based on your GPU
lora_on_cpu: true        # Process on CPU if needed
```

## Force CPU Merging

```{.bash}
CUDA_VISIBLE_DEVICES="" axolotl merge-lora ...
```

:::

## Tokenization {#sec-tokenization}

### Common Issues {#sec-tokenization-issues}

::: {.callout-warning}
Tokenization mismatches between training and inference are a common source of problems.
:::

To debug:

1. Check training tokenization:
```{.bash}
axolotl preprocess your_config.yml --debug
```

2. Verify inference tokenization by decoding tokens before model input

3. Compare token IDs between training and inference

### Special Tokens {#sec-special-tokens}

Configure special tokens in your YAML:

```{.yaml}
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"
tokens:
  - "<|im_start|>"
  - "<|im_end|>"
```

## Troubleshooting {#sec-troubleshooting}

### Common Problems {#sec-common-problems}

::: {.panel-tabset}

## Memory Issues

- Use 8-bit loading
- Reduce batch sizes
- Try CPU offloading

## Token Issues

- Verify special tokens
- Check tokenizer settings
- Compare training and inference preprocessing

## Performance Issues

- Verify model loading
- Check prompt formatting
- Ensure temperature/sampling settings

:::

For more details, see our [debugging guide](debugging.qmd).


================================================
FILE: docs/input_output.qmd
================================================
---
title: Template-free prompt construction
description: "Template-free prompt construction with the `input_output` format"
---

The documentation moved to [here](dataset-formats/template_free.qmd).


================================================
FILE: docs/installation.qmd
================================================
---
title: "Installation"
format:
  html:
    toc: true
    toc-depth: 3
    number-sections: true
execute:
  enabled: false
---

This guide covers all the ways you can install and set up Axolotl for your environment.

## Requirements {#sec-requirements}

- NVIDIA GPU (Ampere architecture or newer for `bf16` and Flash Attention) or AMD GPU
- Python ≥3.11
- PyTorch ≥2.6.0

## Installation Methods {#sec-installation-methods}

::: {.callout-important}
Please make sure to have Pytorch installed before installing Axolotl in your local environment.

Follow the instructions at: [https://pytorch.org/get-started/locally/](https://pytorch.org/get-started/locally/)
:::

::: {.callout-important}
For Blackwell GPUs, please use Pytorch 2.9.1 and CUDA 12.8.
:::

### PyPI Installation (Recommended) {#sec-pypi}

```{.bash}
pip3 install -U packaging setuptools wheel ninja
pip3 install --no-build-isolation axolotl[flash-attn,deepspeed]
```

We use `--no-build-isolation` in order to detect the installed PyTorch version (if
installed) in order not to clobber it, and so that we set the correct version of
dependencies that are specific to the PyTorch version or other installed
co-dependencies.

### uv Installation {#sec-uv}

uv is a fast, reliable Python package installer and resolver built in Rust. It offers significant performance improvements over pip and provides better dependency resolution, making it an excellent choice for complex environments.

Install uv if not already installed
```{.bash}
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
```

Choose your CUDA version to use with PyTorch; e.g. `cu124`, `cu126`, `cu128`,
then create the venv and activate
```{.bash}
export UV_TORCH_BACKEND=cu126
uv venv --no-project --relocatable
source .venv/bin/activate
```

Install PyTorch
- PyTorch 2.6.0 recommended
```{.bash}
uv pip install packaging setuptools wheel
uv pip install torch==2.6.0
uv pip install awscli pydantic
```

Install axolotl from PyPi
```{.bash}
uv pip install --no-build-isolation axolotl[deepspeed,flash-attn]

# optionally install with vLLM if you're using torch==2.6.0 and want to train w/ GRPO
uv pip install --no-build-isolation axolotl[deepspeed,flash-attn,vllm]
```

### Edge/Development Build {#sec-edge-build}

For the latest features between releases:

```{.bash}
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl
pip3 install -U packaging setuptools wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
```

### Docker {#sec-docker}

```{.bash}
docker run --gpus '"all"' --rm -it axolotlai/axolotl:main-latest
```

For development with Docker:

```{.bash}
docker compose up -d
```

::: {.callout-tip}
### Advanced Docker Configuration
```{.bash}
docker run --privileged --gpus '"all"' --shm-size 10g --rm -it \
  --name axolotl --ipc=host \
  --ulimit memlock=-1 --ulimit stack=67108864 \
  --mount type=bind,src="${PWD}",target=/workspace/axolotl \
  -v ${HOME}/.cache/huggingface:/root/.cache/huggingface \
  axolotlai/axolotl:main-latest
```
:::

::: {.callout-important}
For Blackwell GPUs, please use `axolotlai/axolotl:main-py3.11-cu128-2.9.1` or the cloud variant `axolotlai/axolotl-cloud:main-py3.11-cu128-2.9.1`.
:::

Please refer to the [Docker documentation](docker.qmd) for more information on the different Docker images that are available.

## Cloud Environments {#sec-cloud}

### Cloud GPU Providers {#sec-cloud-gpu}

For providers supporting Docker:

- Use `axolotlai/axolotl-cloud:main-latest`
- Available on:
    - [RunPod](https://runpod.io/gsc?template=v2ickqhz9s&ref=6i7fkpdz)
    - [Vast.ai](https://cloud.vast.ai?ref_id=62897&template_id=bdd4a49fa8bce926defc99471864cace&utm_source=axolotl&utm_medium=partner&utm_campaign=template_launch_july2025&utm_content=docs_link)
    - [PRIME Intellect](https://app.primeintellect.ai/dashboard/create-cluster?image=axolotl&location=Cheapest&security=Cheapest&show_spot=true)
    - [Modal](https://www.modal.com?utm_source=github&utm_medium=github&utm_campaign=axolotl)
    - [Novita](https://novita.ai/gpus-console?templateId=311)
    - [JarvisLabs.ai](https://jarvislabs.ai/templates/axolotl)
    - [Latitude.sh](https://latitude.sh/blueprint/989e0e79-3bf6-41ea-a46b-1f246e309d5c)

### Google Colab {#sec-colab}

[![](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/axolotl-ai-cloud/axolotl/blob/main/examples/colab-notebooks/colab-axolotl-example.ipynb#scrollTo=msOCO4NRmRLa)

## Platform-Specific Instructions {#sec-platform-specific}

### macOS {#sec-macos}

```{.bash}
pip3 install --no-build-isolation -e '.'
```

See @sec-troubleshooting for Mac-specific issues.

### Windows {#sec-windows}

::: {.callout-important}
We recommend using WSL2 (Windows Subsystem for Linux) or Docker.
:::

## Environment Managers {#sec-env-managers}

### Conda/Pip venv {#sec-conda}

1. Install Python ≥3.11
2. Install PyTorch: https://pytorch.org/get-started/locally/
3. Install Axolotl:
   ```{.bash}
   pip3 install -U packaging setuptools wheel ninja
   pip3 install --no-build-isolation -e '.[flash-attn,deepspeed]'
   ```
4. (Optional) Login to Hugging Face:
   ```{.bash}
   hf auth login
   ```

## Troubleshooting {#sec-troubleshooting}

If you encounter installation issues, see our [FAQ](faq.qmd) and [Debugging Guide](debugging.qmd).


================================================
FILE: docs/lora_optims.qmd
================================================
---
title: "LoRA Optimizations"
description: "Custom autograd functions and Triton kernels in Axolotl for optimized LoRA fine-tuning"
---

Inspired by [Unsloth](https://github.com/unslothai/unsloth), we've implemented two
optimizations for LoRA and QLoRA fine-tuning, supporting both single GPU and multi-GPU
(including the DDP, DeepSpeed, and FSDP2 settings) training. These include (1) SwiGLU
and GEGLU activation function Triton kernels, and (2) LoRA MLP and attention custom
autograd functions. Our goal was to leverage operator fusion and tensor re-use in order
to improve speed and reduce memory usage during the forward and backward passes of
these calculations.

We currently support several common model architectures, including (but not limited to):

- `llama`
- `mistral`
- `qwen2`
- `gemma`
- `gemma2`
- `gemma3`

<details>

The set of models we support is currently limited by our attention patching strategy,
which assumes (and replaces) specific code blocks for query / key / value and output
projections:

```python
ORIGINAL_QKV_CODE = """
    query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
    key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
    value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
""".lstrip(
    "\n"
)

ORIGINAL_O_CODE = """
    attn_output = self.o_proj(attn_output)
""".lstrip(
    "\n"
)
```

Is replaced with:

```python
PATCHED_QKV_CODE = """
    query_states, key_states, value_states = self.apply_qkv(hidden_states)
    query_states = query_states.view(hidden_shape).transpose(1, 2)
    key_states = key_states.view(hidden_shape).transpose(1, 2)
    value_states = value_states.view(hidden_shape).transpose(1, 2)
""".lstrip(
    "\n"
)

PATCHED_O_CODE = """
    attn_output = self.apply_o(attn_output)
""".lstrip(
    "\n"
)
```

Where `apply_qkv` and `apply_o` are defined in the `axolotl.kernels.lora` module.

We welcome testing of other model architectures and / or PRs to expand our patching
logic to be compatible with more of them.

</details>

::: {.callout-tip}
Check out our [LoRA optimizations blog](https://axolotlai.substack.com/p/accelerating-lora-fine-tuning-with).
:::

## Usage

These optimizations can be enabled in your Axolotl config YAML file. The
`lora_mlp_kernel` option enables the optimized MLP path, while `lora_qkv_kernel` and
`lora_o_kernel` enable the fused query-key-value projection and optimized output
projection, respectively.

```yaml
lora_mlp_kernel: true
lora_qkv_kernel: true
lora_o_kernel: true
```

::: {.callout-note}
Currently, LoRA kernels are not supported for RLHF training, only SFT.
:::

::: {.callout-warning}
LoRA kernels do not support remote modeling code.
:::

## Requirements

- One or more NVIDIA or AMD GPUs (in order to use the Triton kernels)
    - Note: Set `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1` to enable [memory-efficient attention on AMD GPUs](https://github.com/ROCm/aotriton/issues/16#issuecomment-2346675491)
- Targeted LoRA adapters cannot use Dropout
    - This may limit model expressivity / cause overfitting
- Targeted LoRA adapters cannot have bias terms
    - This may limit model expressivity

Models with pre-existing LoRA adapters that use Dropout or have bias terms may need to
be re-finetuned without these features in order to be useful.

## Implementation details

### Custom autograd functions

The LoRA MLP autograd function optimizes the entire MLP computation path. It fuses the
LoRA and base weight computations together and provides a single, efficient backward
pass for the entire MLP block.

For attention components, similar optimizations are provided through a function that
handles the query, key, and value projections, and a function that handles the output
projection. They are designed to work with the existing `transformers` attention
implementation via some monkey-patching logic.

### Triton kernels

Two activation functions (SwiGLU and GeGLU) are implemented with Triton kernels for
improved speed and memory performance. These kernels handle both the forward and
backward passes.

### Integration

The custom autograd functions and Triton kernels are designed to work together. The
autograd function manages the high-level computation flow and gradient tracking, while
calling the Triton kernels for the activation function computation. During the backward
pass, the kernel computes both the activation output and the required gradients, which
the autograd function then uses to compute the final gradients for the entire
computation path.

## Future Work

- Support for additional model architectures
- Support for dropout and bias
- Additional operator fusions


================================================
FILE: docs/lr_groups.qmd
================================================
---
title: Learning Rate Groups
description: "Setting different learning rates by module name"
---

## Background

Inspired by LoRA+, Axolotl allows practitioners to specify separate learning rates for each module or groups of
modules in a model.

## Example

```yaml
lr_groups:
  - name: o_proj
    modules:
      - self_attn.o_proj.weight
    lr: 1e-6
  - name: q_proj
    modules:
      - model.layers.2.self_attn.q_proj.weight
    lr: 1e-5

learning_rate: 2e-5
```

In this example, we have a default learning rate of 2e-5 across the entire model, but we have a separate learning rate
of 1e-6 for all the self attention `o_proj` modules across all layers, and a learning are of 1e-5 to the 3rd layer's
self attention `q_proj` module.

::: {.callout-note}

We currently only support varying `lr` for now. If you're interested in adding support for others (`weight_decay`), we welcome PRs. See https://github.com/axolotl-ai-cloud/axolotl/blob/613bcf90e58f3ab81d3827e7fc572319908db9fb/src/axolotl/core/trainers/mixins/optimizer.py#L17

:::


================================================
FILE: docs/mac.qmd
================================================
---
title: Mac M-series
description: Mac M-series support
---

Currently Axolotl on Mac is partially usable, many of the dependencies of Axolotl including Pytorch do not support MPS or have incomplete support.

Current support:

- [x] Support for all models
- [x] Full training of models
- [x] LoRA training
- [x] Sample packing
- [ ] FP16 and BF16 (awaiting AMP support for MPS in Pytorch)
- [ ] Tri-dao's flash-attn (until it is supported use spd_attention as an alternative)
- [ ] xformers
- [ ] bitsandbytes (meaning no 4/8 bits loading and bnb optimizers)
- [ ] qlora
- [ ] DeepSpeed

Untested:

- FSDP


================================================
FILE: docs/mixed_precision.qmd
================================================
---
title: "Mixed Precision Training"
format:
  html:
    toc: true
    toc-depth: 3
    number-sections: true
    code-tools: true
execute:
  enabled: false
---

Mixed precision training uses lower precision data types to reduce memory usage and increase training speed while maintaining model quality. Axolotl supports several mixed precision formats:

- **FP16** - Half precision 16-bit (Pascal generation+)
- **BF16** - Brain Float 16-bit (Ampere generation+)
- **FP8** - 8-bit floating point (Hopper generation+)

## FP16 Mixed Precision {#sec-fp16}

### Overview {#sec-fp16-overview}

FP16 is the traditional half-precision format, supported on older GPUs but can be less numerically stable than BF16.

### Configuration {#sec-fp16-config}

```{.yaml}
fp16: true
```

### FP16 Considerations {#sec-fp16-considerations}

- May require gradient scaling to prevent underflow
- Less numerically stable than BF16
- Can cause training instability with some model architectures
- Consider using BF16 if your hardware supports it

## BF16 Mixed Precision {#sec-bf16}

### Overview {#sec-bf16-overview}

BF16 (Brain Float 16) offers better numerical stability than FP16 and is the recommended mixed precision format for modern GPUs. It provides the same dynamic range as FP32 while using half the memory.

### Configuration {#sec-bf16-config}

```{.yaml}
# Automatic BF16 detection (recommended)
bf16: auto

# Or explicitly enable
bf16: true

# For evaluation with BF16
bf16: full  # Equivalent to bf16_full_eval in the HF trainer
```

## FP8 Mixed Precision {#sec-fp8}

::: {.callout-note}
FP8 support is experimental and requires compatible hardware (H100, H200) and recent PyTorch versions with TorchAO.
:::

### What is FP8? {#sec-fp8-overview}

FP8 (8-bit floating point) can provide significant time savings compared to FP16/BF16 while maintaining training stability. Axolotl's implementation uses PyTorch's TorchAO library with "tensorwise" scaling strategy.

### Requirements {#sec-fp8-software}

- Hopper+ GPUs (H100/H200)
- PyTorch 2.7+ (+ compatible TorchAO version)
- CUDA 12.4+

### Configuration {#sec-fp8-config}

Add to your YAML config:

```{.yaml}
# Enable FP8 mixed precision
fp8: true

# Optional: Enable FP8 for FSDP all-gather operations
fp8_enable_fsdp_float8_all_gather: true

# Enable torch.compile (almost always necessary for FP8 speedups)
torch_compile: true
```

::: {.callout-important}
**torch.compile is critical for FP8 performance**

FP8 training requires `torch_compile: true` to see meaningful speedups. Without compilation, FP8 may actually be slower and use more memory than FP16/BF16.
:::

### Advanced FP8 Configs {#sec-fp8-advanced}

For [FSDP](multi-gpu.qmd#sec-fsdp) (Fully Sharded Data Parallel) training:

```{.yaml}
fp8: true
fp8_enable_fsdp_float8_all_gather: true

torch_compile: true

# FSDP configuration
fsdp_version: 2
fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: LlamaDecoderLayer
  state_dict_type: FULL_STATE_DICT
  reshard_after_forward: true
```

## Best Practices {#sec-best-practices}

### Choosing Precision Format {#sec-choosing-format}

- **Start with automatic detection**: `bf16: auto`
- **For Hopper+ (H100/H200)**: Try FP8 + torch.compile for maximum speed
- **For Ampere (A100/RTX 30/40)**: Use BF16
- **For older Pascal/Turing GPUs**: Use FP16 with caution
- **For very old or unsupported GPUs**: Use FP32

### Validation and Testing {#sec-validation}

Always validate your mixed precision setup:

- **Start with a small dataset** to verify stability
- **Monitor loss curves** for irregularities
- **Compare with FP32 baseline** when possible
- **Test evaluation metrics** match expectations

### FP8 Particulars {#sec-fp8-details}

- Use cases
  - Single GPU training
  - Multi GPU training with FSDP2 or Deepspeed
- Speedups
  - Please refer to the [TorchAO FP8 training benchmarks](https://github.com/pytorch/ao/tree/main/torchao/float8#rowwise-scaling) for expected matmul speedups for different (M, K, N) settings
  - Concrete number for LLaMA 3 8B training can be found [here](https://github.com/pytorch/ao/tree/main/torchao/float8#training-benchmarks)
- Known issues:
  - FP8 + DDP + `torch.compile` (causes [error](https://gist.github.com/djsaunde/0c1664c32e44a64d31b5e01b4aafe5c4))
  - FP8 + FSDP2 + `torch.compile` + FSDP2 activation checkpointing tends to be _slower_ than the BF16 equivalent training
  - Flash Attention 2 does not play nicely with `torch.compile`

See `examples/llama-3/3b-fp8-fsdp2.yaml` for an optimized example config. Enabling FP8 mixed precision + FP8 all-gather training results in ~10% faster iterations per second vs. BF16 for a relatively small (3B param) model

For more information on multi-GPU training, see our [Multi-GPU guide](multi-gpu.qmd).


================================================
FILE: docs/multi-gpu.qmd
================================================
---
title: "Multi-GPU"
format:
  html:
    toc: true
    toc-depth: 3
    # number-sections: true
    code-tools: true
execute:
  enabled: false
---

This guide covers advanced training configurations for multi-GPU setups using Axolotl.

## Overview {#sec-overview}

When training on multiple GPUs, Axolotl supports 3 sharding/parallelism strategies. Additionally, you can layer specific optimization features on top of that strategy.

You generally cannot combine these strategies; they are mutually exclusive.

1.  **DeepSpeed**: Powerful optimization library, supports ZeRO stages 1-3.
2.  **FSDP (Fully Sharded Data Parallel)**: PyTorch's native sharding implementation (Recommended).
3.  **DDP (Distributed Data Parallel)**: PyTorch's native parallelism implementation (Default if neither of the above are selected).

These features can often be combined with the strategies above:

*   **Sequence Parallelism**: Splits long sequences across GPUs (Compatible with DDP, DeepSpeed, and FSDP).
*   **FSDP + QLoRA**: Combines 4-bit quantization with FSDP (Specific to FSDP).

## DeepSpeed {#sec-deepspeed}

### Configuration {#sec-deepspeed-config}

Add to your YAML config:

```{.yaml}
deepspeed: deepspeed_configs/zero1.json
```
### Usage {#sec-deepspeed-usage}

```{.bash}
# Fetch deepspeed configs (if not already present)
axolotl fetch deepspeed_configs

# Passing arg via config
axolotl train config.yml

# Passing arg via cli
axolotl train config.yml --deepspeed deepspeed_configs/zero1.json
```

### ZeRO Stages {#sec-zero-stages}

We provide default configurations for:

- ZeRO Stage 1 (`zero1.json`)
- ZeRO Stage 1 with torch compile (`zero1_torch_compile.json`)
- ZeRO Stage 2 (`zero2.json`)
- ZeRO Stage 3 (`zero3.json`)
- ZeRO Stage 3 with bf16 (`zero3_bf16.json`)
- ZeRO Stage 3 with bf16 and CPU offload params(`zero3_bf16_cpuoffload_params.json`)
- ZeRO Stage 3 with bf16 and CPU offload params and optimizer (`zero3_bf16_cpuoffload_all.json`)

::: {.callout-tip}

Choose the configuration that offloads the least amount to memory while still being able to fit on VRAM for best performance.

Start from Stage 1 -> Stage 2 -> Stage 3.

:::

## Fully Sharded Data Parallel (FSDP) {#sec-fsdp}

FSDP allows you to shard model parameters, gradients, and optimizer states across data parallel workers.

::: {.callout-note}

FSDP2 is recommended for new users. FSDP1 is deprecated and will be removed in an upcoming release of Axolotl.

:::

### FSDP + QLoRA {#sec-fsdp-qlora}

For combining FSDP with QLoRA, see our [dedicated guide](fsdp_qlora.qmd).

### Migrating from FSDP1 to FSDP2 {#sec-migrate-fsdp1-fsdp2}

To migrate your config from FSDP1 to FSDP2, you must use the `fsdp_version` top-level config field to specify the FSDP version, and
also follow the config field mapping below to update field names.

#### Config mapping

FSDP1 | FSDP2
-------- | --------
fsdp_sharding_strategy | reshard_after_forward
fsdp_backward_prefetch_policy | **REMOVED**
fsdp_backward_prefetch | **REMOVED**
fsdp_forward_prefetch | **REMOVED**
fsdp_sync_module_states | **REMOVED**
fsdp_cpu_ram_efficient_loading | cpu_ram_efficient_loading
fsdp_state_dict_type | state_dict_type
fsdp_use_orig_params | **REMOVED**
fsdp_activation_checkpointing | activation_checkpointing

For more details, please see the migration guide in the [torchtitan repo](https://github.com/pytorch/torchtitan/blob/main/docs/fsdp.md). In Axolotl,
if you were using the following FSDP1 config:

```{.yaml}
fsdp_version: 1
fsdp_config:
  fsdp_offload_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
```

You can migrate to the following FSDP2 config:

```{.yaml}
fsdp_version: 2
fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen3DecoderLayer
  state_dict_type: FULL_STATE_DICT
  reshard_after_forward: true
```

### FSDP1 (deprecated) {#sec-fsdp-config}

::: {.callout-note}

Using `fsdp` to configure FSDP is deprecated and will be removed in an upcoming release of Axolotl. Please use `fsdp_config` as above instead.

:::

```{.yaml}
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_offload_params: true
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
```


## Sequence parallelism {#sec-sequence-parallelism}

We support sequence parallelism (SP) via the
[ring-flash-attention](https://github.com/zhuzilin/ring-flash-attention) project. This
allows one to split up sequences across GPUs, which is useful in the event that a
single sequence causes OOM errors during model training.

See our [dedicated guide](sequence_parallelism.qmd) for more information.

## Performance Optimization {#sec-performance}

### Liger Kernel Integration {#sec-liger}

Please see [docs](custom_integrations.qmd#liger) for more info.

## Troubleshooting {#sec-troubleshooting}

### NCCL Issues {#sec-nccl}

For NCCL-related problems, see our [NCCL troubleshooting guide](nccl.qmd).

### Common Problems {#sec-common-problems}

::: {.panel-tabset}

## Memory Issues

- Reduce `micro_batch_size`
- Reduce `eval_batch_size`
- Adjust `gradient_accumulation_steps`
- Consider using a higher ZeRO stage

## Training Instability

- Start with DeepSpeed ZeRO-2
- Monitor loss values
- Check learning rates

:::

For more detailed troubleshooting, see our [debugging guide](debugging.qmd).


================================================
FILE: docs/multi-node.qmd
================================================
---
title: Multi Node
description: How to use Axolotl on multiple machines
---

The below are three ways to train multi-node in Axolotl.

::: {.callout-important}
Each machine needs a copy of Axolotl, we suggest using the same commit to ensure compatibility.

You will also need to have the same configuration file for your model on each machine.

Make sure the main machine is reachable by other machines.
:::

## Accelerate

You will need to create a configuration for accelerate, either by using `accelerate config` and follow the instructions or you can use one of the preset below:

~/.cache/huggingface/accelerate/default_config.yaml
```yaml
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: FSDP
downcast_bf16: 'no'
machine_rank: 0 # Set to 0 for the main machine, increment by one for other machines
main_process_ip: 10.0.0.4 # Set to main machine's IP
main_process_port: 5000
main_training_function: main
mixed_precision: bf16
num_machines: 2 # Change to the number of machines
num_processes: 4 # That's the total number of GPUs, (for example: if you have 2 machines with 4 GPU, put 8)
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
```

Configure your model to use FSDP in the Axolotl yaml. For example:
```yaml
fsdp_version: 2
fsdp_config:
  offload_params: true
  state_dict_type: FULL_STATE_DICT
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: LlamaDecoderLayer
  reshard_after_forward: true
```

All you have to do now is launch using accelerate as you would usually do on each machine and voila, the processes will start once you have launched accelerate on every machine.

## Raytrain

Please see ray train doc [here](ray-integration.qmd).

## Torchrun

If you are using Infiniband, we recommend torchrun to utilize the full bandwidth.

Set the following env (change buffersize/socketname depending on your system):

```bash
export NCCL_IB_DISABLE=0
export NCCL_SOCKET_IFNAME="eth0,en,eth,em,bond"
export NCCL_BUFFSIZE=2097152
```

Run the following on each node:

### Option 1: New Axolotl CLI with launcher args (Recommended)

```bash
axolotl train config.yaml --launcher torchrun -- --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:$head_node_port"
```

### Option 2: Direct torchrun (Legacy)

```bash
torchrun --nnodes $num_nodes --nproc_per_node $gpu_per_node --rdzv_id $rdzv_id --rdzv_backend c10d --rdzv_endpoint "$head_node_ip:$head_node_port" -m axolotl.cli.train config.yaml
```

Please make sure to substitute the placeholder variables:

- `num_nodes`: Number of nodes (containing GPUs)
- `gpu_per_node`: Number of gpus per node
- `head_node_ip`: IP of the head node (make sure other machines can connect to this)
- `head_node_port`: Port of the head node (make sure other machines can connect to this. Default 29400)
- `rdzv_id`: A unique job ID that is used by the job across nodes.

The new CLI approach (Option 1) is recommended as it provides consistent argument handling and works seamlessly with other Axolotl CLI features.

More info on the available configs can be found on the Pytorch docs [here](https://pytorch.org/docs/stable/elastic/run.html)


================================================
FILE: docs/multimodal.qmd
================================================
---
title: MultiModal / Vision Language Models (BETA)
format:
  html:
    toc: true
    toc-depth: 3
---

## Supported Models

- [Mllama](#sec-mllama)
- [Llama4](#sec-llama4)
- [Pixtral](#sec-pixtral)
- [Llava-1.5](#sec-llava-15)
- [Mistral-Small-3.1](#sec-mistral-small-31)
- [Mistral-Small-4](#sec-mistral-small-4)
- [Magistral-Small-2509](#sec-magistral-small-2509)
- [Voxtral](#sec-voxtral)
- [Gemma-3](#sec-gemma-3)
- [Gemma-3n](#sec-gemma-3n)
- [Qwen2-VL](#sec-qwen2-vl)
- [Qwen2.5-VL](#sec-qwen25-vl)
- [Qwen3.5](#sec-qwen3-5)
- [GLM-4.6V](#sec-glm-4-6v)
- [SmolVLM2](#sec-smolvlm2)
- [LFM2-VL](#sec-lfm2-vl)
- [Intern-VL](#sec-intern-vl)

## Usage

Multimodal support is limited and doesn't have full feature parity.

Here are the hyperparams you'll need to use to finetune a multimodal model.

```yaml
processor_type: AutoProcessor

skip_prepare_dataset: true
remove_unused_columns: false  # leave columns in place as they are needed to handle image embeddings during training
sample_packing: false  # not yet supported with multimodal

chat_template:  # see in next section if specified

# example dataset
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]

# (optional) if doing lora, only finetune the Language model,
# leave the vision model and vision tower frozen
# load_in_8bit: true
adapter: lora
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

# (optional) if you want to resize images to a set size
image_size: 512
image_resize_algorithm: bilinear
```

Please see [examples](https://github.com/axolotl-ai/axolotl/tree/main/examples) folder for full configs.

::: {.callout-tip}
Some of our chat_templates have been extended to support broader dataset types. This should not break any existing configs.
:::

::: {.callout-note}
As of now, we do not truncate nor drop samples based on `sequence_len` as each arch has different ways to process non-text tokens. We are looking for help on this.
:::

### Mllama {#sec-mllama}

```yaml
base_model: meta-llama/Llama-3.2-11B-Vision-Instruct

chat_template: llama3_2_vision
```

### Llama4 {#sec-llama4}

```yaml
base_model: meta-llama/Llama-4-Scout-17B-16E-Instruct

chat_template: llama4
```

### Pixtral {#sec-pixtral}

```yaml
base_model: mistralai/Pixtral-12B-2409

chat_template: pixtral
```

### Llava-1.5 {#sec-llava-15}

```yaml
base_model: llava-hf/llava-1.5-7b-hf

chat_template: llava
```

### Mistral-Small-3.1 {#sec-mistral-small-31}

::: {.callout-tip}
Please make sure to install vision lib via `pip install 'mistral-common[opencv]==1.8.5'`
:::

```yaml
base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
```

### Mistral-Small-4 {#sec-mistral-small-4}

```yaml
base_model: mistralai/Mistral-Small-4-119B-2603
```

### Magistral-Small-2509 {#sec-magistral-small-2509}

::: {.callout-tip}
Please make sure to install vision lib via `pip install 'mistral-common[opencv]==1.8.5'`
:::

```yaml
base_model: mistralai/Magistral-Small-2509
```

### Voxtral {#sec-voxtral}

::: {.callout-tip}
Please make sure to install audio lib via `pip3 install librosa==0.11.0 'mistral_common[audio]==1.8.3'`
:::

```yaml
base_model: mistralai/Voxtral-Mini-3B-2507

processor_type: VoxtralProcessor
```

### Gemma-3 {#sec-gemma-3}

::: {.callout-tip}
The Gemma3-1B model is a text-only model, so please train as regular text model.
:::

For multi-modal 4B/12B/27B models, use the following config:

```yaml
base_model: google/gemma-3-4b-it

chat_template: gemma3
```

### Gemma-3n {#sec-gemma-3n}

::: {.callout-warning}
The model's initial loss and grad norm will be very high. We suspect this to be due to the Conv in the vision layers.
:::

::: {.callout-tip}
Please make sure to install `timm` via `pip3 install timm==1.0.17`
:::

```yaml
base_model: google/gemma-3n-E2B-it

chat_template: gemma3n
```

### Qwen2-VL {#sec-qwen2-vl}

```yaml
base_model: Qwen/Qwen2-VL-7B-Instruct

chat_template: qwen2_vl
```

### Qwen2.5-VL {#sec-qwen25-vl}

```yaml
base_model: Qwen/Qwen2.5-VL-7B-Instruct

chat_template: qwen2_vl  # same as qwen2-vl
```

### Qwen3-VL {#sec-qwen3-vl}

```yaml
base_model: Qwen/Qwen3-VL-4B-Instruct

chat_template: qwen2_vl  # same as qwen2-vl
```

### Qwen3.5 {#sec-qwen3-5}

```yaml
base_model: Qwen/Qwen3.5-9B

chat_template: qwen3_5
```

### GLM-4.6V {#sec-glm-4-6v}

Both GLM-4.6V (106B MoE) and GLM-4.6V-Flash (9B) are supported.

```yaml
# GLM-4.6V (106B MoE version)
base_model: zai-org/GLM-4.6V

# OR GLM-4.6V-Flash (9B version)
base_model: zai-org/GLM-4.6V-Flash
```

### SmolVLM2 {#sec-smolvlm2}

::: {.callout-tip}
Please make sure to install `num2words` via `pip3 install num2words==0.5.14`
:::

```yaml
base_model: HuggingFaceTB/SmolVLM2-500M-Video-Instruct
```

### LFM2-VL {#sec-lfm2-vl}

::: {.callout-warning}
Please uninstall `causal-conv1d` via `pip3 uninstall -y causal-conv1d`
:::

```yaml
base_model: LiquidAI/LFM2-VL-450M
```

### Intern-VL {#sec-intern-vl}

::: {.callout-tip}
Please make sure to install `timm` via `pip3 install timm==1.0.19`
:::

```yaml
base_model: OpenGVLab/InternVL3_5-8B
```

## Dataset Format

For multi-modal datasets, we adopt an extended `chat_template` format similar to OpenAI's Message format.

- A message is a list of `role` and `content`.
- `role` can be `system`, `user`, `assistant`, etc.
- `content` is a list of `type` and (`text`, `image`, `path`, `url`, `base64`, or `audio`).

### Image

::: {.callout-note}
For backwards compatibility:

- If the dataset has a `images` or `image` column of `list[Image]`, it will be appended to the first `content` list as `{"type": "image", "image": ...}`. However, if the content already has a `{"type": "image"}` but no `image` key, it will be set the `image` key.
- If `content` is a string, it will be converted to a list with `type` as `text`.
:::

For image loading, you can use the following keys within `content` alongside `"type": "image"`:

- `"path": "/path/to/image.jpg"`
- `"url": "https://example.com/image.jpg"`
- `"base64": "..."`
- `"image": PIL.Image`

### Audio

For audio loading, you can use the following keys within `content` alongside `"type": "audio"`:

- `"path": "/path/to/audio.mp3"`
- `"url": "https://example.com/audio.mp3"`
- `"audio": np.ndarray`

::: {.callout-tip}

You may need to install `librosa` via `pip3 install librosa==0.11.0`.

:::

### Video

::: {.callout-warning}

This is not well tested at the moment. We welcome contributors!

:::

For video loading, you can use the following keys within `content` alongside `"type": "video"`:

- `"path": "/path/to/video.mp4"`
- `"url": "https://example.com/video.mp4"`
- `"video": np.ndarray | list[PIL.Image.Image] | torch.Tensor` (or list of the aforementioned)

### Example

Here is an example of a multi-modal dataset:
```json
[
  {
    "messages": [
        {
            "role": "system",
            "content": [
              {"type": "text", "text": "You are a helpful assistant."}
              ]
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/bee.jpg"},
                {"type": "text", "text": "Describe this image in detail."}
            ]
        },
        {
            "role": "assistant",
            "content": [
              {"type": "text", "text": "The image is a bee."}
            ]
        }
    ]
  }
]
```

## FAQ

1. `PIL.UnidentifiedImageError: cannot identify image file ...`

`PIL` could not retrieve the file at `url` using `requests`. Please check for typo. One alternative reason is that the request is blocked by the server.


================================================
FILE: docs/multipack.qmd
================================================
---
title: Multipack (Sample Packing)
description: Multipack is a technique to pack multiple sequences into a single batch to increase training throughput.
---

## Visualization of Multipack with Flash Attention

Because Flash Attention simply drops the attention mask, we do not need to
construct a 4d attention mask. We only need to concatenate the sequences into
a single batch and let flash attention know where each new sequence begins.


4k context, bsz =4,
each character represents 256 tokens
X represents a padding token

```
   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
[[ A A A A A A A A A A A ]
   B B B B B B ]
   C C C C C C C ]
   D D D D ]]

[[ E E E E E E E E ]
 [ F F F F ]
 [ G G G ]
 [ H H H H ]]

[[ I I I ]
 [ J J J ]
 [ K K K K K]
 [ L L L ]]
```

after padding to longest input in each step
```
   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
[[ A A A A A A A A A A A ]
   B B B B B B X X X X X X ]
   C C C C C C C X X X X ]
   D D D D X X X X X X X ]]

[[ E E E E E E E E ]
 [ F F F F X X X X ]
 [ G G G X X X X X ]
 [ H H H H X X X X ]]

[[ I I I X X ]
 [ J J J X X ]
 [ K K K K K ]
 [ L L L X X ]]
```

w packing ( note it's the same effective number of tokens per step, but a true bsz of 1)
```
   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
[[ A A A A A A A A A A A B B B B B
   B C C C C C C C D D D D E E E E
   E E E E F F F F F G G G H H H H
   I I I J J J J K K K K K L L L X ]]
```

cu_seqlens:
[[ 0, 11, 17, 24, 28, 36, 41 44, 48, 51, 55, 60, 64]]


## Multipack without Flash Attention

Multipack can still be achieved without Flash attention, but with lower packing
efficiency as we are not able to join multiple batches into a single batch due to
context length limits without flash attention. We can use either Pytorch's Scaled
Dot Product Attention implementation or native Pytorch attention implementation
along with [4d attention masks](https://github.com/huggingface/transformers/pull/27539)
to pack sequences together and avoid cross attention.

<img src="./images/4d-mask.png" alt="axolotl" width="800">


================================================
FILE: docs/nccl.qmd
================================================
---
title: NCCL
description: Troubleshooting NCCL issues
---

NVIDIA NCCL is a library to facilitate and optimize multi-GPU communication operations, such as broadcast, all-gather, reduce, all-reduce, etc. Broadly, NCCL configuration is highly environment-specific and is configured via several [environment variables](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/env.html). A common NCCL-related problem occurs when a long-running operation times out causing the training process to abort:

```text
Watchdog caught collective operation timeout: WorkNCCL(SeqNum=42, OpType=ALLGATHER, Timeout(ms)=1800000) ran for 1806948 milliseconds before timing out.
```

Often, this timeout will happen after 30 minutes (the default setting) and is accompanied by below-average power consumption with near 100% GPU utilization before the error is raised. Nvidia recommends [disabling PCI access control services (ACS)](https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/troubleshooting.html#pci-access-control-services-acs) as a possible solution if this is available to you.

Forcing cross-GPU communication via [NVLink](https://en.wikipedia.org/wiki/NVLink) may help without increasing timeouts. To verify that your configuration is leveraging NVLink run the following command:

```bash
nvidia-smi nvlink --status
```

To force NCCL to use NVLink, simply set this in the environment:

```bash
export NCCL_P2P_LEVEL=NVL
```

If NVLink is not available in your environment there are other options for ``NCCL_P2P_LEVEL`` in the table below:

| NCCL_P2P_LEVEL | Description |
| -------------- | ----------- |
| PIX | P2P data transfers through no more than a single PCIe bridge. Faster data transfer rates vs to paths involving multiple bridges, but slower compared to direct GPU-to-GPU communication. |
| PXB | P2P data transfers through multiple PCIe bridges but not going through the PCIe Host Bridge; this path involves a complex routing process, potentially incurring a moderate level of latency. |
| PHB | P2P data transfers occur over the PCIe and through a PCIe Host Bridge, typically involving the CPU, which can facilitate direct memory access but might introduce additional latency compared to more direct paths (ex PIX, NVL) |

To validate that acceptable data transfer speeds exist for your training job, running [NCCL Tests](https://github.com/NVIDIA/nccl-tests/blob/master/README.md) can help pinpoint bottlenecks, for example:

```bash
./build/all_reduce_perf -b 8 -e 128M -f 2 -g 3
```

It can be useful when debugging NCCL communication timeouts to activate additional logging in both PyTorch and NCCL:

```bash
export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=ALL
export TORCH_DISTRIBUTED_DEBUG=INFO
export TORCHELASTIC_ERROR_FILE=/PATH/TO/torcherror.log
```

Finally, if you believe your training job needs more time you can increase the timeout past 30 minutes by setting the ``ddp_timeout`` value in the Axolotl configuration. See [PyTorch init_process_group](https://pytorch.org/docs/stable/distributed.html#torch.distributed.init_process_group) for documentation on this value.


================================================
FILE: docs/nd_parallelism.qmd
================================================
---
title: "N-D Parallelism (Beta)"
---

Axolotl enables training models at scale by composing different parallelism techniques. This is essential when:

- A model's weights are too large to fit on a single GPU's memory.
- A model's activations, especially with very long contexts, are too large for a single GPU.
- You want to accelerate training by using multiple GPUs or nodes.

or combinations of the above!

## Core Concepts

Parallelism strategies can be combined. The key is understanding how each one divides the workload. PyTorch's `DeviceMesh` is the modern way to manage these combinations, creating a logical grid of your GPUs and assigning different parallel strategies to different dimensions of the grid.

### Data Parallelism {#sec-dp}

Data Parallelism focuses on splitting the global data batch across GPUs.

- Distributed Data Parallel (DDP): The classic approach. The full model is replicated on every GPU. Each GPU processes a different slice of the data batch. Gradients are then averaged across all GPUs after the backward pass to keep the models synchronized. This can substantially improve data throughput compared to single-device training, but requires that each GPU is able to hold the entire model, its gradients, and optimizer states.

- [Fully Sharded Data Parallel (FSDP)](multi-gpu.qmd#fully-sharded-data-parallel-(fsdp)): A highly memory-efficient form of data parallelism (inspired by DeepSpeed's ZeRO). Instead of replicating the model, FSDP shards the model's *parameters, gradients, and optimizer states* across the GPUs in the data-parallel group. During computation, each GPU receives the specific parameters it needs via an `all_gather` operation just before they are used, and they can be discarded immediately after (`reshard-after-forward`).
    - FSDP maps to ZeRO stages:
        - ZeRO-2 (`reshard_after_forward=False`): Shards gradients and optimizer states. Model weights are replicated on each GPU.
        - ZeRO-3 (`reshard_after_forward=True`): Shards gradients, optimizer states, AND model parameters. This provides the most memory savings at the cost of more communication (re-gathering parameters for both forward and backward passes).

### [Experimental] Tensor Parallelism (TP) {#sec-tp}

Also known as "horizontal model parallelism," as described in the [Megatron-LM paper](https://arxiv.org/pdf/1909.08053.pdf). Instead of splitting the batch, TP splits the model's layers themselves across GPUs.

- How it works: For a linear layer `Y = XA`, the weight matrix `A` is split column-wise (`A = [A_1, A_2]`). The computation becomes `Y_1 = XA_1` and `Y_2 = XA_2`, which can happen in parallel on different GPUs. The final output `Y` is simply the concatenation of `Y_1` and `Y_2`. Check [this comment](https://github.com/huggingface/transformers/issues/10321#issuecomment-783543530) for more detailed info.
- Requirement: TP involves frequent, small communications within a forward/backward pass. It requires a very fast interconnect between GPUs (e.g., NVLink) and is typically not recommended across different nodes.

### Context Parallelism (CP) {#sec-cp}

Context Parallelism, also called [Sequence Parallelism](sequence_parallelism.qmd), addresses the memory bottleneck from long sequences. The input sequence itself is split along the sequence length dimension and distributed across GPUs.

- How it works: If you have a sequence of 8192 tokens and a `context_parallel_size` of 4, each GPU will only handle a chunk of 2048 tokens.
- The Challenge: Attention is not local; every token needs to "attend to" every other token. Splitting the sequence breaks this.
- The Solution (`ring-flash-attention`): An efficient communication protocol is used. To compute attention for its local sequence chunk, each GPU passes its Key-Value (KV) cache to its neighbor in a "ring." After `N-1` steps, every GPU has seen the KV-cache from all other GPUs, allowing it to compute the correct attention values for its chunk. This is implemented using the highly optimized `flash-attention` kernel at each step.

### Hybrid Sharding Data Parallel (HSDP) {#sec-hsdp}

HSDP is a 2D strategy that intelligently combines FSDP and DDP, typically for multi-node training.

- Intra-Node (within a machine): Use FSDP. This is efficient because GPUs on the same node have fast interconnects (NVLink), making the `all_gather` operations for sharded parameters fast.
- Inter-Node (across machines): Use DDP. The gradient synchronization between nodes is less frequent than FSDP's parameter gathering, making it a better fit for the slower node-to-node network (e.g., Ethernet/Infiniband).
- Example: With 2 nodes of 8 GPUs each (16 total), you could have `dp_shard_size=8` (FSDP within each node) and `dp_replicate_size=2` (DDP across the two nodes).

## Usage

```yaml
# FSDP config. See https://docs.axolotl.ai/docs/multi-gpu.html#sec-fsdp
fsdp_version: 2
fsdp_config:
  # ...

# The number of GPUs to shard the model parameters across (FSDP dimension).
dp_shard_size: 4

# The number of times to replicate the sharded model (DDP dimension).
dp_replicate_size: 2

# Number of GPUs for Tensor Parallelism.
tensor_parallel_size: 1  # (default is 1, no TP)

# Number of GPUs for Context/Sequence Parallelism.
context_parallel_size: 1 # (default is 1, no CP)
```

Note: We recommend FSDP. DeepSpeed is only compatible with `tensor_parallel_size`.

## Examples

::: {.callout-tip}
See our example configs [here](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/distributed-parallel).
:::

1.  HSDP on 2 nodes with 4 GPUs each (8 GPUs total):
    - You want FSDP within each node and DDP across nodes.
    - Set `dp_shard_size: 4` and `dp_replicate_size: 2`.

2.  FSDP + TP on a single 8-GPU node:
    - You want to split the model across 4 GPUs using FSDP, and further split each layer across 2 GPUs with TP.
    - Set `dp_shard_size: 4` and `tensor_parallel_size: 2`.

3.  FSDP + CP on a single 8-GPU node for long context:
    - You want to shard the model across all 8 GPUs and also split the sequence length across all 8 GPUs.
    - Set `dp_shard_size: 8` and `context_parallel_size: 8`. Note: this means the data parallel group and context parallel group are the same. A more common setup might be to shard across a smaller group.

## Support Matrix

This matrix describes how different parallelism methods can be combined in Axolotl.

| Combination | `dp_replicate_size` | `dp_shard_size` | `tp_size` | `cp_size` | Status & Notes |
| --- | :---: | :---: |:---:|:---:|---|
| **FSDP** (ZeRO-3) | 1 | >1 | 1 | 1 | ✅ Fully supported. Shards model across all GPUs. |
| **HSDP** | >1 | >1 | 1 | 1 | ✅ Fully supported. FSDP intra-node, DDP inter-node. |
| **FSDP + TP** | 1 | >1 | >1 | 1 | ✅ **2D Parallelism**. Shards the model across a `dp_shard` group, and TP-splits layers within the `tp` group. |
| **HSDP + TP** | >1 | >1 | >1 | 1 | ✅ **3D Parallelism**. A powerful but complex combination. |
| **FSDP + CP** | 1 | >1 | 1 | >1 | ✅ **2D Parallelism**. Combines FSDP with context parallelism. |
| **FSDP + TP + CP**| 1 | >1 | >1| >1| ✅ **3D Parallelism**. Another advanced combination. |
| DDP + TP/CP | >1 | 1 | >1 | >1 | ❌ **Not Supported**. The `ParallelismConfig` explicitly prevents this, as composing pure DDP with TP or CP is currently not supported. You should use FSDP + TP/CP instead (`dp_shard_size > 1`). |
| Just TP / CP | 1 | 1 | >1 | >1 | ✅ Supported. Useful for inference or when the model fits on one GPU but context is too long. |

- `tp_size` refers to `tensor_parallel_size`
- `cp_size` refers to `context_parallel_size`


================================================
FILE: docs/optimizations.qmd
================================================
---
title: Optimizations Guide
description: A guide to the performance and memory optimizations available in Axolotl.
---

Axolotl includes numerous optimizations to speed up training, reduce memory usage, and handle large models.

This guide provides a high-level overview and directs you to the detailed documentation for each feature.

## Speed Optimizations

These optimizations focus on increasing training throughput and reducing total training time.

### Sample Packing

Improves GPU utilization by combining multiple short sequences into a single packed sequence for training. This requires enabling one of the [attention](#attention-implementations) implementations below.

- **Config:** `sample_packing: true`
- **Learn more:** [Sample Packing](multipack.qmd)

### Attention Implementations

Using an optimized attention implementation is critical for training speed.

- **[Flash Attention 2](https://github.com/Dao-AILab/flash-attention)**: `flash_attention: true`. **(Recommended)** The industry standard for fast attention on modern GPUs. Requires Ampere or higher. For AMD, check [AMD Support](https://github.com/Dao-AILab/flash-attention?tab=readme-ov-file#amd-rocm-support).
- **[Flex Attention](https://pytorch.org/blog/flexattention/)**: `flex_attention: true`.
- **[SDP Attention](https://docs.pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html)**: `sdp_attention: true`. PyTorch's native implementation.
- **[Xformers](https://github.com/facebookresearch/xformers)**: `xformers_attention: true`. Works with FP16.

*Note: You should only enable one attention backend.*

### LoRA Optimizations

Leverages optimized kernels to accelerate LoRA training and reduce memory usage.

- **Learn more:** [LoRA Optimizations Documentation](lora_optims.qmd)

## Memory Optimizations

These techniques help you fit larger models or use bigger batch sizes on your existing hardware.

### Parameter Efficient Finetuning (LoRA & QLoRA)

Drastically reduces memory by training a small set of "adapter" parameters instead of the full model. This is the most common and effective memory-saving technique.

- Examples: Find configs with `lora` or `qlora` in the [examples directory](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/llama-3).
- Config Reference: See `adapter`, `load_in_4bit`, and `load_in_8bit` in the [Configuration Reference](config-reference.qmd).

### Gradient Checkpointing & Activation Offloading

These techniques save VRAM by changing how activations are handled.

- Gradient Checkpointing: re-computes activations during the backward pass, trading compute time for VRAM.
- Activation Offloading: moves activations to CPU RAM or disk, trading I/O overhead for VRAM.
- Learn more: [Gradient Checkpointing and Offloading Docs](gradient_checkpointing.qmd)

### Cut Cross Entropy (CCE)

Reduces VRAM usage by using an optimized cross-entropy loss calculation.

- **Learn more:** [Custom Integrations - CCE](custom_integrations.qmd#cut-cross-entropy)

### Liger Kernels

Provides efficient Triton kernels to improve training speed and reduce memory usage.

- **Learn more:** [Custom Integrations - Liger Kernels](custom_integrations.qmd#liger-kernels)

### Expert Kernels

Optimized kernel implementations for Mixture of Experts (MoE) model training.

- **ScatterMoE**: Triton-based MoE kernels with fused LoRA support.
- **SonicMoE**: CUTLASS-based MoE kernels for NVIDIA Hopper and Blackwell GPUs.

- **Learn more:** [Custom Integrations - Kernels Integration](custom_integrations.qmd#kernels-integration)

## Long Context Models

Techniques to train models on sequences longer than their original context window.

### RoPE Scaling

Extends a model's context window by interpolating its Rotary Position Embeddings.

- **Config:** Pass the `rope_scaling` config under the `overrides_of_model_config: `. To learn how to set RoPE, check the respective model config.

### Sequence Parallelism

Splits long sequences across multiple GPUs, enabling training with sequence lengths that would not fit on a single device.

- **Learn more:** [Sequence Parallelism Documentation](sequence_parallelism.qmd)

### Artic Long Sequence Training (ALST)

ALST is a recipe that combines several techniques to train long-context models efficiently. It typically involves:

- TiledMLP to reduce memory usage in MLP layers.
- Tiled Loss functions (like [CCE](#cut-cross-entropy-(cce) or [Liger](#liger-kernels)).
- Activation Offloading to CPU.

- Example: [ALST Example Configuration](https://github.com/axolotl-ai-cloud/axolotl/tree/main/examples/alst)

## Large Models (Distributed Training)

To train models that don't fit on a single GPU, you'll need to use a distributed training strategy like FSDP or DeepSpeed. These frameworks shard the model weights, gradients, and optimizer states across multiple GPUs and nodes.

- **Learn more:** [Multi-GPU Guide](multi-gpu.qmd)
- **Learn more:** [Multi-Node Guide](multi-node.qmd)

### N-D Parallelism (Beta)

For advanced scaling, Axolotl allows you to compose different parallelism techniques (e.g., Data, Tensor, Sequence Parallelism). This is a powerful approach to train an extremely large model by overcoming multiple bottlenecks at once.

- **Learn more:** [N-D Parallelism Guide](nd_parallelism.qmd)


## Quantization

Techniques to reduce the precision of model weights for memory savings.

### 4-bit Training (QLoRA)

The recommended approach for quantization-based training. It loads the base model in 4-bit using `bitsandbytes` and then trains QLoRA adapters. See [Adapter Finetuning](#adapter-finetuning-lora-qlora) for details.

### FP8 Training

Enables training with 8-bit floating point precision on supported hardware (e.g., NVIDIA Hopper series GPUs) for significant speed and memory gains.

- **Example:** [Llama 3 FP8 FSDP Example](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/llama-3/3b-fp8-fsdp2.yaml)

### Quantization Aware Training (QAT)

Simulates quantization effects during training, helping the model adapt and potentially improving the final accuracy of the quantized model.

- **Learn more:** [QAT Documentation](qat.qmd)

### GPTQ

Allows you to finetune LoRA adapters on top of a model that has already been quantized using the GPTQ method.

- **Example:** [GPTQ LoRA Example](https://github.com/axolotl-ai-cloud/axolotl/blob/main/examples/llama-2/gptq-lora.yml)

### MoE Expert Quantization

Quantizes MoE expert weights on load to reduce VRAM when training MoE models with adapters. Required for Transformers v5+ MoE models where experts use fused `nn.Parameter` tensors.

- **Config:** `quantize_moe_experts: true`
- **Learn more:** [MoE Expert Quantization](expert_quantization.qmd)


================================================
FILE: docs/optimizers.qmd
================================================
---
title: Optimizers
description: Configuring optimizers
---

## Overview

Axolotl supports all optimizers supported by [transformers OptimizerNames](https://github.com/huggingface/transformers/blob/51f94ea06d19a6308c61bbb4dc97c40aabd12bad/src/transformers/training_args.py#L142-L187)

Here is a list of optimizers supported by transformers as of `v4.54.0`:

- `adamw_torch`
- `adamw_torch_fused`
- `adamw_torch_xla`
- `adamw_torch_npu_fused`
- `adamw_apex_fused`
- `adafactor`
- `adamw_anyprecision`
- `adamw_torch_4bit`
- `adamw_torch_8bit`
- `ademamix`
- `sgd`
- `adagrad`
- `adamw_bnb_8bit`
- `adamw_8bit`  # alias for adamw_bnb_8bit
- `ademamix_8bit`
- `lion_8bit`
- `lion_32bit`
- `paged_adamw_32bit`
- `paged_adamw_8bit`
- `paged_ademamix_32bit`
- `paged_ademamix_8bit`
- `paged_lion_32bit`
- `paged_lion_8bit`
- `rmsprop`
- `rmsprop_bnb`
- `rmsprop_bnb_8bit`
- `rmsprop_bnb_32bit`
- `galore_adamw`
- `galore_adamw_8bit`
- `galore_adafactor`
- `galore_adamw_layerwise`
- `galore_adamw_8bit_layerwise`
- `galore_adafactor_layerwise`
- `lomo`
- `adalomo`
- `grokadamw`
- `schedule_free_radam`
- `schedule_free_adamw`
- `schedule_free_sgd`
- `apollo_adamw`
- `apollo_adamw_layerwise`
- `stable_adamw`


## Custom Optimizers

Enable custom optimizers by passing a string to the `optimizer` argument. Each optimizer will receive beta and epsilon args, however, some may accept additional args which are detailed below.

### optimi_adamw

```yaml
optimizer: optimi_adamw
```

### ao_adamw_4bit

Deprecated: Please use `adamw_torch_4bit`.

### ao_adamw_8bit

Deprecated: Please use `adamw_torch_8bit`.

### ao_adamw_fp8


```yaml
optimizer: ao_adamw_fp8
```

### adopt_adamw

GitHub: [https://github.com/iShohei220/adopt](https://github.com/iShohei220/adopt)
Paper: [https://arxiv.org/abs/2411.02853](https://arxiv.org/abs/2411.02853)

```yaml
optimizer: adopt_adamw
```

### came_pytorch

GitHub: [https://github.com/yangluo7/CAME/tree/master](https://github.com/yangluo7/CAME/tree/master)
Paper: [https://arxiv.org/abs/2307.02047](https://arxiv.org/abs/2307.02047)

```yaml
optimizer: came_pytorch

# optional args (defaults below)
adam_beta1: 0.9
adam_beta2: 0.999
adam_beta3: 0.9999
adam_epsilon: 1e-30
adam_epsilon2: 1e-16
```

### muon

Blog: [https://kellerjordan.github.io/posts/muon/](https://kellerjordan.github.io/posts/muon/)
Paper: [https://arxiv.org/abs/2502.16982v1](https://arxiv.org/abs/2502.16982v1)

```yaml
optimizer: muon
```

### dion

Microsoft's Dion (DIstributed OrthoNormalization) optimizer is a scalable and communication-efficient
orthonormalizing optimizer that uses low-rank approximations to reduce gradient communication.

GitHub: [https://github.com/microsoft/dion](https://github.com/microsoft/dion)
Paper: [https://arxiv.org/pdf/2504.05295](https://arxiv.org/pdf/2504.05295)
Note: Implementation written for PyTorch 2.7+ for DTensor

```yaml
optimizer: dion
dion_lr: 0.01
dion_momentum: 0.95
lr: 0.00001  # learning rate for embeddings and parameters that fallback to AdamW
```


================================================
FILE: docs/qat.qmd
================================================
---
title: "Quantization Aware Training (QAT)"
back-to-top-navigation: true
toc: true
toc-expand: 2
toc-depth: 4
---

## Overview

[Quantization Aware Training](https://pytorch.org/blog/introduction-to-quantization-on-pytorch/#quantization-aware-training) (QAT) is a technique for improving the accuracy of models which are quantized
by applying "fake" quantizations to the model's weights (and optionally, activations) during training. This fake
quantization allows for the model to adjust for noise introduced by the quantization, so when the model is eventually
quantized, the accuracy loss is minimized. We use the quantization techniques implemented in [torchao](https://github.com/pytorch/ao) to provide
support for QAT and post-training quantization (PTQ) in axolotl.

We recommend reviewing the excellent QAT tutorial in the [torchtune library](https://pytorch.org/torchtune/main/tutorials/qat_finetune.html#quantizing-the-qat-model),
and the QAT documentation in the [torchao library](https://github.com/pytorch/ao/tree/main/torchao/quantization/qat), for more details.

## Configuring QAT in Axolotl

To enable QAT in axolotl, add the following to your configuration file:

```yaml
qat:
  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4", "int8", "float8"
  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4", "fp8", and "nvfp4".
  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
  fake_quant_after_n_steps: # Optional[int] = None. The number of steps to apply fake quantization after
```

We support the following quantization schemas:

- `Int4WeightOnly` (requires the `fbgemm-gpu` extra when installing Axolotl)
- `Int8DynamicActivationInt4Weight`
- `Float8DynamicActivationFloat8Weight`
- `Float8DynamicActivationInt4Weight`
- `NVFP4`

Once you have finished training, you must quantize your model by using the same quantization configuration which you used to train the model with. You can use the [`quantize`](./quantize.qmd) command to do this.


================================================
FILE: docs/quantize.qmd
================================================
---
title: "Quantization with torchao"
back-to-top-navigation: true
toc: true
toc-expand: 2
toc-depth: 4
---

Quantization is a technique to lower the memory footprint of your model, potentially at the cost of accuracy or model performance. We support quantizing your model using the [torchao](https://github.com/pytorch/ao) library. Quantization is supported for both post-training quantization (PTQ) and quantization-aware training (QAT).


::: {.callout-note}

We do not currently support quantization techniques such as GGUF/GPTQ,EXL2 at the moment.

:::

## Configuring Quantization in Axolotl

Quantization is configured using the `quantization` key in your configuration file.

```yaml
base_model: # The path to the model to quantize.
quantization:
  activation_dtype: # Optional[str] = "int8". Fake quantization layout to use for activation quantization. Valid options are "int4", "int8", "float8"
  weight_dtype: # Optional[str] = "int8". Fake quantization layout to use for weight quantization. Valid options are "int4", "fp8", and "nvfp4".
  group_size: # Optional[int] = 32. The number of elements in each group for per-group fake quantization
  quantize_embedding: # Optional[bool] = False. Whether to quantize the embedding layer.

output_dir:  # The path to the output directory.
```

Once quantization is complete, your quantized model will be saved in the `{output_dir}/quantized` directory.

You may also use the `quantize` command to quantize a model which has been trained with [QAT](./qat.qmd) - you can do this by using the existing QAT configuration file which
you used to train the model:

```yaml
# qat.yml
qat:
  activation_dtype: int8
  weight_dtype: int4
  group_size: 256

output_dir: # The path to the output directory used during training where the final checkpoint has been saved.
```

```bash
axolotl quantize qat.yml
```

This ensures that an identical quantization configuration is used to quantize the model as was used to train it.


::: {.callout-note}

If you have configured pushing to hub with `hub_model_id`, your model hub name will have the quantization schema appended to it,
e.g. `axolotl-ai-cloud/qat-nvfp4-llama3B` will become `axolotl-ai-cloud/qat-nvfp4-llama3B-nvfp4w`

:::


================================================
FILE: docs/ray-integration.qmd
================================================
---
title: Ray Train
description: How to use Axolotl with Ray Train
---

Axolotl supports using Ray as an alternative to `accelerate` for orchestrating training. This is especially useful for multi-node training since you only have to setup code and dependencies in a single node and launch training as if you were using a single node.

With the `--use-ray` CLI flag, Axolotl will use Ray Train's [`TorchTrainer`](https://docs.ray.io/en/latest/train/api/doc/ray.train.torch.TorchTrainer.html#ray.train.torch.TorchTrainer) to run training.

## Ray cluster setup

A prerequisite using the Ray Train integration is to setup a Ray cluster on your desired node(s). For a detailed guide on how you can get started with ray clusters, check the official Ray docs [here](https://docs.ray.io/en/latest/cluster/getting-started.html).

Every Ray cluster has one _head_ node and a set of worker nodes. The head node is just like any other worker node, but it also runs certain special processes related to scheduling and orchestration. Ray-enabled scripts are run on the head node and depending on the resources (number of CPUs, GPUs, etc) they request, will be scheduled to run certain tasks on the worker nodes. For more on key concepts behind a Ray cluster, you can refer this [doc](https://docs.ray.io/en/latest/cluster/key-concepts.html#cluster-key-concepts).

## Sanity check

To run a sanity check on whether your ray cluster is setup properly, execute the following on the head node:

```bash
ray status
```

The output should have a summary of your Ray cluster - list of all the nodes in your cluster, the number of CPUs and GPUs in your cluster, etc. For example, if you have a cluster with 1 CPU-only head node and 2 4xL40S worker nodes, the output can look like this:


```
Node status
---------------------------------------------------------------
Active:
 1 head
Idle:
 2 4xL40S:48CPU-384GB
Pending:
 (no pending nodes)
Recent failures:
 (no failures)

Resources
---------------------------------------------------------------
Usage:
 0.0/96.0 CPU
 0.0/8.0 GPU
 0B/800.00GiB memory
 0B/229.57GiB object_store_memory

Demands:
 (no resource demands)
```

You should also be able to see the same on the [Ray dashboard](https://docs.ray.io/en/latest/ray-observability/getting-started.html).


## Configuring training with Ray Train

You can find an example configuration at `configs/llama-3/lora-1b-ray.yaml`.

The key parameters to note here are:

```yaml
use_ray: true
ray_num_workers: 4
# optional
resources_per_worker:
    GPU: 1
```

- `use_ray`: This is the flag that enables the Ray Train integration. You can either use the corresponding `--use-ray` flag in the CLI or set `use_ray` in the config file.
- `ray_num_workers`: This is the number of workers/GPUs to use for training.
- `resources_per_worker`: This is the Ray [resource request](https://docs.ray.io/en/latest/ray-core/scheduling/resources.html) for each worker. This can be used to request a specific GPU type or a custom resource for each worker. For example, if your ray cluster has GPUs of different types, and you only want to use NVIDIA L40S GPUs, you can do

```yaml
resources_per_worker:
    accelerator_type:L40S: 0.001
```

## Launching training

You can simply run the following command on the head node:

```bash
axolotl train examples/llama-3/lora-1b-ray.yml --use-ray
```

This will launch training on the head node and workers will be scheduled automatically by Ray Train to run on the appropriate head or worker nodes.

You can also monitor training progress on the Ray dashboard.

Coming back to the example on a Ray cluster with 1 head node and 2 4xL40S worker nodes, let's say you want to make use of all 8 GPUs. You would be able to just set `ray_num_workers: 8` and run the previous command. The Cluster tab will show the following:

![Ray dashboard](./images/ray-cluster-dashboard.png)


================================================
FILE: docs/reward_modelling.qmd
================================================
---
title: "Reward Modelling"
description: "Reward models are used to guide models towards behaviors which is preferred by humans, by training over large datasets annotated with human preferences. "
---

### Overview

Reward modelling is a technique used to train models to predict the reward or value of a given input. This is particularly useful in reinforcement learning scenarios where the model needs to evaluate the quality of its actions or predictions.
We support the reward modelling techniques supported by `trl`.

### (Outcome) Reward Models

Outcome reward models are trained using data which contains preference annotations for an entire interaction between the user and model (e.g. rather than per-turn or per-step).
For improved training stability, you can use the `center_rewards_coefficient` parameter to encourage mean-zero reward outputs ([see TRL docs](https://huggingface.co/docs/trl/v0.10.1/en/reward_trainer#centering-rewards)).

```yaml
base_model: google/gemma-2-2b
model_type: AutoModelForSequenceClassification
num_labels: 1
tokenizer_type: AutoTokenizer

reward_model: true
chat_template: gemma
datasets:
  - path: argilla/distilabel-intel-orca-dpo-pairs
    type: bradley_terry.chat_template

val_set_size: 0.1
eval_steps: 100
```

Bradley-Terry chat templates expect single-turn conversations in the following format:

```json
{
    "system": "...", // optional
    "input": "...",
    "chosen": "...",
    "rejected": "..."
}
```

### Process Reward Models (PRM)

::: {.callout-tip}
Check out our [PRM blog](https://axolotlai.substack.com/p/process-reward-models).
:::

Process reward models are trained using data which contains preference annotations for each step in a series of interactions. Typically, PRMs are trained to provide reward signals over each step of a reasoning trace and are used for downstream reinforcement learning.
```yaml
base_model: Qwen/Qwen2.5-3B
model_type: AutoModelForTokenClassification
num_labels: 2

process_reward_model: true
datasets:
  - path: trl-lib/math_shepherd
    type: stepwise_supervised
    split: train

val_set_size: 0.1
eval_steps: 100
```

Please see [stepwise_supervised](dataset-formats/stepwise_supervised.qmd) for more details on the dataset format.


================================================
FILE: docs/rlhf.qmd
================================================
---
title: "RLHF (Beta)"
description: "Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human feedback."
back-to-top-navigation: true
toc: true
toc-expand: 2
toc-depth: 4
---

## Overview

Reinforcement Learning from Human Feedback is a method whereby a language model is optimized from data using human
feedback. Various methods include, but not limited to:

- [Direct Preference Optimization (DPO)](#dpo)
- [Identity Preference Optimization (IPO)](#ipo)
- [Kahneman-Tversky Optimization (KTO)](#kto)
- [Odds Ratio Preference Optimization (ORPO)](#orpo)
- [Group Relative Policy Optimization (GRPO)](#grpo)
- [Group Reward-Decoupled Policy Optimization (GDPO)](#gdpo)


## RLHF using Axolotl

::: {.callout-important}
This is a BETA feature and many features are not fully implemented. You are encouraged to open new PRs to improve the integration and functionality.
:::

We rely on the [TRL](https://github.com/huggingface/trl) library for implementations of various RL training methods, which we wrap around to expose in axolotl. Each method has their own supported ways of loading datasets and prompt formats.

::: {.callout-tip}
You can find what each method supports by going into `src/axolotl/prompt_strategies/{method}` where `{method}` is one of our supported methods. The `type: ` can be retrieved from `{method}.{function_name}`.
:::

### DPO

Example config:

```yaml
rl: dpo
datasets:
  - path: Intel/orca_dpo_pairs
    split: train
    type: chatml.intel
  - path: argilla/ultrafeedback-binarized-preferences
    split: train
    type: chatml
```

DPO supports the following types with the following dataset format:

#### chatml.argilla

```json
{
    "system": "...", // optional
    "instruction": "...",
    "chosen_response": "...",
    "rejected_response": "..."
}
```

#### chatml.argilla_chat

```json
{
    "chosen": [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "..."}
    ],
    "rejected": [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "..."}
    ]
}
```

#### chatml.icr

```json
{
    "system": "...", // optional
    "input": "...",
    "chosen": "...",
    "rejected": "..."
}
```

#### chatml.intel

```json
{
    "system": "...", // optional
    "question": "...",
    "chosen": "...",
    "rejected": "..."
}
```

#### chatml.prompt_pairs

```json
{
    "system": "...", // optional
    "prompt": "...",
    "chosen": "...",
    "rejected": "..."
}
```

#### chatml.ultra

```json
{
    "system": "...", // optional
    "prompt": "...",
    "chosen": [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "..."}
    ],
    "rejected": [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "..."}
    ]
}
```

#### llama3.argilla

```json
{
    "system": "...", // optional
    "instruction": "...",
    "chosen_response": "...",
    "rejected_response": "..."
}
```

#### llama3.argilla_chat

```json
{
    "chosen": [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "..."}
    ],
    "rejected": [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "..."}
    ]
}
```

#### llama3.icr

```json
{
    "system": "...", // optional
    "input": "...",
    "chosen": "...",
    "rejected": "..."
}
```

#### llama3.intel

```json
{
    "system": "...", // optional
    "question": "...",
    "chosen": "...",
    "rejected": "..."
}
```

#### llama3.prompt_pairs

```json
{
    "system": "...", // optional
    "prompt": "...",
    "chosen": "...",
    "rejected": "..."
}
```

#### llama3.ultra

```json
{
    "system": "...", // optional
    "prompt": "...",
    "chosen": [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "..."}
    ],
    "rejected": [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "..."}
    ]
}
```

#### zephyr.nectar

```json
{
    "prompt": "...",
    "answers": [
        {
            "answer": "...",
            "rank": 1
        },
        {
            "answer": "...",
            "rank": 2
        }
        // ... more answers with ranks
    ]
}
```

#### chat_template.argilla_chat

```json
{
    "chosen": [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "..."}
    ],
    "rejected": [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "..."}
    ]
}
```

#### chat_template.default

```yaml
rl: dpo
datasets:
  - path: ...
    split: train
    type: chat_template.default
    field_messages: "messages"
    field_chosen: "chosen"
    field_rejected: "rejected"
    message_property_mappings:
      role: role
      content: content
    roles:
      user: ["user"]
      assistant: ["assistant"]
      system: ["system"]
```

Sample input format:

```json
{
    "messages": [
        {
            "role": "system",
            "content": "..."
        },
        {
            "role": "user",
            "content": "..."
        },
        // ... more messages
    ],
    "chosen": {
        "role": "assistant",
        "content": "..."
    },
    "rejected": {
        "role": "assistant",
        "content": "..."
    }
}
```

#### user_defined.default

For custom behaviors,

```yaml
rl: dpo
datasets:
  - path: ...
    split: train
    type:
      field_prompt: "prompt"
      field_system: "system"
      field_chosen: "chosen"
      field_rejected: "rejected"
      prompt_format: "{prompt}"
      chosen_format: "{chosen}"
      rejected_format: "{rejected}"
```

The input format is a simple JSON input with customizable fields based on the above config.

```json
{
    "system": "...",  // optional
    "prompt": "...",
    "chosen": "...",
    "rejected": "..."
}
```

### IPO

As IPO is just DPO with a different loss function, all supported dataset formats for [DPO](#dpo) are also supported for IPO.

```yaml
rl: ipo
```

### ORPO

Paper: https://arxiv.org/abs/2403.07691

```yaml
rl: orpo
orpo_alpha: 0.1
remove_unused_columns: false

chat_template: chatml
datasets:
  - path: argilla/ultrafeedback-binarized-preferences-cleaned
    type: chat_template.argilla
```

ORPO supports the following types with the following dataset format:

#### chat_template.argilla

```json
{
    "system": "...",  // optional
    "prompt": "...",  // if available, will be taken as user message for single-turn instead of from list below

    // chosen/rejected should be same till last content and only even-number of alternating user/assistant turns
    "chosen": [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "..."}
    ],
    "rejected": [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "..."}
    ]
}
```

### KTO

```yaml
rl: kto
rl_beta: 0.1  # default
kto_desirable_weight: 1.0  # default
kto_undesirable_weight: 1.0  # default

remove_unused_columns: false

datasets:
  - path: argilla/ultrafeedback-binarized-preferences-cleaned-kto
    type: llama3.ultra
    split: train

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true
```

KTO supports the following types with the following dataset format:

#### chatml.argilla

```json
{
    "system": "...", // optional
    "instruction": "...",
    "completion": "..."
}
```

#### chatml.argilla_chat

```json
{
    "chosen": [
        {"role": "user", "content": "..."}
    ],
    "completion": [
        {"role": "assistant", "content": "..."}
    ]
}
```

#### chatml.intel

```json
{
    "system": "...", // optional
    "question": "...",
    "completion": "..."
}
```

#### chatml.prompt_pairs

```json
{
    "system": "...", // optional
    "prompt": "...",
    "completion": "..."
}
```

#### chatml.ultra

```json
{
    "system": "...", // optional
    "prompt": "...",
    "completion": "..."
}
```

#### llama3.argilla

```json
{
    "system": "...", // optional
    "instruction": "...",
    "completion": "..."
}
```

#### llama3.argilla_chat

```json
{
    "completion": [
        {"role": "user", "content": "..."},
        {"role": "assistant", "content": "..."}
    ]
}
```

#### llama3.intel

```json
{
    "system": "...", // optional
    "question": "...",
    "completion": "..."
}
```

#### llama3.prompt_pairs

```json
{
    "system": "...", // optional
    "prompt": "...",
    "completion": "..."
}
```

#### llama3.ultra

```json
{
    "system": "...", // optional
    "prompt": "...",
    "completion": "..."
}
```

#### user_defined.default

For custom behaviors,

```yaml
rl: kto
datasets:
  - path: ...
    split: train
    type:
      field_prompt: "prompt"
      field_system: "system"
      field_completion: "completion"
      field_label: "label"
      prompt_format: "{prompt}"
      completion_format: "{completion}"
```

The input format is a simple JSON input with customizable fields based on the above config.

```json
{
    "system": "...",  // optional
    "prompt": "...",
    "completion": "...",
    "label": "..."
}
```

### GRPO

::: {.callout-tip}
Check out our [GRPO cookbook](https://github.com/axolotl-ai-cloud/grpo_code).
:::

In the latest GRPO implementation, `vLLM` is used to significantly speedup trajectory generation during training. In this example, we're using 4 GPUs - 2 for training, and 2 for vLLM:

::: {.callout-important}
Make sure you've installed the correct version of vLLM by including it as an extra when installing axolotl, e.g. `pip install axolotl[vllm]`.
:::

```yaml
base_model: Qwen/Qwen2.5-1.5B-Instruct

vllm:
    host: 0.0.0.0
    port: 8000
    tensor_parallel_size: 2
    gpu_memory_utilization: 0.85
    dtype: auto
    # max_model_len: # you may find it useful to set the vLLM model context length if you know this beforehand

rl: grpo
trl:
    use_vllm: true
    vllm_server_host: 0.0.0.0
    vllm_server_port: 8000
    vllm_server_timeout: 300
```

```bash
CUDA_VISIBLE_DEVICES=2,3 axolotl vllm-serve grpo.yaml
```

Your `vLLM` instance will now attempt to spin up, and it's time to kick off training utilizing our remaining two GPUs. In another terminal, execute:

```bash
CUDA_VISIBLE_DEVICES=0,1 axolotl train grpo.yaml --num-processes 2
```

::: {.callout-note}
Due to TRL's implementation with vLLM, the vLLM instance must use the last N GPUs instead of the first N GPUs. This is why in the example above, we use `CUDA_VISIBLE_DEVICES=2,3` for the vLLM instance.
:::

#### Reward functions

GRPO uses custom reward functions and transformations. Please have them ready locally.

For example, to load OpenAI's GSM8K and use a random reward for completions:

```python
# rewards.py
import random

def rand_reward_func(completions, **kwargs) -> list[float]:
    return [random.uniform(0, 1) for _ in completions]

def oai_gsm8k_transform(cfg, *args, **kwargs):
    def transform_fn(example, tokenizer=None):
        label = example["answer"].split("####")[-1].strip().replace(",", "")
        return {
            "prompt": [{"role": "user", "content": example["question"]},],
            "answer": label,
        }
    return transform_fn, {"remove_columns": ["question"]}
```

```yaml
rl: grpo

trl:
    beta: 0.001
    max_completion_length: 256
    use_vllm: True
    num_generations: 4
    reward_funcs: ["rewards.rand_reward_func"]    # format: '{file_name}.{fn_name}'
    reward_weights: [1.0]
datasets:
  - path: openai/gsm8k
    name: main
    type: rewards.oai_gsm8k_transform  # format: '{file_name}.{fn_name}'
```

To see other examples of custom reward functions, please see [TRL GRPO Docs](https://github.com/huggingface/trl/blob/main/docs/source/grpo_trainer.md#using-a-custom-reward-function).

To see all configs, please see [TRLConfig](https://github.com/axolotl-ai-cloud/axolotl/blob/v0.9.2/src/axolotl/utils/schemas/trl.py).

#### OpenEnv Rollout Functions

GRPO supports custom rollout functions for OpenEnv-style environments, enabling interactive tasks like web browsing, code execution, or tool use. This allows you to implement custom generation logic that interacts with external environments.

For example, to implement a simple math-solving environment with step-by-step verification:

```python
# math_env.py
import re

def math_solver_rollout(model, processing_class, prompts, generation_config=None):
    """
    Custom rollout function that generates step-by-step math solutions.

    Args:
        model: The language model
        processing_class: The tokenizer/processing_class
        prompts: List of prompt dicts (with 'messages' key for chat format)
        generation_config: Optional generation configuration

    Returns:
        List of completion strings
    """
    completions = []

    for prompt in prompts:
        # Apply chat template to prompt
        messages = prompt.get("messages", [])
        formatted_prompt = processing_class.apply_chat_template(
            messages, processing_class=False, add_generation_prompt=True
        )

        # Generate step-by-step solution
        full_response = ""
        for step in range(5):  # Max 5 reasoning steps
            current_input = formatted_prompt + full_response + "\nNext step:"
            inputs = processing_class(current_input, return_tensors="pt").to(model.device)

            outputs = model.generate(
                **inputs,
                max_new_tokens=100,
                generation_config=generation_config,
            )
            step_text = processing_class.decode(
                outputs[0][inputs.input_ids.shape[1]:],
                skip_special_tokens=True
            )

            # Check if solution is complete
            if "FINAL ANSWER:" in step_text:
                full_response += step_text
                break
            full_response += step_text + "\n"

        completions.append(full_response)

    return completions

def math_reward(prompts, completions, answers, **kwargs):
    """Reward function that checks mathematical correctness"""
    rewards = []
    for completion, correct_answer in zip(completions, answers):
        # Extract predicted answer
        match = re.search(r"FINAL ANSWER:\s*(.+)", completion)
        predicted = match.group(1).strip() if match else ""

        # Compare with correct answer
        reward = 1.0 if predicted == str(correct_answer) else 0.0
        rewards.append(reward)

    return rewards

def math_transform(cfg, *args, **kwargs):
    """Transform dataset to GRPO format with answer field"""
    def transform_fn(example, processing_class=None):
        return {
            "prompt": [{"role": "user", "content": example["question"]}],
            "answer": str(example["answer"]),
        }
    return transform_fn, {"remove_columns": ["question"]}
```

```yaml
rl: grpo

trl:
  beta: 0.001
  max_completion_length: 512
  num_generations: 4
  rollout_func: "math_env.math_solver_rollout"  # Custom rollout function
  reward_funcs: ["math_env.math_reward"]
  reward_weights: [1.0]

datasets:
  - path: openai/gsm8k
    name: main
    type: math_env.math_transform
```

The `rollout_func` parameter accepts a fully qualified name (e.g., `module_name.function_name`) that points to a callable function in your local directory. The function receives:

- `model`: The language model
- `processing_class`: The tokenizer/processing class
- `prompts`: List of prompt dictionaries
- `generation_config` (optional): Generation configuration

And should return a list of completion strings.

For more OpenEnv examples, see [TRL OpenEnv Documentation](https://huggingface.co/docs/trl/main/en/openenv).

#### GRPO with DAPO/Dr. GRPO loss

The DAPO paper and subsequently Dr. GRPO paper proposed an alternative loss function for GRPO to remediate the penalty in longer responses.

```yaml
trl:
  loss_type: dr_grpo
  # Normalizes loss based on max completion length (default: 256)
  max_completion_length:
```

For more information, see [GRPO docs](https://huggingface.co/docs/trl/v0.17.0/en/grpo_trainer#loss-types).

#### Async GRPO

Async GRPO overlaps vLLM generation with training by producing rollouts in a background thread. While the model trains on the current batch, the next batch is already being generated. This can significantly reduce wall-clock time per step.

```yaml
trl:
  use_data_producer: true     # Enable data producer protocol
  use_vllm: true
  async_prefetch: true         # Generate rollouts in background thread
  prefetch_depth: 1            # Number of rollouts to prefetch
  vllm_sync_interval: 2        # Sync weights to vLLM every N steps
```

::: {.callout-note}
Because the background thread generates completions with slightly stale model weights, async GRPO uses importance sampling correction to account for the distribution shift. This is controlled by `vllm_importance_sampling_correction: true` (default when async is enabled).
:::

##### vLLM LoRA Sync

By default, weight sync to vLLM merges the LoRA adapter into the base model and broadcasts all parameters via NCCL. LoRA sync is a faster alternative that saves only the adapter weights to the filesystem and has vLLM load them natively using Punica kernels.

```yaml
adapter: lora
lora_r: 32
lora_alpha: 64
lora_target_linear: true

trl:
  vllm_lora_sync: true         # Enable native LoRA sync
```

When `vllm_lora_sync: true` is set, axolotl automatically selects the LoRA-aware vLLM serve module. Start vLLM as usual:

```bash
CUDA_VISIBLE_DEVICES=0 axolotl vllm-serve config.yaml
```

Then start training on a separate GPU:

```bash
CUDA_VISIBLE_DEVICES=1 axolotl train config.yaml
```

::: {.callout-tip}
LoRA sync is especially beneficial with multi-GPU training (FSDP/DeepSpeed), where NCCL merge-sync can cause GPU contention with vLLM generation.
:::

##### Streaming Partial Batch

Instead of scoring the entire batch at once, streaming mode scores one prompt group at a time. This enables finer-grained zero-advantage skipping and reduces peak memory usage during scoring.

```yaml
trl:
  streaming_partial_batch: true
```

##### Importance Sampling Correction

When using async prefetch, completions are generated from a slightly older version of the model. Importance sampling (IS) correction adjusts the policy gradient to account for this distribution shift.

```yaml
trl:
  vllm_importance_sampling_correction: true   # Enable IS correction
  importance_sampling_level: token             # 'token' or 'sequence'
  off_policy_mask_threshold: 0.5              # Mask sequences with IS ratio below this
```

- `importance_sampling_level: token` applies per-token IS ratios (recommended with Liger kernel)
- `importance_sampling_level: sequence` applies per-sequence IS ratios
- `off_policy_mask_threshold` masks out sequences where the IS ratio indicates they are too far off-policy

##### Replay Buffer

The replay buffer caches rollout groups that had learning signal (non-zero reward variance) and uses them to replace zero-signal groups in later batches.

```yaml
trl:
  replay_buffer_size: 100       # Max cached groups (0 = disabled)
  replay_recompute_logps: true  # Recompute log-probs for replayed data (recommended)
```

::: {.callout-note}
When `replay_recompute_logps: true` (default), old log-probabilities are recomputed using the current model weights. This fixes the IS mismatch that would otherwise occur when replaying stale data.
:::

##### Deferred Re-rolling

Failed prompts (where the model produces zero reward for all generations) are buffered and re-injected into later batches when the model may be better equipped to solve them.

```yaml
trl:
  reroll_start_fraction: 0.5    # Start re-rolling after 50% of training
  reroll_max_groups: 1          # Max groups to replace per batch
```

##### Zero-Advantage Batch Skipping

When all advantages in a micro-batch are zero (no learning signal), the forward/backward pass is skipped entirely. This is enabled by default and logged as `skipped_zero_adv_batches=1`.

```yaml
trl:
  skip_zero_advantage_batches: true   # default
```

##### Parallel Reward Workers

Reward functions that use `signal.alarm()` (e.g., `math_verify`) must run in the main thread. Parallel reward workers use subprocesses to work around this limitation while enabling concurrent reward computation.

```yaml
trl:
  reward_num_workers: 4         # Number of subprocess workers (1 = no parallelism)
```

##### Full Async GRPO Example

```yaml
base_model: Qwen/Qwen2.5-1.5B-Instruct

vllm:
    host: 0.0.0.0
    port: 8000
    gpu_memory_utilization: 0.35
    dtype: auto

adapter: lora
lora_r: 32
lora_alpha: 64
lora_target_linear: true

rl: grpo
trl:
  use_data_producer: true
  use_vllm: true
  async_prefetch: true
  prefetch_depth: 1
  vllm_sync_interval: 2
  vllm_lora_sync: true
  streaming_partial_batch: true
  vllm_importance_sampling_correction: true
  off_policy_mask_threshold: 0.5
  importance_sampling_level: token
  num_generations: 8
  max_completion_length: 512
  reward_funcs:
    - rewards.accuracy_reward
  reroll_start_fraction: 0.5
  replay_buffer_size: 100
  reward_num_workers: 4
  skip_zero_advantage_batches: true

datasets:
  - path: AI-MO/NuminaMath-TIR
    type: rewards.prompt_transform
    split: train

gradient_accumulation_steps: 4
micro_batch_size: 2
max_steps: 500
learning_rate: 1e-5
bf16: true
gradient_checkpointing: true
```

```bash
# Terminal 1: Start vLLM on GPU 0
CUDA_VISIBLE_DEVICES=0 axolotl vllm-serve config.yaml

# Terminal 2: Train on GPU 1
CUDA_VISIBLE_DEVICES=1 axolotl train config.yaml
```

##### Multi-GPU Async GRPO

Async GRPO supports FSDP and DeepSpeed ZeRO-3 for multi-GPU training. vLLM runs on one GPU while training is distributed across the remaining GPUs.

**FSDP:**

```yaml
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer
gradient_checkpointing_kwargs:
  use_reentrant: false
```

**DeepSpeed ZeRO-3:**

```yaml
deepspeed: deepspeed_configs/zero3_bf16.json
gradient_checkpointing_kwargs:
  use_reentrant: true   # Required for ZeRO-3
```

```bash
# Terminal 1: Start vLLM on GPU 0
CUDA_VISIBLE_DEVICES=0 axolotl vllm-serve config.yaml

# Terminal 2: Train on GPUs 0,1
CUDA_VISIBLE_DEVICES=0,1 accelerate launch --num_processes 2 -m axolotl.cli.train config.yaml
```

::: {.callout-important}
With multi-GPU async prefetch, only rank 0 generates completions in the background thread. Results are broadcast to all ranks on the main thread. This avoids FSDP/DeepSpeed collective deadlocks from unsynchronized background threads.
:::

### GDPO

GDPO (Group Reward-Decoupled Policy Optimization) extends GRPO for multi-reward training. It addresses the **reward advantage collapse** problem by normalizing each reward function independently before combining them.

::: {.callout-tip}
Use GDPO when training with multiple reward functions. For single reward, GRPO and GDPO produce equivalent results.
:::

Paper: [https://arxiv.org/pdf/2501.05242](https://arxiv.org/pdf/2501.05242)

GDPO uses TRL's native `multi_objective_aggregation` parameter under the hood. When you set `rl: gdpo`, axolotl automatically configures TRL to use `normalize_then_sum` aggregation.

```yaml
base_model: Qwen/Qwen2.5-1.5B-Instruct

vllm:
    host: 0.0.0.0
    port: 8000
    tensor_parallel_size: 2
    gpu_memory_utilization: 0.85

rl: gdpo

trl:
    beta: 0.001
    max_completion_length: 256
    use_vllm: true
    num_generations: 4
    reward_funcs:
        - rewards.format_reward
        - rewards.correctness_reward
    reward_weights: [1.0, 2.0]

datasets:
    - path: openai/gsm8k
      name: main
      type: rewards.oai_gsm8k_transform
```

You can also use GRPO with explicit aggregation control:

```yaml
rl: grpo
trl:
    multi_objective_aggregation: normalize_then_sum  # GDPO behavior
    # or: sum_then_normalize  # Default GRPO behavior
```

#### GDPO vs GRPO

| Aspect | GRPO | GDPO |
|--------|------|------|
| **Aggregation** | `sum_then_normalize` | `normalize_then_sum` |
| **Multi-reward** | May collapse advantages | Preserves reward signals |
| **Single reward** | Standard behavior | Equivalent to GRPO |

#### Why GDPO?

When using multiple rewards with GRPO, different reward combinations can produce identical advantages:

```
# Example: format + correctness rewards
[format=0, correct=3] → sum=3
[format=1, correct=2] → sum=3  ← GRPO sees these as equal!
[format=2, correct=1] → sum=3
[format=3, correct=0] → sum=3
```

GDPO normalizes each reward independently, preserving their relative differences.

#### Reward Functions

GDPO uses the same reward function format as GRPO:

```python
# rewards.py
def format_reward(completions, **kwargs) -> list[float]:
    return [1.0 if len(c) > 10 else 0.0 for c in completions]

def correctness_reward(completions, answers, **kwargs) -> list[float]:
    rewards = []
    for completion, answer in zip(completions, answers):
        # Your scoring logic here
        rewards.append(score)
    return rewards
```

#### Sequence Parallelism

GDPO supports sequence parallelism for long-context training:

```yaml
rl: gdpo
context_parallel_size: 2
```

### SimPO

SimPO uses [CPOTrainer](https://huggingface.co/docs/trl/main/en/cpo_trainer) but with alternative loss function.

```yaml
rl: simpo
rl_beta: 0.1  # default in CPOTrainer
cpo_alpha: 1.0  # default in CPOTrainer
simpo_gamma: 0.5  # default in CPOTrainer
```

This method uses the same dataset format as [DPO](#dpo).

### Using local dataset files

```yaml
datasets:
  - ds_type: json
    data_files:
      - orca_rlhf.jsonl
    split: train
    type: chatml.intel
```

### TRL auto-unwrapping for PEFT

TRL supports auto-unwrapping PEFT models for RL training paradigms which rely on a reference model. This significantly reduces memory pressure as an additional refreference model does not need to be loaded, and reference model log-probabilities can be obtained by disabling PEFT adapters. This is enabled by default. To turn it off, pass the following config:

```yaml
# load ref model when adapter training.
rl_adapter_ref_model: true
```


================================================
FILE: docs/scripts/examples-allowlist.yml
================================================
examples:
  # December 2025
  - name: kimi-linear
    title: Kimi Linear
  - name: plano
    title: Plano Orchestrator
  - name: mimo
    title: MiMo
  - name: internvl3_5
    title: InternVL 3.5

  # AllenAI
  - name: olmo3
    title: OLMo 3

  # ArceeAI
  - name: trinity
    title: Trinity
  - name: arcee
    title: Arcee AFM

  # MistralAI
  - name: ministral3/think
    title: Ministral 3 Thinking
  - name: ministral3/vision
    title: Ministral 3 Vision
  - name: magistral/think
    title: Magistral Thinking
  - name: magistral/vision
    title: Magistral Vision
  - name: ministral
    title: Ministral
  - name: mistral-small
    title: Mistral Small 3.1/3.2
  - name: voxtral
    title: Voxtral
  - name: devstral
    title: Devstral
  - name: mistral
    title: Mistral 7B

  # Meta
  - name: llama-4
    title: Llama 4
  - name: llama-2
    title: Llama 2

  # Alibaba
  - name: qwen3-next
    title: Qwen 3 Next
  - name: qwen3
    title: Qwen 3

  # Google
  - name: gemma3n
    title: Gemma 3n

  # Swiss AI
  - name: apertus
    title: Apertus

  # GPT-OSS
  - name: gpt-oss
    title: GPT-OSS
  - name: seed-oss
    title: Seed-OSS

  # Microsoft
  - name: phi
    title: Phi

  # SmolVLM
  - name: smolvlm2
    title: SmolVLM 2

  # IBM
  - name: granite4
    title: Granite 4

  # LiquidAI
  - name: LiquidAI
    title: Liquid Foundation Models 2

  # Other
  - name: hunyuan
    title: Hunyuan
  - name: jamba
    title: Jamba
  - name: orpheus
    title: Orpheus


================================================
FILE: docs/scripts/generate_config_docs.py
================================================
# type: ignore

"""
Quarto documentation generation from Pydantic models. Uses Pydantic model source code
to automatically group fields, including inherited fields from parent classes.
"""

import ast
import inspect
import textwrap
import types
import typing
from typing import Any, FrozenSet, Type, Union

from pydantic import BaseModel

from axolotl.utils.schemas.config import AxolotlInputConfig


class QuartoGenerator:
    """Generate Quarto documentation from Pydantic models."""

    def __init__(self):
        self._class_fields_cache = {}
        self._inheritance_map_cache = {}
        self._nested_models_cache = {}

    def _get_direct_fields(self, cls: Type[BaseModel]) -> FrozenSet[str]:
        """Get fields defined directly in a single class (not inherited)."""
        if cls in self._class_fields_cache:
            return self._class_fields_cache[cls]

        fields = set()

        # Get annotated fields
        if hasattr(cls, "__annotations__"):
            fields.update(cls.__annotations__.keys())

        # Filter out private/special methods
        fields = {f for f in fields if not f.startswith("_")}

        result = frozenset(fields)
        self._class_fields_cache[cls] = result
        return result

    def _is_pydantic_model(self, type_obj) -> bool:
        """Check if a type is a Pydantic BaseModel."""
        return inspect.isclass(type_obj) and issubclass(type_obj, BaseModel)

    def _extract_nested_type(self, field_type) -> Any:
        """Extract the actual type from complex type annotations."""
        # Handle Annotated types (Python 3.9+)
        if hasattr(typing, "get_origin") and hasattr(typing, "get_args"):
            origin = typing.get_origin(field_type)
            args = typing.get_args(field_type)

            if origin is not None:
                # Handle Annotated[SomeType, ...] - extract the first argument
                if hasattr(typing, "Annotated") and origin is typing.Annotated:
                    if args:
                        return self._extract_nested_type(
                            args[0]
                        )  # Recursively process the actual type

                # Handle list[SomeType], List[SomeType], etc.
                elif origin in (list, typing.List):
                    if args:
                        return self._extract_nested_type(
                            args[0]
                        )  # Extract element type

                # Handle Union types (including | syntax)
                elif origin is typing.Union:
                    # Get non-None types from the Union
                    non_none_types = [arg for arg in args if arg is not type(None)]
                    if len(non_none_types) >= 1:
                        # Prioritize Pydantic models over primitive types
                        pydantic_models = [
                            arg
                            for arg in non_none_types
                            if self._is_pydantic_model(arg)
                        ]
                        if pydantic_models:
                            # Return the first Pydantic model found
                            return self._extract_nested_type(pydantic_models[0])

                        # No Pydantic models, return the first non-None type
                        return self._extract_nested_type(non_none_types[0])

        # Handle new Python 3.10+ union syntax (PeftConfig | None)
        if hasattr(field_type, "__class__") and field_type.__class__ is types.UnionType:
            # Get non-None types from the Union
            non_none_types = [
                arg for arg in field_type.__args__ if arg is not type(None)
            ]
            if len(non_none_types) >= 1:
                # Prioritize Pydantic models over primitive types
                pydantic_models = [
                    arg for arg in non_none_types if self._is_pydantic_model(arg)
                ]
                if pydantic_models:
                    return self._extract_nested_type(pydantic_models[0])
                return self._extract_nested_type(non_none_types[0])

        # Handle old typing.Union syntax (fallback)
        if hasattr(field_type, "__origin__"):
            if field_type.__origin__ is Union:
                # Get non-None types from the Union
                non_none_types = [
                    arg for arg in field_type.__args__ if arg is not type(None)
                ]
                if len(non_none_types) >= 1:
                    # Prioritize Pydantic models over primitive types
                    pydantic_models = [
                        arg for arg in non_none_types if self._is_pydantic_model(arg)
                    ]
                    if pydantic_models:
                        return self._extract_nested_type(pydantic_models[0])
                    return self._extract_nested_type(non_none_types[0])
            # Handle other generic types like dict[str, Any], etc.
            elif hasattr(field_type, "__args__"):
                return field_type

        return field_type

    def _extract_all_pydantic_models_from_type(
        self, field_type
    ) -> list[type[BaseModel]]:
        """Extract all Pydantic models from a type annotation, including from Unions."""
        models = []

        if field_type is None:
            return models

        # Handle Annotated types
        if hasattr(typing, "get_origin") and hasattr(typing, "get_args"):
            origin = typing.get_origin(field_type)
            args = typing.get_args(field_type)

            if origin is not None:
                # Handle Annotated[SomeType, ...] - extract from the first argument
                if hasattr(typing, "Annotated") and origin is typing.Annotated:
                    if args:
                        models.extend(
                            self._extract_all_pydantic_models_from_type(args[0])
                        )
                    return models

                # Handle list[SomeType], List[SomeType], etc.
                if origin in (list, typing.List):
                    if args:
                        models.extend(
                            self._extract_all_pydantic_models_from_type(args[0])
                        )
                    return models

                # Handle Union types
                if origin is typing.Union:
                    for arg in args:
                        if arg is not type(None):  # Skip None type
                            models.extend(
                                self._extract_all_pydantic_models_from_type(arg)
                            )
                    return models

        # Handle new Python 3.10+ union syntax
        if hasattr(field_type, "__class__") and field_type.__class__ is types.UnionType:
            for arg in field_type.__args__:
                if arg is not type(None):  # Skip None type
                    models.extend(self._extract_all_pydantic_models_from_type(arg))
            return models

        # Handle old typing.Union syntax (fallback)
        if hasattr(field_type, "__origin__") and field_type.__origin__ is Union:
            for arg in field_type.__args__:
                if arg is not type(None):  # Skip None type
                    models.extend(self._extract_all_pydantic_models_from_type(arg))
            return models

        # Check if this type itself is a Pydantic model
        if self._is_pydantic_model(field_type):
            models.append(field_type)

        return models

    def _get_nested_models(
        self, model_class: type[BaseModel], visited=None
    ) -> dict[str, type[BaseModel]]:
        """Get all nested Pydantic models from a model class."""
        if visited is None:
            visited = set()

        # Avoid infinite recursion
        if model_class in visited:
            return {}

        if model_class in self._nested_models_cache:
            return self._nested_models_cache[model_class]

        visited.add(model_class)
        nested_models = {}

        # Check all fields in the model
        for field_info in model_class.model_fields.values():
            field_type = self._extract_nested_type(field_info.annotation)

            if self._is_pydantic_model(field_type):
                nested_models[field_type.__name__] = field_type
                # Recursively get nested models from this nested model
                deeper_nested = self._get_nested_models(field_type, visited.copy())
                nested_models.update(deeper_nested)

        self._nested_models_cache[model_class] = nested_models
        return nested_models

    def _build_inheritance_map(self, child_class: Type[BaseModel]):
        """Build inheritance map for a class and all its parents."""
        if child_class in self._inheritance_map_cache:
            return self._inheritance_map_cache[child_class]

        inheritance_map = {}

        # Get MRO and filter out BaseModel and object
        mro_classes = [
            cls
            for cls in child_class.__mro__
            if cls not in (BaseModel, object) and hasattr(cls, "__annotations__")
        ]

        # Process each class in the MRO
        for cls in mro_classes:
            inheritance_map[cls] = self._get_direct_fields(cls)

        self._inheritance_map_cache[child_class] = inheritance_map
        return inheritance_map

    def _wrap_comment(self, text: str, width: int = 88) -> list[str]:
        """Wrap a comment to specified width, accounting for '# ' prefix."""
        if not text.strip():
            return ["#"]

        # Account for "# " prefix (2 characters)
        content_width = width - 2
        wrapped_lines = textwrap.wrap(text, width=content_width)
        return [f"# {line}" for line in wrapped_lines]

    def _extract_type_from_source(
        self, model_class: type[BaseModel], field_name: str
    ) -> str:
        """Extract the actual type annotation text from source code, checking inheritance chain."""
        # Use inheritance map to check classes efficiently
        inheritance_map = self._build_inheritance_map(model_class)

        # Check classes in MRO order
        for cls in model_class.__mro__:
            if cls in inheritance_map and field_name in inheritance_map[cls]:
                type_annotation = self._get_type_from_class_source(cls, field_name)
                if type_annotation != "unknown":
                    return type_annotation

        return "unknown"

    def _get_type_from_class_source(self, class_obj: type, field_name: str) -> str:
        """Extract type annotation from a specific class's source code."""
        try:
            source = inspect.getsource(class_obj)
            tree = ast.parse(source)
        except (OSError, TypeError):
            return "unknown"

        # Find the class definition
        for node in tree.body:
            if isinstance(node, ast.ClassDef) and node.name == class_obj.__name__:
                # Find the field assignment
                for body_node in node.body:
                    if isinstance(body_node, ast.AnnAssign) and isinstance(
                        body_node.target, ast.Name
                    ):
                        if body_node.target.id == field_name and body_node.annotation:
                            return ast.unparse(body_node.annotation)
                break

        return "unknown"

    def _extract_field_groups_from_all_classes(
        self, model_class: type[BaseModel]
    ) -> list[dict]:
        """Extract field groups from all classes in the inheritance hierarchy."""
        all_groups = []
        inheritance_map = self._build_inheritance_map(model_class)

        # Get all Pydantic base classes in MRO order (most specific first)
        # This puts AxolotlInputConfig fields first, then parent class fields
        pydantic_classes = [
            cls
            for cls in model_class.__mro__
            if cls in inheritance_map and inheritance_map[cls]
        ]

        # Extract groups from each class
        for cls in pydantic_classes:
            class_groups = self._extract_field_groups_from_source(cls)
            for group in class_groups:
                all_groups.append(group)

        # If no groups found, create a default grouping by class
        if not all_groups:
            for cls in pydantic_classes:
                fields_in_class = inheritance_map[cls]
                if fields_in_class:
                    all_groups.append(
                        {
                            "fields": list(fields_in_class),
                        }
                    )

        return all_groups

    def _extract_field_groups_from_source(
        self, model_class: type[BaseModel]
    ) -> list[dict]:
        """Extract field groups from source code based on blank lines and comments."""
        try:
            source = inspect.getsource(model_class)
            tree = ast.parse(source)
        except (OSError, TypeError):
            # Fallback if we can't get source code
            fields_in_class = self._get_direct_fields(model_class)
            if fields_in_class:
                return [
                    {
                        "fields": list(fields_in_class),
                    }
                ]
            return []

        groups = []
        current_group_fields = []
        current_group_comment = None

        # Find the class definition
        class_node = None
        for node in ast.walk(tree):
            if isinstance(node, ast.ClassDef) and node.name == model_class.__name__:
                class_node = node
                break

        if not class_node:
            fields_in_class = self._get_direct_fields(model_class)
            if fields_in_class:
                return [
                    {
                        "fields": list(fields_in_class),
                    }
                ]
            return []

        # Parse the source lines to detect groupings
        source_lines = source.split("\n")

        # Get fields that are actually defined in this specific class
        fields_in_class = self._get_direct_fields(model_class)

        # Find assignments that correspond to model fields for THIS class only
        field_assignments = []
        for node in class_node.body:
            if isinstance(node, ast.AnnAssign) and isinstance(node.target, ast.Name):
                field_name = node.target.id
                if field_name in fields_in_class:
                    field_assignments.append(
                        {
                            "name": field_name,
                            "lineno": node.lineno,
                            "end_lineno": getattr(node, "end_lineno", node.lineno),
                        }
                    )

        if not field_assignments:
            if fields_in_class:
                return [
                    {
                        "fields": list(fields_in_class),
                    }
                ]
            return []

        # Sort by line number
        field_assignments.sort(key=lambda x: x["lineno"])

        # Group fields based on blank lines and comments
        for i, field_info in enumerate(field_assignments):
            field_name = field_info["name"]
            current_line = field_info["lineno"]

            # Check if this starts a new group (blank line before or significant gap)
            is_new_group = False

            if i == 0:
                is_new_group = True
            else:
                prev_end_line = field_assignments[i - 1]["end_lineno"]

                # Check for blank lines or comments between fields
                lines_between = source_lines[prev_end_line : current_line - 1]
                has_blank_line = any(line.strip() == "" for line in lines_between)
                has_comment = any(
                    line.strip().startswith("#") for line in lines_between
                )

                # Start new group if there's a blank line or comment, or significant gap
                if has_blank_line or has_comment or (current_line - prev_end_line > 3):
                    is_new_group = True

            if is_new_group and current_group_fields:
                # Save the previous group
                groups.append(
                    {
                        "fields": current_group_fields.copy(),
                        "description": current_group_comment,
                    }
                )
                current_group_fields = []
                current_group_comment = None

            current_group_fields.append(field_name)

        # Add the final group
        if current_group_fields:
            groups.append(
                {
                    "fields": current_group_fields,
                    "description": current_group_comment,
                }
            )

        return groups

    def _generate_field_documentation(
        self,
        model_class: type[BaseModel],
        field_name: str,
        field_info: dict,
        field_type_str: str,
        is_required: bool,
        indent_level: int = 0,
        visited_models: set = None,
    ) -> list[str]:
        """Generate documentation for a single field, expanding nested models inline."""
        if visited_models is None:
            visited_models = set()

        lines = []
        indent = "  " * indent_level

        # Get the actual field type for nested model detection
        if field_name in model_class.model_fields:
            pydantic_field_info = model_class.model_fields[field_name]
            actual_field_type = pydantic_field_info.annotation
        else:
            actual_field_type = None

        # Add description comment if available
        description = field_info.get("description", "")
        if description:
            wrapped_lines = self._wrap_comment(description, width=88 - len(indent))
            for line in wrapped_lines:
                lines.append(f"{indent}{line}")

        # Extract nested Pydantic models from the type annotation
        nested_models = self._extract_all_pydantic_models_from_type(actual_field_type)

        # Filter out already visited models to prevent infinite recursion
        expandable_models = [
            model for model in nested_models if model not in visited_models
        ]

        if expandable_models:
            # This field contains Pydantic models that can be expanded

            # Show the field with its full type annotation
            field_line = f"{indent}{field_name}: {field_type_str}"
            if field_info.get("default") is not None:
                field_line += f" = {field_info['default']}"
            if is_required:
                field_line += " (required)"
            lines.append(field_line)

            # Add to visited to prevent infinite recursion
            new_visited = visited_models.copy()
            new_visited.update(expandable_models)

            # Expand each nested Pydantic model
            for i, nested_model in enumerate(expandable_models):
                if i > 0:
                    lines.append("\n")
                lines.append(f"{indent}  # For {nested_model.__name__}:")

                # Get nested model schema
                try:
                    nested_schema = nested_model.model_json_schema()
                    nested_properties = nested_schema.get("properties", {})
                    nested_required = nested_schema.get("required", [])
                except Exception:
                    # Fallback: use model fields directly
                    nested_properties = {}
                    nested_required = []
                    for (
                        nested_field_name,
                        nested_field_info,
                    ) in nested_model.model_fields.items():
                        nested_description = ""
                        if (
                            hasattr(nested_field_info, "json_schema_extra")
                            and nested_field_info.json_schema_extra
                        ):
                            nested_description = (
                                nested_field_info.json_schema_extra.get(
                                    "description", ""
                                )
                            )
                        elif (
                            hasattr(nested_field_info, "description")
                            and nested_field_info.description
                        ):
                            nested_description = nested_field_info.description

                        nested_default_val = None
                        if (
                            hasattr(nested_field_info, "default")
                            and nested_field_info.default is not None
                        ):
                            if str(nested_field_info.default) != "PydanticUndefined":
                                nested_default_val = nested_field_info.default

                        nested_properties[nested_field_name] = {
                            "type": "unknown",
                            "description": nested_description,
                            "default": nested_default_val,
                        }

                        if nested_field_info.is_required():
                            nested_required.append(nested_field_name)

                # Get field groups for the nested model
                nested_field_groups = self._extract_field_groups_from_all_classes(
                    nested_model
                )

                # Generate nested fields with increased indentation
                for i, group in enumerate(nested_field_groups):
                    if not group["fields"]:
                        continue

                    # Add blank line between groups (except before first group)
                    if i > 0:
                        lines.append("")

                    # Process nested fields
                    for nested_field_name in group["fields"]:
                        if nested_field_name not in nested_properties:
                            continue

                        nested_field_info = nested_properties[nested_field_name]
                        nested_field_type = self._extract_type_from_source(
                            nested_model, nested_field_name
                        )
                        nested_is_required = nested_field_name in nested_required

                        # Recursively generate documentation for nested field
                        nested_lines = self._generate_field_documentation(
                            nested_model,
                            nested_field_name,
                            nested_field_info,
                            nested_field_type,
                            nested_is_required,
                            indent_level + 1,
                            new_visited,
                        )
                        lines.extend(nested_lines)
        else:
            # Regular field (no expandable nested models)
            field_line = f"{indent}{field_name}: {field_type_str}"
            if field_info.get("default") is not None:
                field_line += f" = {field_info['default']}"
            if is_required:
                field_line += " (required)"
            lines.append(field_line)

        return lines

    def generate_qmd(
        self,
        model_class: type[BaseModel],
        title: str | None = None,
        expand_nested: bool = True,
    ) -> str:
        """Auto-generate config reference documentation including inherited fields."""

        if title is None:
            title = f"{model_class.__name__} Reference"

        # Try to get JSON schema, with fallback for serialization issues
        try:
            schema = model_class.model_json_schema()
            properties = schema.get("properties", {})
            required = schema.get("required", [])
        except Exception as e:
            print(
                f"Warning: Could not generate JSON schema ({e}). Using model fields instead."
            )
            # Fallback: use model fields directly
            properties = {}
            required = []
            for field_name, field_info in model_class.model_fields.items():
                # Extract description from json_schema_extra or field info
                description = ""
                if (
                    hasattr(field_info, "json_schema_extra")
                    and field_info.json_schema_extra
                ):
                    description = field_info.json_schema_extra.get("description", "")
                elif hasattr(field_info, "description") and field_info.description:
                    description = field_info.description

                # Get default value
                default_val = None
                if hasattr(field_info, "default") and field_info.default is not None:
                    # Handle special Pydantic default markers
                    if str(field_info.default) != "PydanticUndefined":
                        default_val = field_info.default

                properties[field_name] = {
                    "type": "unknown",
                    "description": description,
                    "default": default_val,
                }

                if field_info.is_required():
                    required.append(field_name)

        # Extract field groups from all classes in inheritance hierarchy
        field_groups = self._extract_field_groups_from_all_classes(model_class)

        # Start building QMD content
        qmd_lines = [
            "---",
            f"title: {title}",
            "description: A complete list of all configuration options.",
            "---",
            "",
        ]

        # Generate one big code block with all fields (inline nested expansion)
        qmd_lines.append("```yaml")

        for i, group in enumerate(field_groups):
            if not group["fields"]:
                continue

            # Add blank line between groups (except before first group)
            if i > 0:
                qmd_lines.append("")

            # Process fields in the order they appear in source
            for field_name in group["fields"]:
                if field_name not in properties:
                    continue

                field_info = properties[field_name]
                field_type = self._extract_type_from_source(model_class, field_name)
                is_required = field_name in required

                if expand_nested:
                    # Check if this field has nested models
                    if field_name in model_class.model_fields:
                        pydantic_field_info = model_class.model_fields[field_name]
                        nested_models = self._extract_all_pydantic_models_from_type(
                            pydantic_field_info.annotation
                        )
                        has_nested = bool(nested_models)
                    else:
                        has_nested = False

                    # Add blank line before nested config
                    if has_nested:
                        qmd_lines.append("")

                    # Use the new inline generation method
                    field_lines = self._generate_field_documentation(
                        model_class,
                        field_name,
                        field_info,
                        field_type,
                        is_required,
                        indent_level=0,
                        visited_models=set(),
                    )
                    qmd_lines.extend(field_lines)

                    # Add blank line after nested config
                    if has_nested:
                        qmd_lines.append("")
                else:
                    # Original simple approach
                    description = field_info.get("description", "")
                    default = field_info.get("default")

                    # Add wrapped comment for description
                    if description:
                        wrapped_lines = self._wrap_comment(description)
                        qmd_lines.extend(wrapped_lines)

                    line = f"{field_name}: {field_type}"
                    if default is not None:
                        line += f" = {default}"
                    if is_required:
                        line += " (required)"
                    qmd_lines.append(line)

        qmd_lines.append("```")

        # Join all lines and clean up any double newlines
        content = "\n".join(qmd_lines)

        # Replace multiple consecutive newlines with just two newlines (one blank line)
        import re

        content = re.sub(r"\n{3,}", "\n\n", content)

        # Ensure single newline at the very end
        content = content.rstrip("\n") + "\n"

        return content


def main():
    generator = QuartoGenerator()

    print("Generating config reference content...")
    qmd_content = generator.generate_qmd(AxolotlInputConfig, "Config Reference", True)

    print("Writing to file...")
    with open("docs/config-reference.qmd", "w", encoding="utf-8") as f:
        f.write(qmd_content)
    print("Done!")


if __name__ == "__main__":
    main()


================================================
FILE: docs/scripts/generate_examples_docs.py
================================================
"""
auto generate example docs from allowlist
"""

import re
import shutil
import sys
from pathlib import Path

import yaml

# Paths
THIS = Path(__file__).resolve()
ROOT = THIS.parents[2]  # repo root (docs/scripts -> docs -> ROOT)
EXAMPLES_DIR = ROOT / "examples"
OUTPUT_DIR = ROOT / "docs" / "models"
ALLOWLIST_YML = THIS.parent / "examples-allowlist.yml"


def slugify(name: str) -> str:
    """Convert a name to a slug (lowercase, hyphens for spaces)."""
    s = re.sub(r"[^a-zA-Z0-9\s\-]+", "", name.strip())
    s = re.sub(r"\s+", "-", s).strip("-").lower()
    return s or "example"


def read_allowlist():
    with open(ALLOWLIST_YML, "r", encoding="utf-8") as f:
        data = yaml.safe_load(f) or {}
    items = data.get("examples", [])
    if not isinstance(items, list):
        raise ValueError("`examples` must be a list in examples-allowlist.yml")
    return items


def find_readme(folder: Path) -> Path | None:
    for name in ("README.md", "Readme.md", "readme.md"):
        p = folder / name
        if p.exists():
            return p
    return None


def remove_first_h1(md: str) -> tuple[str, str | None]:
    """
    Remove the first H1 from markdown and return (modified_md, h1_title).
    The H1 is removed since we use the frontmatter title instead.
    """
    lines = md.splitlines()
    result = []
    h1_title = None
    skipped_first = False

    for line in lines:
        if not skipped_first and line.startswith("# "):
            h1_title = line[2:].strip()
            skipped_first = True
            continue
        result.append(line)

    return "\n".join(result), h1_title


IMG_RE = re.compile(r"!\[[^\]]*\]\(([^)]+)\)")
LINK_RE = re.compile(r"\[([^\]]+)\]\(([^)]+)\)")


def rewrite_and_copy_assets(md: str, src_dir: Path, dest_assets_root: Path) -> str:
    """
    Copy local image assets referenced in markdown to
    docs/examples/assets/... and rewrite the links.
    """
    dest_assets = dest_assets_root / "assets"

    def repl(m):
        url = m.group(1).strip()
        if re.match(r"^(https?:)?//", url):
            return m.group(0)  # leave remote URLs
        src_path = (src_dir / url).resolve()
        if not src_path.exists():
            return m.group(0)  # leave as-is if not found
        rel = src_path.relative_to(src_dir)
        # Create a unique asset path based on source directory name
        asset_name = src_dir.name.replace("/", "-")
        dest_path = dest_assets / asset_name / rel
        dest_path.parent.mkdir(parents=True, exist_ok=True)
        shutil.copy2(src_path, dest_path)
        new_rel = f"assets/{asset_name}/{rel.as_posix()}"
        return m.group(0).replace(url, new_rel)

    return IMG_RE.sub(repl, md)


def rewrite_readme_links(
    md: str,
    src_dir: Path,
    examples_dir: Path,
    parent_index_only: set,
    current_src_path: str,
    allowlist_entries: set,
    current_output_path: str,
) -> str:
    """
    Rewrite links between README.md files to point to the correct .qmd files.
    """

    def repl(m):
        text = m.group(1)
        url = m.group(2).strip()

        # Skip remote URLs and anchor links
        if re.match(r"^(https?:)?//", url) or url.startswith("#"):
            return m.group(0)

        # Skip non-markdown files
        if not url.lower().endswith(".md"):
            return m.group(0)

        # Resolve the target path
        try:
            target_path = (src_dir / url).resolve()

            # Check if target is outside examples_dir
            try:
                rel_path = target_path.relative_to(examples_dir)
            except ValueError:
                # Target is outside examples_dir, leave as-is
                return m.group(0)

            parts = list(rel_path.parts)

            # Determine the output path for the target
            if len(parts) > 0 and parts[-1].lower() in ("readme.md", "readme"):
                # This is a README link
                if len(parts) == 1:
                    # Link to root README -> index.qmd
                    target_output = "index.qmd"
                elif len(parts) == 2:
                    if parts[0] == ".":
                        # Current directory README
                        target_output = "index.qmd"
                    else:
                        # subdir/README.md
                        parent_dir = parts[0]
                        if parent_dir in parent_index_only:
                            target_output = f"{parent_dir}/index.qmd"
                        else:
                            target_output = f"{parent_dir}.qmd"
                else:
                    # Deeper nesting: parent/subdir/README.md
                    # Build the full path like "parent/subdir"
                    full_path = "/".join(parts[:-1])  # Remove README.md
                    # Check if this exact path is in allowlist
                    if full_path in allowlist_entries:
                        # This is a sub-entry with its own entry -> use .qmd
                        target_output = f"{full_path}.qmd"
                    elif parts[0] == ".":
                        # ./subdir/README.md -> check if subdir has own entry
                        subdir = parts[1]
                        if subdir in parent_index_only:
                            target_output = f"{subdir}/index.qmd"
                        else:
                            target_output = f"{subdir}.qmd"
                    else:
                        # parent/subdir where parent doesn't have own entry
                        target_output = f"{full_path}/index.qmd"
            else:
                # Regular .md file -> convert to .qmd, keep path structure
                target_output = "/".join(parts)[:-2] + "qmd"

            # Compute relative path from current output file to target
            current_parts = current_output_path.split("/")
            target_parts = target_output.split("/")

            # Special case: if current is a subdir file and target is a single-component file at root
            # Example: current="magistral/vision", target="magistral.qmd"
            if len(current_parts) > 1 and len(target_parts) == 1:
                # Current is in subdir, target is at root level
                # Go up to root: ../ for each level
                up_count = len(current_parts) - 1
                rel_parts = [".."] * up_count + [target_parts[0]]
                new_url = "/".join(rel_parts)
            else:
                # Find common prefix
                i = 0
                while (
                    i < min(len(current_parts) - 1, len(target_parts))
                    and current_parts[i] == target_parts[i]
                ):
                    i += 1

                # Build relative path: go up (../) then down to target
                up_count = len(current_parts) - 1 - i
                rel_parts = [".."] * up_count + target_parts[i:]

                if not rel_parts or rel_parts == [".."]:
                    # Points to same directory or parent
                    new_url = "/".join(rel_parts) if rel_parts else "."
                else:
                    new_url = "/".join(rel_parts)

            return f"[{text}]({new_url})"
        except (ValueError, IndexError):
            return m.group(0)

    return LINK_RE.sub(repl, md)


def write_qmd(out_path: Path, title: str, body_md: str):
    out_path.parent.mkdir(parents=True, exist_ok=True)
    fm = f"---\ntitle: {title!r}\nexecute:\n  eval: false\nformat:\n  html:\n    toc: true\n---\n\n"
    out_path.write_text(fm + body_md, encoding="utf-8")


def update_quarto_yml(generated: list[tuple[str, str, str]]):
    """
    Update _quarto.yml with the generated example files in the correct order.
    This keeps the sidebar in sync with the allowlist.

    Model Guides is now nested under "Getting Started" section.
    Creates nested sections for models with sub-entries (e.g., magistral, ministral3).
    Parent pages are now flat files (e.g., ministral3.qmd) with sub-pages in subdirs.
    """
    quarto_yml = ROOT / "_quarto.yml"
    if not quarto_yml.exists():
        print(f"[WARN] {quarto_yml} not found, skipping update", file=sys.stderr)
        return

    content = quarto_yml.read_text(encoding="utf-8")

    # First pass: find all parents that have sub-entries
    parents_with_subs = set()
    for path, _name, _title in generated:
        if "/" in path:
            parent = path.split("/")[0]
            parents_with_subs.add(parent)

    # Build the YAML contents while preserving allowlist order
    lines = []
    processed_sections = set()

    for path, _name, title in generated:
        # Check if this is a parent page that has sub-pages
        if path in parents_with_subs:
            # This is a parent page with sub-pages - create a nested section
            if path not in processed_sections:
                processed_sections.add(path)
                section_title = (
                    title or path.replace("-", " ").replace("_", " ").title()
                )
                lines.append(f'                - section: "{section_title}"')
                lines.append("                  contents:")
                # Add the parent page first
                lines.append(f"                    - docs/models/{path}.qmd")
                # Then add all sub-pages
                for sub_path, _sub_name, _sub_title in generated:
                    if "/" in sub_path and sub_path.split("/")[0] == path:
                        lines.append(
                            f"                    - docs/models/{sub_path}.qmd"
                        )
        elif "/" not in path:
            # This is a flat item with no sub-pages
            # Skip if it was already included as part of a parent section
            if path not in processed_sections:
                lines.append(f"                - docs/models/{path}.qmd")

    yaml_content = "\n".join(lines) + "\n"

    # Pattern to match only the Model Guides contents, stopping at the next item
    # in Getting Started (lines starting with 12 spaces: same level as the section)
    pattern = r'(            - section: "Model Guides"\n              contents:)([^\n]*|.*?)(?=\n            - |\n        - section:|\n\nformat:)'

    def replacement(match):
        prefix = match.group(1)
        return prefix + "\n" + yaml_content

    new_content = re.sub(pattern, replacement, content, flags=re.DOTALL)

    if new_content != content:
        quarto_yml.write_text(new_content, encoding="utf-8")
        print(f"Updated {quarto_yml}")
    else:
        print(f"No changes needed for {quarto_yml}")


def main():
    allow = read_allowlist()
    if not EXAMPLES_DIR.exists():
        print(f"[WARN] {EXAMPLES_DIR} not found", file=sys.stderr)
        return

    (OUTPUT_DIR / "assets").mkdir(parents=True, exist_ok=True)

    # First pass: identify which parents have their own entry vs only sub-entries
    parent_entries = set()  # Parents that have their own entry
    parent_with_subs = set()  # Parents that have sub-entries
    allowlist_entries = set()  # All entries in allowlist

    for item in allow:
        if isinstance(item, str):
            name = item
        else:
            name = item.get("name")

        allowlist_entries.add(name)

        if "/" in name:
            parent = name.split("/")[0]
            parent_with_subs.add(parent)
        else:
            parent_entries.add(name)

    # Parents with subs that DON'T have their own entry -> use index.qmd
    parent_index_only = parent_with_subs - parent_entries

    generated = []
    seen_dirs = set()  # Track which parent directories we've created index for

    for item in allow:
        if isinstance(item, str):
            name = item
            title = None
        else:
            name = item.get("name")
            title = item.get("title")

        if not name:
            print(f"[WARN] Skipping item without name: {item}", file=sys.stderr)
            continue

        src_dir = EXAMPLES_DIR / name
        if not src_dir.exists() or not src_dir.is_dir():
            print(f"[WARN] Skipping {name} (not a directory)", file=sys.stderr)
            continue

        readme = find_readme(src_dir)
        if not readme:
            print(f"[WARN] Skipping {name} (no README.md)", file=sys.stderr)
            continue

        md = readme.read_text(encoding="utf-8")

        # Determine output path first (needed for link rewriting)
        parts = name.split("/")
        if len(parts) == 1:
            # Simple case: no subdirectory
            out_path = OUTPUT_DIR / f"{parts[0]}.qmd"
            sidebar_path = parts[0]
        else:
            # Has subdirectory: e.g., magistral/think
            parent = parts[0]
            child = "-".join(parts[1:])  # handle nested subdirs
            out_path = OUTPUT_DIR / parent / f"{child}.qmd"
            sidebar_path = f"{parent}/{child}"

        # Remove the first H1 (we use frontmatter title instead)
        md, _ = remove_first_h1(md)
        # Rewrite links between README files
        md = rewrite_readme_links(
            md,
            src_dir,
            EXAMPLES_DIR,
            parent_index_only,
            name,
            allowlist_entries,
            sidebar_path,
        )
        md = rewrite_and_copy_assets(md, src_dir, OUTPUT_DIR)

        # Handle parent page generation for sub-entries
        if len(parts) > 1:
            # Has subdirectory: e.g., magistral/think
            parent = parts[0]

            # Create parent.qmd if not already done and parent doesn't have own entry
            if parent not in seen_dirs and parent in parent_index_only:
                parent_readme = find_readme(EXAMPLES_DIR / parent)
                if parent_readme:
                    parent_md = parent_readme.read_text(encoding="utf-8")
                    parent_md, _ = remove_first_h1(parent_md)
                    parent_md = rewrite_readme_links(
                        parent_md,
                        EXAMPLES_DIR / parent,
                        EXAMPLES_DIR,
                        parent_index_only,
                        parent,
                        allowlist_entries,
                        parent,
                    )
                    parent_md = rewrite_and_copy_assets(
                        parent_md, EXAMPLES_DIR / parent, OUTPUT_DIR
                    )
                    parent_title = parent.replace("-", " ").replace("_", " ").title()
                    write_qmd(OUTPUT_DIR / f"{parent}.qmd", parent_title, parent_md)
                    generated.append((parent, parent, parent_title))
                    seen_dirs.add(parent)

        if not title:
            title = name.replace("/", " ").replace("-", " ").title()

        write_qmd(out_path, title, md)
        generated.append((sidebar_path, name, title))

    # Index page - preserve allowlist order
    if generated:
        listing = "\n".join(
            [f"- [{title}]({path}.qmd)" for path, name, title in generated]
        )
        index_md = (
            "# Model Guides\n\nBelow are the curated examples for training various model architectures:\n\n"
            + listing
            + "\n"
        )
        index_fm = (
            "---\nexecute:\n  eval: false\nformat:\n  html:\n    toc: true\n---\n\n"
        )
        (OUTPUT_DIR / "index.qmd").write_text(index_fm + index_md, encoding="utf-8")

        # Auto-update _quarto.yml to keep sidebar in sync
        update_quarto_yml(generated)


if __name__ == "__main__":
    main()


================================================
FILE: docs/sequence_parallelism.qmd
================================================
---
title: Sequence Parallelism
description: Train with long sequences split across multiple GPUs.
---

Sequence parallelism is a technique that splits sequences across multiple GPUs,
allowing you to train with very long sequences that wouldn't fit on a single GPU. Each
GPU processes a different portion of the sequence, and the results are aggregated
through a ring communication pattern.

## When to Use Sequence Parallelism

Use sequence parallelism when:

- You need to train with sequence lengths that don't fit into a single GPU's memory
- You have multiple GPUs available
- You're experiencing OOM (Out Of Memory) errors with long sequences

## Configuration

To enable sequence parallelism, add the following to your configuration file:

```yaml
# Set to a divisor (> 1) of the number of GPUs available
context_parallel_size: 4  # Split sequences across 4 GPUs
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
heads_k_stride: 1
# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
ring_attn_func:
```

The `context_parallel_size` should be a divisor of the total number of GPUs. For example:

- With 8 GPUs, valid values would be 2, 4, or 8
- With 4 GPUs, valid values would be 2 or 4

## Implementation Details

When sequence parallelism is enabled:

1. Each sequence is divided into equal chunks across the GPUs in a sequence parallel group
2. The data collator handles the chunking of input_ids, attention_mask, labels, and position_ids
3. Position IDs are adjusted to maintain proper relative positions
4. The trainer uses special ring communication patterns for attention operations

## Requirements

To use sequence parallelism, you need:

- Multiple GPUs (at least 2)
- The `ring-flash-attn` package. Install with:
  - `pip install axolotl[ring-flash-attn]` (preferred)
  - `pip install ring-flash-attn>=0.1.4`

## Limitations

- Flash attention must be enabled for this to work (`flash_attention: true` in config YAML)
- May have a small performance overhead due to communication between GPUs

## Example

```yaml
base_model: meta-llama/Llama-3-8B-Instruct
sequence_len: 8192

...

context_parallel_size: 4  # Split each sequence into 4 parts, one per GPU
# Optional; strides across the key dimension. Larger values use more memory but should make training faster.
heads_k_stride: 1
# Optional; one of "varlen_llama3" or "batch_ring". Defaults to
# "varlen_llama3" when `sample_packing: true`, and "batch_ring" otherwise.
ring_attn_func:

...
```

This will train the Llama 3 8B model with 8K context length, with each sequence split
into 2 subsequences of length 4096 across 2 GPUs.

## Sample Packing with Sequence Parallelism

Sequence parallelism is compatible with Axolotl's sample packing functionality. When using both features together:

1. Samples are first packed together
2. The packed sequences are then divided across GPUs in the sequence parallel group
3. Position IDs are automatically adjusted to maintain proper relative positions

## Effect on Batch Size

When using sequence parallelism, your effective global batch size is **divided** by the `context_parallel_size`. This happens because:

- Each group of `context_parallel_size` GPUs works on the same batch (just different parts of each sequence)
- The number of batches processed per step decreases

For example:
- With 8 GPUs and no sequence parallelism: 8 different batches processed per step
- With 8 GPUs and `context_parallel_size=4`: Only 2 different batches processed per step (each split across 4 GPUs)
- If your per-GPU `micro_batch_size` is 2, the global batch size decreases from 16 to 4


================================================
FILE: docs/streaming.qmd
================================================
---
title: Streaming Datasets
description: How to use streaming mode for large-scale datasets and memory-efficient training
order: 10
---

Streaming enables memory-efficient training with large datasets by loading data
incrementally rather than loading the entire dataset into memory at once.

Use streaming when:

- Your dataset is too large to fit in memory (e.g. when you're doing pretraining with massive text corpora)
- You want to start training immediately without preprocessing the entire dataset

Streaming works with both remote and locally stored datasets!

::: {.callout-note}
Streaming currently only supports a single dataset. Multi-dataset support will be added soon.
:::


## Configuration

### Basic Streaming

Enable streaming mode by setting the `streaming` flag:

```yaml
streaming: true
```

### Pretraining with Streaming

For pretraining tasks, streaming is automatically enabled when using `pretraining_dataset`:

```yaml
pretraining_dataset:
  - path: HuggingFaceFW/fineweb-edu
    type: pretrain
    text_column: text
    split: train

# Optionally, enable sample packing
streaming_multipack_buffer_size: 10000
sample_packing: true
```

### SFT with Streaming

For supervised fine-tuning with streaming:

```yaml
streaming: true
datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
    split: train

# Optionally, enable sample packing
streaming_multipack_buffer_size: 10000
sample_packing: true
```

## Configuration Options

### `streaming_multipack_buffer_size`

Controls the buffer size for multipack streaming (default: 10,000). This determines how
many samples are buffered before packing. Larger buffers can improve packing efficiency
but use more memory.

### `shuffle_merged_datasets`

When enabled, shuffles the streaming dataset using the buffer. This requires additional
memory for the shuffle buffer.

## Sample Packing with Streaming

Sample packing is supported for streaming datasets. When enabled, multiple samples are
packed into a single sequence to maximize GPU utilization:

```yaml
sample_packing: true
streaming_multipack_buffer_size: 10000

# For SFT: attention is automatically isolated between packed samples
# For pretraining: control with pretrain_multipack_attn
pretrain_multipack_attn: true  # prevent cross-attention between packed samples
```

For more information, see our [documentation](multipack.qmd) on multipacking.

## Important Considerations

### Memory Usage

While streaming reduces memory usage compared to loading entire datasets, you still need
to consider:

- You can control the memory usage by adjusting `streaming_multipack_buffer_size`
- Sample packing requires buffering multiple samples
- Shuffling requires additional memory for the shuffle buffer

### Performance

- Streaming may have slightly higher latency compared to preprocessed datasets, as samples are processed on-the-fly
- Network speed and disk read speed are important when streaming from remote sources or a local dataset, respectively
- Consider using `axolotl preprocess` for smaller or more frequently used datasets

### Evaluation Datasets

Evaluation datasets are not streamed to ensure consistent evaluation metrics. They're
loaded normally even when training uses streaming.

## Examples

See the `examples/streaming/` directory for complete configuration examples:

- `pretrain.yaml`: Pretraining with streaming dataset
- `sft.yaml`: Supervised fine-tuning with streaming


================================================
FILE: docs/telemetry.qmd
================================================
---
title: Telemetry
description: A description of the telemetry implementation in Axolotl.
---

# Telemetry in Axolotl

Axolotl implements anonymous telemetry to help maintainers understand how the library
is used and where users encounter issues. This data helps prioritize features, optimize
performance, and fix bugs.

## Data Collection

We collect:

- System info: OS, Python version, Axolotl version, PyTorch version, Transformers
version, etc.
- Hardware info: CPU count, memory, GPU count and models
- Runtime metrics: Training progress, memory usage, timing information
- Usage patterns: Models (from a whitelist) and configurations used
- Error tracking: Stack traces and error messages (sanitized to remove personal
information)

Personally identifiable information (PII) is not collected.

## Implementation

Telemetry is implemented using PostHog and consists of:

- `axolotl.telemetry.TelemetryManager`: A singleton class that initializes the
telemetry system and provides methods for tracking events.
- `axolotl.telemetry.errors.send_errors`: A decorator that captures exceptions and
sends sanitized stack traces.
- `axolotl.telemetry.runtime_metrics.RuntimeMetricsTracker`: A class that tracks
runtime metrics during training.
- `axolotl.telemetry.callbacks.TelemetryCallback`: A Trainer callback that sends
runtime metrics telemetry.

The telemetry system will block training startup for 10 seconds to ensure users are
aware of data collection, unless telemetry is explicitly enabled or disabled.

## Opt-Out Mechanism

Telemetry is **enabled by default** on an opt-out basis. To disable it, set
`AXOLOTL_DO_NOT_TRACK=1` or `DO_NOT_TRACK=1`.

A warning message will be logged on start to clearly inform users about telemetry.
We will remove this after some period.

To hide the warning message about telemetry that is displayed on train, etc. startup,
explicitly set: `AXOLOTL_DO_NOT_TRACK=0` (enable telemetry) or `AXOLOTL_DO_NOT_TRACK=1`
(explicitly disable telemetry).

## Privacy

- All path-like config information is automatically redacted from telemetry data
- Model information is only collected for whitelisted organizations
    - See `axolotl/telemetry/whitelist.yaml` for the set of whitelisted organizations
- Each run generates a unique anonymous ID
    - This allows us to link different telemetry events in a single same training run
- Telemetry is only sent from the main process to avoid duplicate events


================================================
FILE: docs/torchao.qmd
================================================
---
title: "PyTorch ao"
description: "Custom data types and layouts for training and inference"
---

To use experimental optimizers (`AdamWFp8`, `AdamW4bit`, `AdamW8bit`) from Pytorch Ao, please install the package as shown below.

::: {.callout-tip}
Some experimental optimizers are already present in regular Pytorch, so please re-check if you actually need this package!
:::

### Installation

Stable Release from the PyTorch index

```bash
pip install torchao --extra-index-url https://download.pytorch.org/whl/cu121 # full options are cpu/cu118/cu121/cu124
```


Nightly release

```bash
pip install --pre torchao-nightly --index-url https://download.pytorch.org/whl/nightly/cu121 # full options are cpu/cu118/cu121/cu124
```


================================================
FILE: docs/unsloth.qmd
================================================
---
title: "Unsloth"
description: "Hyper-optimized QLoRA finetuning for single GPUs"
---

### Overview

Unsloth provides hand-written optimized kernels for LLM finetuning that slightly improve speed and VRAM over
standard industry baselines.

::: {.callout-important}
Due to breaking changes in transformers `v4.48.0`, users will need to downgrade to `<=v4.47.1` to use this patch.

This will later be deprecated in favor of [LoRA Optimizations](lora_optims.qmd).
:::


### Installation

The following will install the correct unsloth and extras from source.

```bash
python scripts/unsloth_install.py | sh
```

### Usage

Axolotl exposes a few configuration options to try out unsloth and get most of the performance gains.

Our unsloth integration is currently limited to the following model architectures:
 - llama

These options are specific to LoRA finetuning and cannot be used for multi-GPU finetuning
```yaml
unsloth_lora_mlp: true
unsloth_lora_qkv: true
unsloth_lora_o: true
```

These options are composable and can be used with multi-gpu finetuning
```yaml
unsloth_cross_entropy_loss: true
unsloth_rms_norm: true
unsloth_rope: true
```

### Limitations

- Single GPU only; e.g. no multi-gpu support
- No deepspeed or FSDP support (requires multi-gpu)
- LoRA + QLoRA support only. No full fine tunes or fp8 support.
- Limited model architecture support. Llama, Phi, Gemma, Mistral only
- No MoE support.


================================================
FILE: examples/LiquidAI/README.md
================================================
# Finetune Liquid Foundation Models 2 (LFM2) with Axolotl

[Liquid Foundation Models 2 (LFM2)](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38) are a family of small, open-weight models from [Liquid AI](https://www.liquid.ai/) focused on quality, speed, and memory efficiency. Liquid AI released text-only [LFM2](https://huggingface.co/collections/LiquidAI/lfm2-686d721927015b2ad73eaa38) and text+vision [LFM2-VL](https://huggingface.co/collections/LiquidAI/lfm2-vl-68963bbc84a610f7638d5ffa) models.

LFM2 features a new hybrid Liquid architecture with multiplicative gates, short-range convolutions, and grouped query attention, enabling fast training and inference.

This guide shows how to fine-tune both the LFM2 and LFM2-VL models with Axolotl.

Thanks to the team at LiquidAI for giving us early access to prepare for these releases.

## Getting Started

1.  Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

    Here is an example of how to install from pip:
    ```bash
    # Ensure you have a compatible version of Pytorch installed
    pip3 install packaging setuptools wheel ninja
    pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
    ```

2.  Run one of the finetuning examples below.

    **LFM2**
    ```bash
    # FFT SFT (1x48GB @ 25GiB)
    axolotl train examples/LiquidAI/lfm2-350m-fft.yaml
    ```

    **LFM2-VL**
    ```bash
    # LoRA SFT (1x48GB @ 2.7GiB)
    axolotl train examples/LiquidAI/lfm2-vl-lora.yaml
    ```

    **LFM2-MoE**
    ```bash
    pip install git+https://github.com/huggingface/transformers.git@0c9a72e4576fe4c84077f066e585129c97bfd4e6

    # LoRA SFT (1x48GB @ 16.2GiB)
    axolotl train examples/LiquidAI/lfm2-8b-a1b-lora.yaml
    ```

### TIPS

- **Installation Error**: If you encounter `ImportError: ... undefined symbol ...` or `ModuleNotFoundError: No module named 'causal_conv1d_cuda'`, the `causal-conv1d` package may have been installed incorrectly. Try uninstalling it:
  ```bash
  pip uninstall -y causal-conv1d
  ```

- **Dataset Loading**: Read more on how to load your own dataset in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html).
- **Dataset Formats**:
  - For LFM2 models, the dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
  - For LFM2-VL models, Axolotl follows the multi-content Messages format. See our [Multimodal docs](https://docs.axolotl.ai/docs/multimodal.html#dataset-format) for details.

## Optimization Guides

- [Optimizations Guide](https://docs.axolotl.ai/docs/optimizations.html)

## Related Resources

- [LFM2 Blog](https://www.liquid.ai/blog/liquid-foundation-models-v2-our-second-series-of-generative-ai-models)
- [LFM2-VL Blog](https://www.liquid.ai/blog/lfm2-vl-efficient-vision-language-models)
- [LFM2-MoE Blog](https://www.liquid.ai/blog/lfm2-8b-a1b-an-efficient-on-device-mixture-of-experts)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/LiquidAI/lfm2-350m-fft.yaml
================================================
base_model: LiquidAI/LFM2-350M

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

eot_tokens:
  - "<|im_end|>"
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_field_role: from
    message_field_content: value
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out

sequence_len: 4096
sample_packing: true


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 5e-5

bf16: true
tf32: true

gradient_checkpointing: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 2
saves_per_epoch: 1

weight_decay: 0.0

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/LiquidAI/lfm2-8b-a1b-lora.yaml
================================================
base_model: LiquidAI/LFM2-8B-A1B

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: true

eot_tokens:
  - "<|im_end|>"
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_field_role: from
    message_field_content: value
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out

sequence_len: 4096
sample_packing: true

adapter: lora
lora_model_dir:

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 4
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 5e-5

bf16: true
tf32: true

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 2
saves_per_epoch: 1

weight_decay: 0.0

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/LiquidAI/lfm2-vl-lora.yaml
================================================
base_model: LiquidAI/LFM2-VL-450M
trust_remote_code: true
model_type: AutoModelForImageTextToText
processor_type: AutoProcessor

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

adapter: lora
lora_model_dir:

sequence_len: 8192
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
logging_steps: 1
flash_attention: true
eager_attention:

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/alst/README.md
================================================
# Arctic Long Sequence Training (ALST)

Artic Long Sequence Training (ALST) is a technique for training long context models using a variety of optimization
techniques. It is a combination of:
- TiledMLP: Leverage tiling over the sequence dimension on MLP layers to reduce memory usage
- Tiled Loss: Using optimized loss functions like Liger-Kernel or Cut Cross Entropy to reduce memory usage
- Activation Offloading: Offload activations to CPU RAM to reduce memory usage

For more information, you can check out the ALST paper [here](https://www.arxiv.org/abs/2506.13996).

## Usage

```yaml
tiled_mlp: true

# See Sequence Parallelism docs
# https://docs.axolotl.ai/docs/sequence_parallelism.html
context_parallel_size: int

plugins:
# See Cut Cross Entropy docs
# https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

# or Liger Kernel docs
# https://docs.axolotl.ai/docs/custom_integrations.html#liger-kernels
  - axolotl.integrations.liger.LigerPlugin
# ...

```


================================================
FILE: examples/alst/llama3-8b-deepspeed-alst.yaml
================================================
base_model: meta-llama/Llama-3.1-8B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

datasets:
  - path: togethercomputer/Long-Data-Collections
    type: completion
    field: text
    data_files:
      - pretrain/rp_sub.jsonl.zst
  - path: princeton-nlp/TextbookChapters
    type: completion
    field: chapter
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 500_000
min_sample_len: 200_000
sample_packing: true

tiled_mlp: true
context_parallel_size: 8
plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: cosine
learning_rate: 2e-5

bf16: auto
tf32: true

gradient_checkpointing: true
activation_offloading: legacy

resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_steps: 100
saves_per_epoch: 1
evals_per_epoch: 2
weight_decay: 0.0
special_tokens:
  pad_token: <|end_of_text|>

deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/alst/llama3-8b-fsdp2-alst.yaml
================================================
base_model: meta-llama/Llama-3.1-8B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

datasets:
  - path: togethercomputer/Long-Data-Collections
    type: completion
    field: text
    data_files:
      - pretrain/rp_sub.jsonl.zst
  - path: princeton-nlp/TextbookChapters
    type: completion
    field: chapter
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 500_000
min_sample_len: 200_000
sample_packing: true

tiled_mlp: true
context_parallel_size: 8
plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: cosine
learning_rate: 2e-5

bf16: auto
tf32: true

gradient_checkpointing: true
activation_offloading: legacy

resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_steps: 100
saves_per_epoch: 1
evals_per_epoch: 2
weight_decay: 0.0
special_tokens:
  pad_token: <|end_of_text|>

fsdp_version: 2
fsdp_config:
  offload_params: false  # offloading is currently not compatible with SP + torchao optimizer
  state_dict_type: SHARDED_STATE_DICT
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: LlamaDecoderLayer
  reshard_after_forward: true

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/apertus/README.md
================================================
# Finetune Swiss-AI's Apertus with Axolotl

[Apertus](https://huggingface.co/collections/swiss-ai/apertus-llm-68b699e65415c231ace3b059) is a family of opensource models trained by Swiss-ai.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as Apertus is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).

    Here is an example of how to install from main for pip:

```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl

pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn]'

# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
python scripts/cutcrossentropy_install.py | sh
```

2. (Optional, highly recommended) Install XIELU CUDA

```bash
## Recommended for reduced VRAM and faster speeds

# Point to CUDA toolkit directory
# For those using our Docker image, use the below path.
export CUDA_HOME=/usr/local/cuda

pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
```

For any installation errors, see [XIELU Installation Issues](#xielu-installation-issues)

3. Run the finetuning example:

```bash
axolotl train examples/apertus/apertus-8b-qlora.yaml
```

This config uses about 8.7 GiB VRAM.

Let us know how it goes. Happy finetuning! 🚀

### Tips

- For inference, the official Apertus team recommends `top_p=0.9` and `temperature=0.8`.
- You can instead use full paremter fine-tuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

### XIELU Installation Issues

#### `ModuleNotFoundError: No module named 'torch'`

Please check these one by one:
- Running in correct environment
- Env has PyTorch installed
- CUDA toolkit is at `CUDA_HOME`

If those didn't help, please try the below solutions:

1. Pass env for CMAKE and try install again:

    ```bash
    Python_EXECUTABLE=$(which python) pip3 install git+https://github.com/nickjbrowning/XIELU@59d6031 --no-build-isolation --no-deps
    ```

2. Git clone the repo and manually hardcode python path:

    ```bash
    git clone https://github.com/nickjbrowning/XIELU
    cd xielu
    git checkout 59d6031

    cd xielu
    nano CMakeLists.txt  # or vi depending on your preference
    ```

    ```diff
    execute_process(
    -    COMMAND ${Python_EXECUTABLE} -c "import torch.utils; print(torch.utils.cmake_prefix_path)"
    +    COMMAND /root/miniconda3/envs/py3.11/bin/python -c "import torch.utils; print(torch.utils.cmake_prefix_path)"
        RESULT_VARIABLE TORCH_CMAKE_PATH_RESULT
        OUTPUT_VARIABLE TORCH_CMAKE_PATH_OUTPUT
        ERROR_VARIABLE TORCH_CMAKE_PATH_ERROR
    )
    ```

    ```bash
    pip3 install . --no-build-isolation --no-deps
    ```

## Optimization Guides

- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)

## Related Resources

- [Apertus Tech Report](https://github.com/swiss-ai/apertus-tech-report/blob/main/Apertus_Tech_Report.pdf)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/apertus/apertus-8b-qlora.yaml
================================================
base_model: swiss-ai/Apertus-8B-Instruct-2509

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/arcee/README.md
================================================
# Finetune ArceeAI's AFM with Axolotl

[Arcee Foundation Models (AFM)](https://huggingface.co/collections/arcee-ai/afm-45b-68823397c351603014963473) are a family of 4.5B parameter open weight models trained by Arcee.ai.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

Thanks to the team at Arcee.ai for using Axolotl in supervised fine-tuning the AFM model.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as AFM is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).

    Here is an example of how to install from main for pip:

```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl

pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn]'

# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
python scripts/cutcrossentropy_install.py | sh
```

2. Run the finetuning example:

```bash
axolotl train examples/arcee/afm-4.5b-qlora.yaml
```

This config uses about 7.8GiB VRAM.

Let us know how it goes. Happy finetuning! 🚀

### TIPS

- For inference, the official Arcee.ai team recommends `top_p: 0.95`, `temperature: 0.5`, `top_k: 50`, and `repeat_penalty: 1.1`.
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

## Optimization Guides

- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)

## Related Resources

- [AFM Blog](https://docs.arcee.ai/arcee-foundation-models/introduction-to-arcee-foundation-models)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/arcee/afm-4.5b-qlora.yaml
================================================
base_model: arcee-ai/AFM-4.5B

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/archived/README.md
================================================
# Archived Examples

This directory contains examples that are no longer maintained and may no longer be functional.

We keep them around for archival purposes in case they are useful to others.


================================================
FILE: examples/archived/cerebras/btlm-ft.yml
================================================
base_model: cerebras/btlm-3b-8k-base
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: GPT2Tokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true
tokenizer_use_fast: true
tokenizer_legacy: true
push_dataset_to_hub:
hf_use_auth_token: true
datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path: last_prepared_run
val_set_size: 0.05

adapter:
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len:
sample_packing: false
sample_packing_eff_est:
sample_packing_seq_len_multiplier:
total_num_tokens:

lora_r:
lora_alpha:
lora_dropout:
lora_target_modules:
lora_target_linear:

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

output_dir: ./outputs/btlm-out
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_fused
adam_beta2: 0.95
adam_eps: 0.000000001
max_grad_norm: 1.0

torchdistx_path:
lr_scheduler: cosine
lr_quadratic_warmup: true
learning_rate: 0.000085
train_on_inputs: true
group_by_length: false
bf16: auto
tf32: true

gradient_checkpointing: false
resume_from_checkpoint:
logging_steps: 1

flash_attention: true
sdp_attention:
flash_optimum:

gptq_groupsize:
gptq_model_v1:

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
save_total_limit:

weight_decay: 0.1
special_tokens:
  pad_token: "<|endoftext|>"
fsdp:
#  - full_shard
#  - auto_wrap
fsdp_config:
#  fsdp_state_dict_type: FULL_STATE_DICT
#  fsdp_transformer_layer_cls_to_wrap: BTLMBlock


================================================
FILE: examples/archived/cerebras/qlora.yml
================================================
base_model: cerebras/Cerebras-GPT-1.3B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true
push_dataset_to_hub:
datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
adapter: qlora
lora_model_dir:
sequence_len: 2048
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
  - c_fc
  - c_attn
  - c_proj
lora_target_linear:
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/qlora-out
batch_size: 4
micro_batch_size: 4
num_epochs: 2
optimizer: paged_adamw_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0002
bf16: auto
tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1
special_tokens:
  pad_token: "<|endoftext|>"


================================================
FILE: examples/archived/code-llama/13b/lora.yml
================================================
base_model: codellama/CodeLlama-13b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true


adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"


================================================
FILE: examples/archived/code-llama/13b/qlora.yml
================================================
base_model: codellama/CodeLlama-13b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/qlora-out

adapter: qlora
lora_model_dir:

sequence_len: 4096
sample_packing: true


lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: paged_adamw_32bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"


================================================
FILE: examples/archived/code-llama/34b/lora.yml
================================================
base_model: codellama/CodeLlama-34b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true


adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"


================================================
FILE: examples/archived/code-llama/34b/qlora.yml
================================================
base_model: codellama/CodeLlama-34b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/qlora-out

adapter: qlora
lora_model_dir:

sequence_len: 4096
sample_packing: true


lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: paged_adamw_32bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"


================================================
FILE: examples/archived/code-llama/7b/lora.yml
================================================
base_model: codellama/CodeLlama-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true


adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"


================================================
FILE: examples/archived/code-llama/7b/qlora.yml
================================================
base_model: codellama/CodeLlama-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: CodeLlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/qlora-out

adapter: qlora
lora_model_dir:

sequence_len: 4096
sample_packing: true


lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: paged_adamw_32bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"


================================================
FILE: examples/archived/code-llama/README.md
================================================
# Overview

This is an example of CodeLLaMA configuration for 7b, 13b and 34b.

The 7b variant fits on any 24GB VRAM GPU and will take up about 17 GB of VRAM during training if using qlora and 20 GB if using lora. On a RTX 4090 it trains 3 epochs of the default dataset in about 15 minutes.

The 13b variant will fit if you change these settings to these values:
gradient_accumulation_steps: 2
micro_batch_size: 1

The 34b variant does not fit on 24GB of VRAM - you will need something with +40 gb VRAM that also supports flash attention v2 - A6000 or A100 are good choices.

```shell
accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/qlora.yml

```
or

```shell
accelerate launch scripts/finetune.py examples/code-llama/[MODEL_SIZE]/lora.yml

```


================================================
FILE: examples/archived/dbrx/16bit-lora.yaml
================================================
base_model: LnL-AI/dbrx-base-converted-v2
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 512
sample_packing: false
pad_to_sequence_len: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

adapter: lora
lora_model_dir:
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
# w1, w2, & v1 will hang the trainer
lora_target_modules:
  - q_proj # attn
  - k_proj # attn
  - v_proj # attn
  - out_proj # attn
  - layer # router
#  - w1
#  - w2
#  - v1

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: false  # don't use with fsdp_activation_checkpointing
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1

weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: false
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: DbrxBlock
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_activation_checkpointing: true


================================================
FILE: examples/archived/dbrx/8bit-lora.yaml
================================================
base_model: LnL-AI/dbrx-base-converted-v2
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: true
load_in_4bit: false

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 512
sample_packing: false
pad_to_sequence_len: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

adapter: lora
lora_model_dir:
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
# w1, w2, & v1 will hang the trainer
lora_target_modules:
  - q_proj # attn
  - k_proj # attn
  - v_proj # attn
  - out_proj # attn
  - layer # router
#  - w1
#  - w2
#  - v1

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: false  # don't use with fsdp_activation_checkpointing
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1

weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: false
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: DbrxBlock
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_activation_checkpointing: true


================================================
FILE: examples/archived/dbrx/README.md
================================================
# DBRX MoE

Currently, for LoRA, only the `q_proj`, `k_proj`, `v_proj` `out_proj` and `layer` Linear layers are trainable.

We are using the "converted" base models based on [this issue](https://huggingface.co/databricks/dbrx-instruct/discussions/10)
where the Experts are fused as an `nn.Parameter` rather than a `nn.Linear` layer. However, the implementation
is still a bit buggy and attempting to train a LoRA adapter over those `w1`, `w2` and `v1` layers
results in the trainer hanging.


### FSDP
We've tested using the [`LnL-AI/dbrx-base-converted-v2`](https://huggingface.co/LnL-AI/dbrx-base-converted-v2) model as the base model for FSDP.

The high memory usage seen w/ FSDP is due to FSDP not supporting 8bit optimizers.

- 16-bit LoRA w/ FSDP
  - ✅ w/o CPU Offload - 8x80GB uses ~80GiB/gpu
  - ❌ w/ CPU Offload - `paged_adamw_8bit` optimizer errors from being on cpu
- ✅ 8-bit LoRA w/ FSDP
- ❌ 4-bit QLoRA w/ FSDP - errors w/: `Error an illegal memory access was encountered at line 90 in file /src/csrc/ops.cu`
- ✅ bf16 full finetune w/ FSDP, freezing all but first 8 layers (8x80GB uses ~78GiB/gpu)


### Deepspeed

WIP


================================================
FILE: examples/archived/dbrx/fft-ds-zero3.yaml
================================================
base_model: LnL-AI/dbrx-base-converted-v2
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 512
sample_packing: false
pad_to_sequence_len: false

unfrozen_parameters:
  - transformer.blocks.[0-7].

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1

weight_decay: 0.0
deepspeed: deepspeed_configs/zero3_bf16.json


================================================
FILE: examples/archived/deepcoder/deepcoder-14B-preview-lora.yml
================================================
base_model: agentica-org/DeepCoder-14B-Preview
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false
strict: false

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true
eval_sample_packing: false


adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/archived/falcon/config-7b-lora.yml
================================================
base_model: tiiuae/falcon-7b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
trust_remote_code: true

load_in_8bit: true
load_in_4bit: false
gptq: false
push_dataset_to_hub:
datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca:chat
dataset_prepared_path:
val_set_size: 0.05
adapter: lora
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len:
lora_r: 16
lora_alpha: 32
lora_dropout: 0.0
lora_target_linear: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/falcon-7b
batch_size: 2
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.00003
bf16: auto
tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  pad_token: "<|endoftext|>"
  bos_token: "<|endoftext|>"
  eos_token: "<|endoftext|>"


================================================
FILE: examples/archived/falcon/config-7b-qlora.yml
================================================
# 1b: tiiuae/falcon-rw-1b
# 40b: tiiuae/falcon-40b
base_model: tiiuae/falcon-7b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
trust_remote_code: true


load_in_8bit: false
# enable 4bit for QLoRA
load_in_4bit: true
gptq: false
push_dataset_to_hub:
datasets:
  - path: QingyiSi/Alpaca-CoT
    data_files:
      - Chain-of-Thought/formatted_cot_data/gsm8k_train.json
    type: "alpaca:chat"
dataset_prepared_path:
val_set_size: 0.05
# enable QLoRA
adapter: qlora
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len:

# hyperparameters from QLoRA paper Appendix B.2
# "We find hyperparameters to be largely robust across datasets"
lora_r: 64
lora_alpha: 16
# 0.1 for models up to 13B
# 0.05 for 33B and 65B models
lora_dropout: 0.05
# add LoRA modules on all linear layers of the base model
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/qlora-out

# QLoRA paper Table 9
# - 16 for 7b & 13b
# - 32 for 33b, 64 for 64b
# Max size tested on A6000
# - 7b: 40
# - 40b: 4
# decrease if OOM, increase for max VRAM utilization
micro_batch_size: 1
gradient_accumulation_steps: 2
num_epochs: 4
# Optimizer for QLoRA
optimizer: paged_adamw_32bit
torchdistx_path:
lr_scheduler: cosine
# QLoRA paper Table 9
# - 2e-4 for 7b & 13b
# - 1e-4 for 33b & 64b
learning_rate: 0.0002
bf16: auto
tf32: true
gradient_checkpointing: true
# stop training after this many evaluation losses have increased in a row
# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
early_stopping_patience: 3
resume_from_checkpoint:
auto_resume_from_checkpoints: true
logging_steps: 1
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.000001
special_tokens:
  pad_token: "<|endoftext|>"
  bos_token: "<|endoftext|>"
  eos_token: "<|endoftext|>"


================================================
FILE: examples/archived/falcon/config-7b.yml
================================================
base_model: tiiuae/falcon-7b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# required by falcon custom model code: https://huggingface.co/tiiuae/falcon-7b/tree/main
trust_remote_code: true
gptq: false
push_dataset_to_hub:
datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca:chat
dataset_prepared_path:
val_set_size: 0.05
adapter:
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len:
lora_r: 64
lora_alpha: 32
lora_dropout: 0.0
lora_target_linear: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/falcon-7b
batch_size: 2
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.00003
bf16: auto
tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  pad_token: "<|endoftext|>"
  bos_token: "<|endoftext|>"
  eos_token: "<|endoftext|>"


================================================
FILE: examples/archived/gemma/qlora.yml
================================================
# use google/gemma-7b if you have access
base_model: mhenrichsen/gemma-7b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

# huggingface repo
datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
val_set_size: 0.1
output_dir: ./outputs/out

adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

sequence_len: 4096
sample_packing: true
eval_sample_packing: false


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:


gradient_accumulation_steps: 3
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/archived/gptj/qlora.yml
================================================
base_model: EleutherAI/gpt-j-6b
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true
push_dataset_to_hub:
datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
adapter: qlora
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len:
lora_r: 8
lora_alpha: 32
lora_dropout: 0.05
lora_target_linear: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/qlora-out
gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 2
optimizer: paged_adamw_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0001
bf16: auto
tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1
special_tokens:
  pad_token: "<|endoftext|>"


================================================
FILE: examples/archived/jeopardy-bot/config.yml
================================================
base_model: huggyllama/llama-7b
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
datasets:
  - path: openaccess-ai-collective/jeopardy
    type: jeopardy
dataset_prepared_path:
val_set_size: 0.02
adapter:
lora_model_dir:
sequence_len: 512
max_packed_sequence_len:
lora_r:
lora_alpha:
lora_dropout:
lora_target_modules:
lora_fan_in_fan_out: false
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/jeopardy-bot-7b
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.00003
bf16: auto
tf32: true
resume_from_checkpoint:
logging_steps: 5
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1
tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"


================================================
FILE: examples/archived/mpt-7b/README.md
================================================
# MPT-7B

```shell
accelerate launch scripts/finetune.py examples/mpt-7b/config.yml

```


================================================
FILE: examples/archived/mpt-7b/config.yml
================================================
base_model: mosaicml/mpt-7b
# optionally might have model_type or tokenizer_type
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true  # required for mpt as their model class is not merged into transformers yet
load_in_8bit: false
datasets:
  - path: vicgalle/alpaca-gpt4
    type: alpaca
dataset_prepared_path:
val_set_size: 0.02
adapter:
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len:
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
  - q_proj
  - v_proj
lora_fan_in_fan_out: false
wandb_project: mpt-alpaca-7b
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/mpt-alpaca-7b
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0000002
bf16: auto
tf32: true
resume_from_checkpoint:
logging_steps: 5
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0001
tokens:
  pad_token: "<|padding|>"
  bos_token: "<|endoftext|>"
  eos_token: "<|endoftext|>"
  unk_token: "<|endoftext|>"


================================================
FILE: examples/archived/openllama-3b/README.md
================================================
# openllama-3b

Basic full tune
```shell
accelerate launch scripts/finetune.py examples/openllama-3b/config.yml
```

LoRA
```shell
accelerate launch scripts/finetune.py examples/openllama-3b/lora.yml
```

QLoRA
```shell
accelerate launch scripts/finetune.py examples/openllama-3b/qlora.yml
```


================================================
FILE: examples/archived/openllama-3b/config.yml
================================================
base_model: openlm-research/open_llama_3b_v2
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
push_dataset_to_hub:
datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
dataset_prepared_path:
val_set_size: 0.02
adapter:
lora_model_dir:
sequence_len: 1024
sample_packing: true
lora_r:
lora_alpha:
lora_dropout:
lora_target_modules:
lora_target_linear:
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/openllama-out
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.000003
float16: true
bf16: false
fp16: false
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"


================================================
FILE: examples/archived/openllama-3b/lora.yml
================================================
base_model: openlm-research/open_llama_3b_v2
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false
push_dataset_to_hub:
datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
dataset_prepared_path:
val_set_size: 0.02
adapter: lora
lora_model_dir:
sequence_len: 1024
sample_packing: true
lora_r: 8
lora_alpha: 16
lora_dropout: 0.0
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/lora-out
gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0002
bf16: false
fp16: true
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"


================================================
FILE: examples/archived/openllama-3b/qlora.yml
================================================
base_model: openlm-research/open_llama_3b_v2
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true
push_dataset_to_hub:
datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
adapter: qlora
lora_model_dir:
sequence_len: 1024
sample_packing: true
lora_r: 8
lora_alpha: 32
lora_dropout: 0.05
lora_target_linear: true
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/qlora-out
gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 4
optimizer: paged_adamw_32bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0002
bf16: false
fp16: true
tf32: false
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"


================================================
FILE: examples/archived/pythia/lora.yml
================================================
base_model: EleutherAI/pythia-1.4b-deduped
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
adapter: lora
lora_model_dir:
sequence_len: 512
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
  - query_key_value
lora_target_linear:
lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/lora-alpaca-pythia
gradient_accumulation_steps: 1
micro_batch_size: 4
num_epochs: 4
learning_rate: 0.00001
bf16: auto
tf32: true
resume_from_checkpoint:
weight_decay: 0.1
evals_per_epoch: 4
logging_steps: 1


================================================
FILE: examples/archived/pythia-12b/README.md
================================================
# Pythia 12B

- Single-GPU A100 only (?)

```shell
python scripts/finetune.py examples/pythia-12b/config.yml
```

⚠️ Multiple-GPU A100 - Doesn't seem to work with multi-gpu without causing OOM! ⚠️


================================================
FILE: examples/archived/pythia-12b/config.yml
================================================
base_model: EleutherAI/pythia-12b-deduped
base_model_ignore_patterns: pytorch*  # prefer safetensors
# optionally might have model_type or tokenizer_type
model_type: GPTNeoXForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
gptq: false
device_map: auto
datasets:
  - path: vicgalle/alpaca-gpt4
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
adapter:
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len: 2048
lora_r: 64
lora_alpha: 32
lora_dropout: 0.0
lora_target_linear: true
lora_fan_in_fan_out: true  # pythia/GPTNeoX lora specific
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/pythia-12b
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 5
learning_rate: 0.00003
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
bf16: false
fp16: false
float16: true
tf32: true
flash_optimum: true
resume_from_checkpoint:
gradient_checkpointing: true


================================================
FILE: examples/archived/qwen/README.md
================================================
# Qwen

TODO

# Qwen2 MoE

✅ multipack
✅ qwen2_moe 4-bit QLoRA
✅ qwen2_moe 16-bit LoRA
❓ qwen2_moe 8-bit LoRA


================================================
FILE: examples/archived/qwen/lora.yml
================================================
base_model: Qwen/Qwen-7B
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: true
load_in_4bit: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 2048  # supports up to 8192
sample_packing: false
pad_to_sequence_len:

adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: false
resume_from_checkpoint:
logging_steps: 1
flash_attention:

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/archived/qwen/qlora.yml
================================================
base_model: Qwen/Qwen-7B
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 2048  # supports up to 8192
sample_packing: false
pad_to_sequence_len:

adapter: qlora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: false
resume_from_checkpoint:
logging_steps: 1
flash_attention:

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/archived/qwen/qwen2-moe-lora.yaml
================================================
base_model: Qwen/Qwen1.5-MoE-A2.7B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/out

sequence_len: 1024  # supports up to 32k
sample_packing: false
pad_to_sequence_len: false

adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/archived/qwen/qwen2-moe-qlora.yaml
================================================
base_model: Qwen/Qwen1.5-MoE-A2.7B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/out

sequence_len: 1024  # supports up to 32k
sample_packing: false
pad_to_sequence_len: false

adapter: qlora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/archived/redpajama/README.md
================================================
# RedPajama 3B preview release

```shell
accelerate launch scripts/finetune.py examples/redpajama/config-3b.yml

```


================================================
FILE: examples/archived/redpajama/config-3b.yml
================================================
base_model: togethercomputer/RedPajama-INCITE-Chat-3B-v1
# optionally might have model_type or tokenizer_type
model_type: GPTNeoXForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code:
load_in_8bit: false
datasets:
  - path: vicgalle/alpaca-gpt4
    type: alpaca
dataset_prepared_path:
val_set_size: 0.02
adapter:
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len:
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
  - q_proj
  - v_proj
lora_fan_in_fan_out: false
wandb_project: redpajama-alpaca-3b
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/redpajama-alpaca-3b
batch_size: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
torchdistx_path:
lr_scheduler: cosine
learning_rate: 0.0000002
bf16: auto
tf32: true
resume_from_checkpoint:
logging_steps: 5
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0001
tokens:
  pad_token: "<|padding|>"
  bos_token: "<|endoftext|>"
  eos_token: "<|endoftext|>"
  unk_token: "<|endoftext|>"


================================================
FILE: examples/archived/replit-3b/config-lora.yml
================================================
base_model: replit/replit-code-v1-3b
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true
load_in_8bit: false
datasets:
  - path: vicgalle/alpaca-gpt4
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
adapter: lora
lora_model_dir:
sequence_len: 2048
max_packed_sequence_len:
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
  - Wqkv
  - mlp_up
  - mlp_down
wandb_project: lora-replit
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/lora-replit
batch_size: 8
micro_batch_size: 1
num_epochs: 4
optimizer:
torchdistx_path:
lr_scheduler:
learning_rate: 0.00001
bf16: auto
tf32: true
gradient_checkpointing:
resume_from_checkpoint:
logging_steps: 1
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0
#special_tokens:


================================================
FILE: examples/archived/stablelm-2/1.6b/fft.yml
================================================
base_model: stabilityai/stablelm-2-1_6b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out

sequence_len: 4096
sample_packing: true


adapter:
lora_model_dir:
lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true
flash_attn_fuse_mlp: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1

deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
weight_decay: 0.1
special_tokens:


================================================
FILE: examples/archived/stablelm-2/1.6b/lora.yml
================================================
base_model: stabilityai/stablelm-2-1_6b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: true
load_in_4bit: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true


adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/archived/stablelm-2/README.md
================================================
# StableLM 2

This repository contains examples for training and processing using StableLM-2. It also includes a section to help you estimate the GPU requirements for your specific use case.

## Estimating GPU Requirements

| type          | deepspeed | batch size | context length | vRAM GPU (GBs) |
|---------------|-----------|------------|----------------|----------------|
| full finetune | N/A       | 1          | 4096           | ~21.5GBs       |
| full finetune | zero2     | 1          | 4096           | ~20GBs         |
| lora          | N/A       | 1          | 4096           | ~16.6GBs       |

The above are estimates and might differ slight depending on the setup for example whether you pack your sequence lengths or not (the above assumes you do to length 4096).

This blog post from Hamel Husain was a great resource for estimating these numbers: https://hamel.dev/notes/llm/03_estimating_vram.html

## Training
We have example scripts here for both full finetuning and lora using the popular alpaca dataset:

```shell
# preprocess the dataset
CUDA_VISIBLE_DEVICES="" python -m axolotl.cli.preprocess examples/stablelm-2/1.6b/lora.yml
```

Single GPU Training:
```shell
python -m axolotl.cli.train examples/stablelm-2/fft.yml --deepspeed deepspeed_configs/zero2.json
# OR
python -m axolotl.cli.train examples/stablelm-2/1.6b/lora.yml
```

Multinode GPU Training with `accelerate`:
```shell
# make sure you've configured accelerate properly
accelerate launch -m axolotl.cli.train examples/stablelm-2/1.6b/fft.yml --deepspeed deepspeed_configs/zero2.json
```


================================================
FILE: examples/archived/starcoder2/qlora.yml
================================================
base_model: bigcode/starcoder2-3b
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca


dataset_prepared_path:
val_set_size: 0.2
output_dir: ./outputs/qlora

adapter: qlora
lora_model_dir:

sequence_len: 8192
sample_packing: true


lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_run_id:
wandb_log_model:

gradient_accumulation_steps: 8
micro_batch_size: 2
num_epochs: 3
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 2e-5

bf16: auto
fp16: false
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
eval_steps:
saves_per_epoch: 4
save_steps:
save_total_limit: 2
weight_decay:
special_tokens:


================================================
FILE: examples/archived/tiny-llama/README.md
================================================
# Overview

This is a simple example of how to finetune TinyLlama1.1B using either lora or qlora:

LoRa:

```
accelerate launch -m axolotl.cli.train examples/tiny-llama/lora.yml
```

qLoRa:

```
accelerate launch -m axolotl.cli.train examples/tiny-llama/qlora.yml
```

Both take about 10 minutes to complete on a 4090.


================================================
FILE: examples/archived/tiny-llama/lora-mps.yml
================================================
base_model: TinyLlama/TinyLlama_v1.1
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true

eval_sample_packing: false

adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
fp16: false
tf32: true

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: false

warmup_ratio: 0.1
evals_per_epoch: 0
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/archived/tiny-llama/lora.yml
================================================
base_model: TinyLlama/TinyLlama_v1.1
# optionally might have model_type or tokenizer_type
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true
eval_sample_packing: false


adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/archived/tiny-llama/pretrain.yml
================================================
base_model: TinyLlama/TinyLlama-1.1B-Chat-v1.0
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

max_steps: 200
pretraining_dataset:
  - path: allenai/c4
    name: en
    type: pretrain
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/model-out

sequence_len: 2048
sample_packing: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/archived/tiny-llama/qlora.yml
================================================
base_model: TinyLlama/TinyLlama_v1.1
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/qlora-out

adapter: qlora
lora_model_dir:

sequence_len: 4096
sample_packing: true
eval_sample_packing: false


lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: paged_adamw_32bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/archived/xgen-7b/xgen-7b-8k-qlora.yml
================================================
# An example finetuning Saleforce's XGen-7b model with 8k context using qlora
# on Tim Dettmer's Guanaco dataset.
base_model: Salesforce/xgen-7b-8k-base
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false
# enable 4bit for QLoRA
load_in_4bit: true
gptq: false
push_dataset_to_hub:
datasets:
  - path: timdettmers/openassistant-guanaco
    data_files:
      - openassistant_best_replies_train.jsonl
    type: "completion"
dataset_prepared_path:
val_set_size: 0.05
# enable QLoRA
adapter: qlora
lora_model_dir:
sequence_len: 8192
max_packed_sequence_len:

# hyperparameters from QLoRA paper Appendix B.2
# "We find hyperparameters to be largely robust across datasets"
lora_r: 64
lora_alpha: 16
# 0.1 for models up to 13B
# 0.05 for 33B and 65B models
lora_dropout: 0.05
# add LoRA modules on all linear layers of the base model
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/qlora-out

# QLoRA paper Table 9
# - 16 for 7b & 13b
# - 32 for 33b, 64 for 64b
# Max size tested on A6000
# - 7b: 40
# - 40b: 4
# decrease if OOM, increase for max VRAM utilization
micro_batch_size: 1
gradient_accumulation_steps: 1
num_epochs: 4
# Optimizer for QLoRA
optimizer: paged_adamw_32bit
torchdistx_path:
lr_scheduler: cosine
# QLoRA paper Table 9
# - 2e-4 for 7b & 13b
# - 1e-4 for 33b & 64b
learning_rate: 0.00002
bf16: auto
tf32: false
gradient_checkpointing: true
# stop training after this many evaluation losses have increased in a row
# https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback
early_stopping_patience: 3
resume_from_checkpoint:
auto_resume_from_checkpoints: true
logging_steps: 1
xformers_attention: true
flash_attention:
gptq_groupsize:
gptq_model_v1:
warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  eos_token: "<|endoftext|>"
  bos_token: "<|endoftext|>"
  unk_token: "<|endoftext|>"
  pad_token: "<|endoftext|>"


================================================
FILE: examples/archived/yi-34B-chat/README.md
================================================
# Overview

This is an example of a Yi-34B-Chat configuration. It demonstrates that it is possible to finetune a 34B model on a GPU with 24GB of VRAM.

Tested on an RTX 4090 with `python -m axolotl.cli.train examples/mistral/qlora.yml`, a single epoch of finetuning on the alpaca dataset using qlora runs in 47 mins, using 97% of available memory.


================================================
FILE: examples/archived/yi-34B-chat/qlora.yml
================================================
base_model: 01-ai/Yi-34B-Chat
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true
sequence_len: 1024
bf16: auto
tf32: false
flash_attention: true
special_tokens:
  bos_token: "<|startoftext|>"
  eos_token: "<|endoftext|>"
  unk_token: "<unk>"

# Data
datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
warmup_ratio: 0.1

# Iterations
num_epochs: 1

# Evaluation
val_set_size: 0.1
evals_per_epoch: 5
eval_sample_packing: false
eval_batch_size: 1

# LoRA
output_dir: ./outputs/qlora-out
adapter: qlora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:

# Sampling
sample_packing: false
pad_to_sequence_len: false

# Batching
gradient_accumulation_steps: 4
micro_batch_size: 1
gradient_checkpointing: true

# wandb
wandb_project:

# Optimizer
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

# Misc
resume_from_checkpoint:
logging_steps: 1
weight_decay: 0


================================================
FILE: examples/cloud/baseten.yaml
================================================
provider: baseten
project_name:

secrets:
  - HF_TOKEN
  - WANDB_API_KEY

gpu: h100
gpu_count: 8
node_count: 1


================================================
FILE: examples/cloud/modal.yaml
================================================
project_name:
volumes:
  - name: axolotl-data
    mount: /workspace/data
  - name: axolotl-artifacts
    mount: /workspace/artifacts

# environment variables from local to set as secrets
secrets:
  - HF_TOKEN
  - WANDB_API_KEY

# Which branch of axolotl to use remotely
branch:

# additional custom commands when building the image
dockerfile_commands:

gpu: h100
gpu_count: 1

# Train specific configurations
memory: 128
timeout: 86400

# Preprocess specific configurations
memory_preprocess: 32
timeout_preprocess: 14400


================================================
FILE: examples/cohere/command-r-7b-qlora.yml
================================================
base_model: CohereForAI/c4ai-command-r7b-12-2024
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

load_in_8bit: false
load_in_4bit: true

# huggingface repo
chat_template: cohere
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

val_set_size: 0.0
output_dir: ./outputs/out

adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

sequence_len: 2048
sample_packing: true
eval_sample_packing: false


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/colab-notebooks/colab-axolotl-example.ipynb
================================================
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "OPLSwmgdrB7g"
   },
   "source": [
    "# Fine-Tune Qwen3 14B with Axolotl\n",
    "\n",
    "[<img src=\"https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png\" alt=\"Built with Axolotl\" width=\"200\" height=\"32\"/>](https://github.com/axolotl-ai-cloud/axolotl)\n",
    "\n",
    "Axolotl is the most performant LLM post-training framework available, delivering faster training with efficient, consistent and stable performance. Train your workload and ship your product 30% faster; saving you both time and money.\n",
    "\n",
    "- ⭐ us on [GitHub](https://github.com/axolotl-ai-cloud/axolotl)\n",
    "- 📜 Read the [Docs](http://docs.axolotl.ai/)\n",
    "- 💬 Chat with us on [Discord](https://discord.gg/mnpEYgRUmD)\n",
    "- 📰 Get updates on [X/Twitter](https://x.com/axolotl_ai)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "rVjKD7CbxIP3"
   },
   "source": [
    "# Installation\n",
    "\n",
    "Axolotl is easy to install from [pip](https://pypi.org/project/axolotl/), or use our [pre-built Docker images](http://docs.axolotl.ai/docs/docker.html) for a hassle free dependency experience. See our [docs](http://docs.axolotl.ai/docs/installation.html) for more information."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "msOCO4NRmRLa"
   },
   "outputs": [],
   "source": [
    "%%capture\n",
    "# This step can take ~5-10 minutes to install dependencies\n",
    "!pip install --no-build-isolation axolotl[flash-attn]>=0.9.1\n",
    "!pip install \"cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@63b15e6\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "N0OW0YeksDLr"
   },
   "source": [
    "## Demo: Talk Like a Pirate\n",
    "\n",
    "In this demo, we are training the model ***to respond like a pirate***. This was chosen as a way to easily show how to train a model to respond in a certain style of your choosing (without being prompted) and is quite easy to validate within the scope of a Colab."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "8Du2fANTsNCK"
   },
   "source": [
    "### Upload your own dataset or use a Huggingface dataset\n",
    "\n",
    "You can choose to use your own JSONL file from your own [Google Drive](https://drive.google.com/drive/home); for example downloading the [Pirate-Ultrachat JSONL](https://huggingface.co/datasets/winglian/pirate-ultrachat-10k/blob/main/train.jsonl) to your Google Drive. JSONL datasets should be formatted similar to the [OpenAI dataset format](https://cookbook.openai.com/examples/chat_finetuning_data_prep).\n",
    "\n",
    "You can also simply use the [`winglian/pirate-ultrachat-10k`](https://huggingface.co/datasets/winglian/pirate-ultrachat-10k) dataset directly.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "fGEEjyQ-r_IV"
   },
   "outputs": [],
   "source": [
    "# Default to HF dataset location\n",
    "dataset_id = \"winglian/pirate-ultrachat-10k\"\n",
    "uploaded = {}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "c5MyYqk7vIsG"
   },
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Optionally, upload your own JSONL to your Google Drive\n",
    "GOOGLE_DRIVE_PATH = \"\"  # ex: \"MyDrive/Colab\\ Notebooks/train.jsonl\"\n",
    "\n",
    "# \"Select All\" permissions, or you may get the error:\n",
    "# \"MessageError: Error: credential propagation was unsuccessful\"\n",
    "if GOOGLE_DRIVE_PATH:\n",
    "    from google.colab import drive\n",
    "\n",
    "    # Mount your Google Drive\n",
    "    GOOGLE_DRIVE_MNT = \"/content/drive/\"\n",
    "    drive.mount(GOOGLE_DRIVE_MNT, force_remount=True)\n",
    "    tmp_path = os.path.join(GOOGLE_DRIVE_MNT, GOOGLE_DRIVE_PATH.lstrip(\"/\"))\n",
    "    # make sure file exists\n",
    "    if not os.path.isfile(tmp_path):\n",
    "        raise ValueError(f\"File {tmp_path} does not exist\")\n",
    "    dataset_id = tmp_path"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "U6pTk3A9xj1W"
   },
   "source": [
    "# Configure for Supervised Fine-Tuning (SFT)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 151,
     "referenced_widgets": [
      "388f618924274d21a066f098f4f1e744",
      "7c95f85a2b1f47a1bd846d110c47bb3c",
      "083f9cda8d754c168beee10d2f8955a2",
      "62e1a65582f446a78612eaa804e08a7d",
      "487a177d020f4605834878b2fdc7afa3",
      "7fd44cf9ca6e4726bfd7ac21846d6a14",
      "366a343b62fa47d8985a3bd464d99f9e",
      "a0a11e929edd4189b79723d618522c33",
      "e87ea87fcff247b5bbcc331ba79a8dc2",
      "5e18768f7ad6434ba8b8b8a2e853e204",
      "bb33aec33a6447078c31bfd728942994"
     ]
    },
    "id": "fdRioqytmTtX",
    "outputId": "f0acdcec-4b41-4a3f-ffed-c2d2d929158e"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2025-05-08 13:40:27,488] [INFO] [root.register:348] [PID:174] Attempting to load plugin: axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n",
      "[2025-05-08 13:40:27,493] [INFO] [root.register:351] [PID:174] Plugin loaded successfully: axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\n",
      "[2025-05-08 13:40:27,959] [INFO] [axolotl.utils.schemas.config.check_eval_packing:721] [PID:174] [RANK:0] explicitly setting `eval_sample_packing` to match `sample_packing`\u001b[39m\n",
      "[2025-05-08 13:40:27,960] [INFO] [axolotl.utils.schemas.config.hint_sample_packing_padding:514] [PID:174] [RANK:0] Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing\u001b[39m\n",
      "[2025-05-08 13:40:27,961] [INFO] [axolotl.utils.schemas.config.check_bf16:1251] [PID:174] [RANK:0] bf16 support detected, but not enabled for this configuration.\u001b[39m\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "388f618924274d21a066f098f4f1e744",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "config.json:   0%|          | 0.00/728 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2025-05-08 13:40:28,590] [INFO] [axolotl.normalize_config:237] [PID:174] [RANK:0] cuda memory usage baseline: 0.000GB (+0.002GB cache, +0.359GB misc)\u001b[39m\n"
     ]
    }
   ],
   "source": [
    "from axolotl.cli.config import load_cfg\n",
    "from axolotl.utils.dict import DictDefault\n",
    "\n",
    "# Axolotl provides full control and transparency over model and training configuration\n",
    "config = DictDefault(\n",
    "    base_model=\"Qwen/Qwen3-14B\",  # Use the instruct tuned model, but we're aligning it to be a pirate\n",
    "    load_in_4bit=True,  # set to True for qLoRA\n",
    "    adapter=\"qlora\",\n",
    "    lora_r=32,\n",
    "    lora_alpha=64,\n",
    "    lora_target_modules=[\n",
    "        \"q_proj\",\n",
    "        \"k_proj\",\n",
    "        \"v_proj\",\n",
    "        \"o_proj\",  # train self_attn linear modules\n",
    "        \"gate_proj\",\n",
    "        \"down_proj\",\n",
    "        \"up_proj\",  # train MLP linear modules\n",
    "    ],\n",
    "    lora_qkv_kernel=True,  # optimized triton kernels for LoRA\n",
    "    lora_o_kernel=True,\n",
    "    lora_mlp_kernel=True,\n",
    "    embeddings_skip_upcast=True,  # keep embeddings in fp16 so the model fits in 15GB VRAM\n",
    "    xformers_attention=True,  # use xformers on Colab w/ T4 for memory efficient attention, flash_attention only on Ampere or above\n",
    "    plugins=[\n",
    "        # more efficient training using Apple's Cut Cross Entropy; https://github.com/apple/ml-cross-entropy\n",
    "        \"axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin\",\n",
    "    ],\n",
    "    sample_packing=True,  # 2-6x increase in tokens per micro-batch\n",
    "    # when using packing, use a slightly higher learning rate to account for fewer steps\n",
    "    # alternatively, reduce the micro_batch_size + gradient_accumulation_steps to achieve closer to the same number of steps/epoch\n",
    "    learning_rate=0.00019,\n",
    "    sequence_len=4096,  # larger sequence length improves packing efficiency for more tokens/sec\n",
    "    micro_batch_size=1,\n",
    "    gradient_accumulation_steps=1,\n",
    "    gradient_checkpointing=True,  # tradeoff reduced VRAM for increased time\n",
    "    gradient_checkpointing_kwargs={\n",
    "        \"use_reentrant\": False,\n",
    "    },\n",
    "    optimizer=\"paged_adamw_8bit\",\n",
    "    lr_scheduler=\"cosine\",\n",
    "    warmup_steps=5,\n",
    "    fp16=True,  # use float16 + automatic mixed precision, bfloat16 not supported on Colab w/ T4\n",
    "    bf16=False,\n",
    "    max_grad_norm=0.1,  # gradient clipping\n",
    "    num_epochs=1,\n",
    "    saves_per_epoch=2,  # how many checkpoints to save over one epoch\n",
    "    logging_steps=1,\n",
    "    output_dir=\"./outputs/qwen-sft-pirate-rrr\",\n",
    "    chat_template=\"qwen3\",\n",
    "    datasets=[\n",
    "        {\n",
    "            \"path\": dataset_id,  # Huggingface Dataset id or path to train.jsonl\n",
    "            \"type\": \"chat_template\",\n",
    "            \"split\": \"train\",\n",
    "            \"eot_tokens\": [\"<|im_end|>\"],\n",
    "        }\n",
    "    ],\n",
    "    dataloader_prefetch_factor=8,  # dataloader optimizations\n",
    "    dataloader_num_workers=2,\n",
    "    dataloader_pin_memory=True,\n",
    ")\n",
    "\n",
    "# validates the configuration\n",
    "cfg = load_cfg(config)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "id": "715UpvnSoBIS"
   },
   "outputs": [],
   "source": [
    "from axolotl.utils import set_pytorch_cuda_alloc_conf\n",
    "\n",
    "set_pytorch_cuda_alloc_conf()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "Vc6MC-hwyH-n"
   },
   "source": [
    "# Datasets\n",
    "\n",
    "Axolotl has a robust suite of loaders and transforms to parse most open datasets of any format into the appropriate chat template for your model. Axolotl will mask input tokens from the user's prompt so that the train loss is only calculated against the model's response. For more information, [see our documentation](http://docs.axolotl.ai/docs/dataset-formats/conversation.html) on dataset preparation.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000,
     "referenced_widgets": [
      "b82aa8c57f7c422a9a9c90f333ed2a99",
      "c0991cf63ee6458b96e9a75e7a88b61a",
      "71c8af139cd248b1b51101fd46a93f35",
      "1d5117195d4b49eb8f1a73b18419f7ce",
      "3c21e4a511b4441192c03b7f1d0976e9",
      "ed28e2e0410d4e0b855467e798e53d66",
      "d93f134f802b4b69b575bdaf07dbd27c",
      "d0e9dce55cec4c1ca619a0ccf209d924",
      "4c727d40ef0443449afc31724ee79f0c",
      "0dea5caa27384f5689e3cab51f558727",
      "a6f48410b9964fefba0c3009a77dc838",
      "95caff42f08a4c2aa14c867b8f37f231",
      "de7c37ee83e24f0c889e84d07279c2ec",
      "9d4897eefb5f48259ffb2d23e332f752",
      "253017b0d0534e54ab44e181f6d7c82d",
      "27beaf06e41b472abdb544a43c720c5a",
      "34cf3df51fbc41cabfdbba153c007f0e",
      "ac764024cf1c4e08ba7749afd2cd20ac",
      "30a81da86f8043eca301e86a8651201a",
      "e8b7a81040904c1e89e58978223b1737",
      "1c6f1f10667545aaab958016ba7e2c94",
      "e6e969610738449887259063967f82b0",
      "a138859f19b74fc0928dc236ab5359db",
      "9b42e08b3c9548818488268768a118b1",
      "12b56912736849fea2ad8124456fdc5c",
      "879c8ab5873847a8833bd74123be90a4",
      "20352e5f58d24bb8b1f3940efd14fe4a",
      "d955dcaa0e944e719f3a06139dd54a03",
      "d3de2662c7964f1ba96e58da382af720",
      "97e36007e1304e1583fd81bfb13f0edd",
      "c65dc74c7d6f4bab8f7dd28455161dd8",
      "ef223e8504b64e3592589880326aaf41",
      "598da69727bd4fb8b1caf465ac736d7a",
      "5f86cd894de94c3280fadc1e2fd0ee13",
      "a20927bf5f2c41f58c1e31ac858ab36c",
      "0a46ad75c198463d843fb35e813642cb",
      "09007681cf8d42aeb8c1d2f6a74e470a",
      "ebc80d1a55fa47f4a5ea2756588569ec",
      "1811cda0644e4190a9469d1774435d82",
      "35c811d2ae8e43f3b5cecbdd3cfa857f",
      "b8e39e4dddc3497fbc29ae45c66da759",
      "63b4e563e85c4f03b1b72beda9577bcc",
      "b195f160ca20442fadd8b5aed0ee41af",
      "ca65e32eb52f48c09a84b33cb18f22cd",
      "7cd0b85ebd204b7aba908417811ce4e0",
      "7baeab52d6694c32b1efd1ea1a0a7782",
      "519a7b154022443db6703f04a9142bae",
      "d4183e9715f34d249942b8271cca3bdf",
      "da2347ac94764a3fa2743343cf0d3cd2",
      "93a44a11aa4846fa8efc6c1413ef1627",
      "a55060adc3564407ac81ad7297d34aaa",
      "d02274afd47b462291c745f261209d42",
      "0f417447a7bd4a33acca96fa37aec877",
      "63580b6fb30642479fe3000915bf551a",
      "8f726dbfb45d4528afa33e36a6313267",
      "03b093d592ba4386aa61f7b8483da660",
      "b8766a88716948cf968f4563531a76d9",
      "6f3a28b912714c6e931003549664bfa3",
      "16d1283741404b7bb319094c992fce01",
      "2a5bb0e818ab47be8cf6465988328503",
      "2b3a2659b12244bd8548320320016dbf",
      "0cd7efffbb3c4c4b972e63749f61ab97",
      "5ca240f31e6b44e3882c5eb37cd5a309",
      "5eb06edeb58e4930b1affef2a59eae81",
      "a4e5789584564049b83df7c6c54a3e08",
      "ff3a94b146a948b6907f5d80c7157f99",
      "258b7c635c1045329d4669e48c46ccd5",
      "6f68ed9889f54ad2ae8a3b95ac263a83",
      "80366349d81e4dcc892db6cd56e384f3",
      "c73055099c084dca996159e23e162d0b",
      "977f799afaac4a55b2dc1cffa7d5b63b",
      "41f3b32c2f6b4034ae7a3b9124e28bc7",
      "a10d0a76010f4e508c65a9b69ebc5156",
      "f8ef805b776145c3bfa9ba8d90972058",
      "cc587493c33c4f118d1b1170f85be24c",
      "e40d1c1ac9494b3bade9858324e7ffdf",
      "d65b6b060d9845779299491ac5599c31",
      "0f6907ebbc6242c8bde059cef1e1bd29",
      "5bdfd87fc6cd4f9dabef7cfee29c8060",
      "64f54d4a744a4627a07c3c0120276f3b",
      "65b75b9b8bc143cf997796af68ff6668",
      "d6fe74e4255444368f8f90a62157d869",
      "4d468f96ec924681ad65eb671674b93e",
      "ad7599de524549c48bf2d3124ad4b299",
      "0546d04aae644dde846c58a4afb598a6",
      "897b77a56c09479bb11d7f2a30997e55",
      "81c3db71ac704280ad030072655f1537",
      "042e091f75694c47aee761e760e76773",
      "ef0a3c7a6f14460fb4da096928ae249e",
      "07fb3a2c8315494e97b447e672dfae06",
      "ec030fc3c346426f9abc3a89892258d3",
      "e3fb3fc6afe04b3c9b7ac61809ce78fa",
      "c3be9109d63c485d9c0ef4f9bc0f9218",
      "12815f401eba44658caa7b2e490137a8",
      "30e02aa2d0d241979369e598287f2639",
      "dfd2a2649b8341ef913207526708aff1",
      "4f1977d7e4824ef1a14b65f0f42bba10",
      "c6164e05a1914ae48083db9ad7f4ef7c",
      "813621384dc748b0ad06775e22761c0b",
      "dc892a596f6942d7973c616c38f0eebb",
      "c84cc07789be48aebb322c23d355289e",
      "bed8726b8069434687c75452e21f19e5",
      "16a188a0b06d45f980dcf3933509fe0a",
      "60c1a0d765c14a1d888317e6a507e4ea",
      "0077aedc3d174560bce924ee89e9c006",
      "00321cce58884f6f9b3855a21fcd9187",
      "fa864b41586f4a7aa56aeafd1d84eb75",
      "3225603166b54e7aab766b9964a2f660",
      "349eee9f56d64f0cba6fc24ff2c50c9b",
      "7e5d3774060e4589aa65982da5ea4ef4",
      "7c2485c6cdfe463da6fdb35982a1070d",
      "ad1236893754446881e153adc9d5c962",
      "daee63fd167e4441a32324b51b00ad2b",
      "fe41858c6bd04c58840112b67c19a336",
      "d262c82138024169b9f3aa034ca756fa",
      "62e302ebdad64aada0ffe64ae1c873f3",
      "bd1b0dfed6d34d16af33a4a58330f5ec",
      "d07c8b97d3314f1c852e44bdd40f61ed",
      "ebb69a2c3d0a4299a484698287b3087c",
      "e5a82df528bb4e408797a3b6c2758f4a",
      "f113ebd8c1c34806bea4dd7ed3035173"
     ]
    },
    "id": "KQQhgK8FoDfF",
    "outputId": "f69441d8-95f9-4885-c306-6c8709090ff6"
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "b82aa8c57f7c422a9a9c90f333ed2a99",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer_config.json:   0%|          | 0.00/9.68k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "95caff42f08a4c2aa14c867b8f37f231",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "a138859f19b74fc0928dc236ab5359db",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "5f86cd894de94c3280fadc1e2fd0ee13",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2025-05-08 13:41:00,844] [DEBUG] [axolotl.utils.models.load_tokenizer:441] [PID:174] [RANK:0] EOS: 151645 / <|im_end|>\u001b[39m\n",
      "[2025-05-08 13:41:00,845] [DEBUG] [axolotl.utils.models.load_tokenizer:442] [PID:174] [RANK:0] BOS: None / None\u001b[39m\n",
      "[2025-05-08 13:41:00,846] [DEBUG] [axolotl.utils.models.load_tokenizer:443] [PID:174] [RANK:0] PAD: 151643 / <|endoftext|>\u001b[39m\n",
      "[2025-05-08 13:41:00,847] [DEBUG] [axolotl.utils.models.load_tokenizer:444] [PID:174] [RANK:0] UNK: None / None\u001b[39m\n",
      "[2025-05-08 13:41:00,869] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:271] [PID:174] [RANK:0] Unable to find prepared dataset in last_run_prepared/97037817611d38b3a9c681753c3c4c95\u001b[39m\n",
      "[2025-05-08 13:41:00,870] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:272] [PID:174] [RANK:0] Loading raw datasets...\u001b[39m\n",
      "\u001b[33m[2025-05-08 13:41:00,870] [WARNING] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:274] [PID:174] [RANK:0] Processing datasets during training can lead to VRAM instability. Please pre-process your dataset.\u001b[39m\n",
      "[2025-05-08 13:41:00,871] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:281] [PID:174] [RANK:0] No seed provided, using default seed of 42\u001b[39m\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7cd0b85ebd204b7aba908417811ce4e0",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "train.jsonl:   0%|          | 0.00/27.3M [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "03b093d592ba4386aa61f7b8483da660",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Generating train split: 0 examples [00:00, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2025-05-08 13:41:04,196] [INFO] [axolotl.utils.data.sft.get_dataset_wrapper:484] [PID:174] [RANK:0] Loading dataset with base_type: chat_template and prompt_style: None\u001b[39m\n",
      "[2025-05-08 13:41:04,233] [INFO] [axolotl.__call__:761] [PID:174] [RANK:0] Using chat template:\n",
      "---\n",
      "{%- if tools %}\n",
      "    {{- '<|im_start|>system\\n' }}\n",
      "    {%- if messages[0].role == 'system' %}\n",
      "        {{- messages[0].content + '\\n\\n' }}\n",
      "    {%- endif %}\n",
      "    {{- \"# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n",
      "    {%- for tool in tools %}\n",
      "        {{- \"\\n\" }}\n",
      "        {{- tool | tojson }}\n",
      "    {%- endfor %}\n",
      "    {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n",
      "{%- else %}\n",
      "    {%- if messages[0].role == 'system' %}\n",
      "        {{- '<|im_start|>system\\n' + messages[0].content + '<|im_end|>\\n' }}\n",
      "    {%- endif %}\n",
      "{%- endif %}\n",
      "{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}\n",
      "{%- for message in messages[::-1] %}\n",
      "    {%- set index = (messages|length - 1) - loop.index0 %}\n",
      "    {%- if ns.multi_step_tool and message.role == \"user\" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}\n",
      "        {%- set ns.multi_step_tool = false %}\n",
      "        {%- set ns.last_query_index = index %}\n",
      "    {%- endif %}\n",
      "{%- endfor %}\n",
      "{%- for message in messages %}\n",
      "    {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) %}\n",
      "        {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n",
      "    {%- elif message.role == \"assistant\" %}\n",
      "        {%- set content = message.content %}\n",
      "        {%- set reasoning_content = '' %}\n",
      "        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}\n",
      "            {%- set reasoning_content = message.reasoning_content %}\n",
      "        {%- else %}\n",
      "            {%- if '</think>' in message.content %}\n",
      "                {%- set content = message.content.split('</think>')[-1].lstrip('\\n') %}\n",
      "                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\\n').split('<think>')[-1].lstrip('\\n') %}\n",
      "            {%- endif %}\n",
      "        {%- endif %}\n",
      "        {%- if loop.index0 > ns.last_query_index %}\n",
      "            {%- if loop.last or (not loop.last and reasoning_content) %}\n",
      "                {{- '<|im_start|>' + message.role + '\\n<think>\\n' + reasoning_content.strip('\\n') + '\\n</think>\\n\\n' + content.lstrip('\\n') }}\n",
      "            {%- else %}\n",
      "                {{- '<|im_start|>' + message.role + '\\n' + content }}\n",
      "            {%- endif %}\n",
      "        {%- else %}\n",
      "            {{- '<|im_start|>' + message.role + '\\n' + content }}\n",
      "        {%- endif %}\n",
      "        {%- if message.tool_calls %}\n",
      "            {%- for tool_call in message.tool_calls %}\n",
      "                {%- if (loop.first and content) or (not loop.first) %}\n",
      "                    {{- '\\n' }}\n",
      "                {%- endif %}\n",
      "                {%- if tool_call.function %}\n",
      "                    {%- set tool_call = tool_call.function %}\n",
      "                {%- endif %}\n",
      "                {{- '<tool_call>\\n{\"name\": \"' }}\n",
      "                {{- tool_call.name }}\n",
      "                {{- '\", \"arguments\": ' }}\n",
      "                {%- if tool_call.arguments is string %}\n",
      "                    {{- tool_call.arguments }}\n",
      "                {%- else %}\n",
      "                    {{- tool_call.arguments | tojson }}\n",
      "                {%- endif %}\n",
      "                {{- '}\\n</tool_call>' }}\n",
      "            {%- endfor %}\n",
      "        {%- endif %}\n",
      "        {{- '<|im_end|>\\n' }}\n",
      "    {%- elif message.role == \"tool\" %}\n",
      "        {%- if loop.first or (messages[loop.index0 - 1].role != \"tool\") %}\n",
      "            {{- '<|im_start|>user' }}\n",
      "        {%- endif %}\n",
      "        {{- '\\n<tool_response>\\n' }}\n",
      "        {{- message.content }}\n",
      "        {{- '\\n</tool_response>' }}\n",
      "        {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n",
      "            {{- '<|im_end|>\\n' }}\n",
      "        {%- endif %}\n",
      "    {%- endif %}\n",
      "{%- endfor %}\n",
      "{%- if add_generation_prompt %}\n",
      "    {{- '<|im_start|>assistant\\n' }}\n",
      "    {%- if enable_thinking is defined and enable_thinking is false %}\n",
      "        {{- '<think>\\n\\n</think>\\n\\n' }}\n",
      "    {%- endif %}\n",
      "{%- endif %}\n",
      "---\u001b[39m\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "258b7c635c1045329d4669e48c46ccd5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Tokenizing Prompts (num_proc=2):   0%|          | 0/9985 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2025-05-08 13:42:09,195] [INFO] [axolotl.utils.data.utils.drop_long_seq_in_dataset:177] [PID:174] [RANK:0] min_input_len: 23\u001b[39m\n",
      "[2025-05-08 13:42:09,196] [INFO] [axolotl.utils.data.utils.drop_long_seq_in_dataset:179] [PID:174] [RANK:0] max_input_len: 3380\u001b[39m\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "0f6907ebbc6242c8bde059cef1e1bd29",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Dropping Long Sequences (num_proc=2):   0%|          | 0/9985 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "ef0a3c7a6f14460fb4da096928ae249e",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Drop Samples with Zero Trainable Tokens (num_proc=2):   0%|          | 0/9985 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dc892a596f6942d7973c616c38f0eebb",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Add position_id column (Sample Packing) (num_proc=2):   0%|          | 0/9985 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2025-05-08 13:42:21,651] [INFO] [axolotl.utils.data.sft.load_tokenized_prepared_datasets:351] [PID:174] [RANK:0] Saving merged prepared dataset to disk... last_run_prepared/97037817611d38b3a9c681753c3c4c95\u001b[39m\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "7c2485c6cdfe463da6fdb35982a1070d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Saving the dataset (0/1 shards):   0%|          | 0/9985 [00:00<?, ? examples/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2025-05-08 13:42:25,711] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:411] [PID:174] [RANK:0] gather_len_batches: [1540]\u001b[39m\n",
      "[2025-05-08 13:42:25,714] [INFO] [axolotl.calc_sample_packing_eff_est:491] [PID:174] [RANK:0] sample_packing_eff_est across ranks: [0.9987832601968344]\u001b[39m\n"
     ]
    }
   ],
   "source": [
    "from axolotl.common.datasets import load_datasets\n",
    "\n",
    "# Load, parse and tokenize the datasets to be formatted with qwen3 chat template\n",
    "# Drop long samples from the dataset that overflow the max sequence length\n",
    "dataset_meta = load_datasets(cfg=cfg)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "mrSNfHpk0EAe"
   },
   "source": [
    "# Training\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 1000,
     "referenced_widgets": [
      "004d9177a6a14118a5930dc3cc13147b",
      "a80410b919e442c49aea15acc1ce1a72",
      "c6e00f5224364822bc4239b176686919",
      "ec11d1e5ae7b42c883d9b1f38a65356e",
      "734185351eb543fa9a00a881dcbb9fe7",
      "fa1282ccc7544e4f818e2f03ccffe4a5",
      "bbbf575d2a4b4c6ea8389be79b2a6039",
      "2a51b36be41745468e4c2d7a21b1c0d2",
      "4fd114abe9f5494ab59858949f5055f1",
      "936d04b5fe1b4c63bf0b080e423d051b",
      "f1cef8e8dc2646fb9fd09f3b09081074",
      "cdebbc55a1164c018546c2ac6f8c620c",
      "a44f630e099e43899f20a77084ae60cd",
      "c3725c7f79fe415fbd1ea336f0cc9cf1",
      "0e50870ed0c643e0b6c18cc5d7ddae7f",
      "c33ced495f70464aa4a3a91922090853",
      "ed5ca967ad5342929e578ac6aa4dc4c0",
      "af401d117d5047629d3a6e2361757b62",
      "b191ac001a2e4962bc9a245fcdf26e6b",
      "054c8dffadba48c6b895a6cc62448ecc",
      "bfcdbba993b74972a9e3e575f86908ff",
      "6ebb2ec171414e47a14765505f64bb3c",
      "500e272208a246089613bf788a165271",
      "200df5e79b9244849e589ecb0250a520",
      "cc94432d08464affa3e58b560bdad194",
      "3036608c71904ce9ae4bb2a9fa8802d9",
      "adacfdcc1b0140efac56918e9ccf064e",
      "f4a1795dc7514a718f478245f521f0ba",
      "5e746eb25bbe416fb585fa24e79f5177",
      "b5b65414154544aa8a71b1a39164aad7",
      "f0a58fbd0fca4340890041f99fa2f8c8",
      "5ca6be24acb548cea130bd58e9954c7c",
      "5cfb02ee044b4011a378efa8b54a370f",
      "4d05314858354e729d76094b3b0ce761",
      "c42acf646f344a88b8c11f81e67f7206",
      "7be6f04c284e4326bb4ff3d301e7b3c6",
      "ffdbb12a2f2c4d14911685e7683e0ef0",
      "bee3501b2a17427784a717e50a85e7fa",
      "8bc9d8ba866c442b9118d9630009939c",
      "9f56a2d9979c4bd8928c644c22c3ecdf",
      "9503a45960984adc97b58e16c50662e0",
      "da6e93f3e4984780b930fe7a706983ea",
      "ab93eabd7cea4b94b4b7a387f101e8a1",
      "704f2f5a9b1c49d5a75a0025a5dda11b",
      "dd0e646fad3f4a89ba23b39d162bd8d9",
      "d43c6df07ddb466587807d6dbe1ff614",
      "e0e8b840b8ea4d0d9db09afe99fa287d",
      "9327977822be4b1294f80e876552e305",
      "77304d1a46b3468a98483e02ec0ac4a4",
      "8c4d4fc5a30f4e7cb3be53fe2adda33d",
      "e90658f4bcb642baa78426012f863152",
      "f7434f3e03124a1c938a39af79d7fa59",
      "c1314f241a434c41b45d84dc4d3b30f8",
      "37de928300e34184881039378bd75e7f",
      "0e936d9dbf9c4fdd86bbfe9730dedc47",
      "e21e180307e5485cbbe908672fd6639a",
      "2e2b0c1599c341a198f632f46a40c90e",
      "bff139df987d4a62abec6456cb27f3d4",
      "ebe1cc366d324ad59b264c8b3c431441",
      "114dece49dba437c8572ef94b23c3b1e",
      "be724f04b03942b2a033a7e8898bb4fd",
      "fcbab4d8dced41a18dfccce81e3a45a0",
      "c1f9c267ba3f40039cdb5eb3267e8043",
      "33b3b1d0295646edaac7b4822761aeb0",
      "fba7aa824b38467ab3061b226114cdec",
      "f3075dccbd2747b4a7913b66f44f2596",
      "fe18bba7f3fb4c31bf840541f36b3425",
      "fd4f333f7ece4450b04e1a9af1f9d2f6",
      "f60a2bdb6b6b4e0e8c3508580e247132",
      "c0892a1881de4eb4bfabc6a68f87ae99",
      "1bec6297c90242a88672d195bc09d429",
      "d1f9b10c130542f094c8fd3d1e23b5e9",
      "e575d87a7efe4ec7b1efde489839d4a6",
      "edc99591b9c747b689b94d0052fec14c",
      "35cc989ca3374e7dba0cb166febc4bde",
      "158c8b85dbf34de6a94b4e35e2fc7d5a",
      "0b4c9753a7cb4354b8e5f187e6e1ad7c",
      "4471ff62258549fba9514bb67050f965",
      "9cd5211b5d8b457aa0002f1d17b80028",
      "19127c7bb1554ccbac877059f9a82db0",
      "f4667818b9d34a09891cd727a429a610",
      "9ed02dc43412471a9ab47f3620ccf3a5",
      "6932489232ec4ab18a160b1e7fbcdfe1",
      "4540927d98f54466b434ba4c0edf045d",
      "e400cbf14bcc446a9d33b210cd93550b",
      "71002199df6b40c9a1ac40df5fb27a1b",
      "4b27c267393640f28f6eae0875bd2ed9",
      "9858cb74a09748a39e8149baac96702c",
      "eb1c9535e6a546098b760528b2ea387c",
      "18357b321ce44d7b8bd9d1c886f69275",
      "279937fe03bc4e4eb25b472d7e9df163",
      "bca2c7185b6749fd899c06a2ba4c5e46",
      "1f7d30f71bbd4547a9150d21da071055",
      "e366ae3fceec4566b9ed303d6c5f90af",
      "5dd7d150dbe04f08b165ce7f2c27cd11",
      "b634bb73cfa743d09a5999101b840976",
      "742b1030acfd414bbd9d5327b7e3826d",
      "0f480e3a0b0a45d2a2d2dec3cad923f3",
      "fcb30372e7404c5d8a1ad4df91e6c7b2",
      "2860e3bb3baf4f7da058465850e800c5",
      "3efd18ea8eaa41918894883da9541bfa",
      "e09f1bcbb9d94c09be53e5e1303642c2",
      "82177df57a494de8900c14c2f5185175",
      "ccfcdc95baf646f8aeb3d516742383f2",
      "8f5bd719974e41c3a8dd9a5b0d3d71e6",
      "b87c84de30e84b3abf4871461fb9cbd3",
      "e7d8e4fe58384e93a106de546068c65e",
      "0aa8ab56b85f4171a79c3bc210594025",
      "67da6c4260574869aa24c3cbc1bc1654",
      "94b9088614464f60a203de39dbcae853",
      "fea1b70fb46745feb5111b3929175b5d",
      "f365820a3d3c42b2948abfe32065de14",
      "823f1c78f15043e38bbd4dca3932a86a",
      "a1959759c5424da9961fb2a308d4dee4",
      "34c9c0137b504cd799c6bd6de69507c2",
      "735d4f225b24414294fc1b213c61223c",
      "5e5e15b0569b474c9620083b3ec6af55",
      "03a3c744d716431488163b4358b80f92",
      "a5434ee714f9498d83870544b67c0cb7",
      "3aaecbf540f54a2db9ab0931e3b1fe57",
      "9e333ed3b5014069ac1dd969255dd591"
     ]
    },
    "id": "IwrpurmloGOy",
    "outputId": "84fa167f-ba27-4255-d508-dc9df56ad39b"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\n",
      "     #@@ #@@      @@# @@#\n",
      "    @@  @@          @@  @@           =@@#                               @@                 #@    =@@#.\n",
      "    @@    #@@@@@@@@@    @@           #@#@=                              @@                 #@     .=@@\n",
      "      #@@@@@@@@@@@@@@@@@            =@# @#     ##=     ##    =####=+    @@      =#####+  =#@@###.   @@\n",
      "    @@@@@@@@@@/  +@@/  +@@          #@  =@=     #@=   @@   =@#+  +#@#   @@    =@#+  +#@#   #@.      @@\n",
      "    @@@@@@@@@@  ##@@  ##@@         =@#   @#      =@# @#    @@      @@   @@    @@      #@   #@       @@\n",
      "     @@@@@@@@@@@@@@@@@@@@          #@=+++#@=      =@@#     @@      @@   @@    @@      #@   #@       @@\n",
      "                                  =@#=====@@     =@# @#    @@      @@   @@    @@      #@   #@       @@\n",
      "    @@@@@@@@@@@@@@@@  @@@@        #@      #@=   #@=  +@@   #@#    =@#   @@.   =@#    =@#   #@.      @@\n",
      "                                 =@#       @#  #@=     #@   =#@@@@#=    +#@@=  +#@@@@#=    .##@@+   @@\n",
      "    @@@@  @@@@@@@@@@@@@@@@\n",
      "\n",
      "[2025-05-07 22:08:14,344] [INFO] [axolotl.monkeypatch.peft.utils.patch_peft_prep_code:76] [PID:1336] [RANK:0] patching prepare_model_for_kbit_training to allow for overrides\u001b[39m\n",
      "[2025-05-07 22:08:14,549] [INFO] [axolotl.integrations.cut_cross_entropy.pre_model_load:80] [PID:1336] [RANK:0] Applying Cut Cross Entropy to model type: qwen3\u001b[39m\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "004d9177a6a14118a5930dc3cc13147b",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model.safetensors.index.json:   0%|          | 0.00/36.5k [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "cdebbc55a1164c018546c2ac6f8c620c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00001-of-00008.safetensors:   0%|          | 0.00/3.84G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "500e272208a246089613bf788a165271",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00002-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4d05314858354e729d76094b3b0ce761",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00003-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "dd0e646fad3f4a89ba23b39d162bd8d9",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00004-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "e21e180307e5485cbbe908672fd6639a",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00005-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fe18bba7f3fb4c31bf840541f36b3425",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00006-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "4471ff62258549fba9514bb67050f965",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00007-of-00008.safetensors:   0%|          | 0.00/3.96G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "eb1c9535e6a546098b760528b2ea387c",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "model-00008-of-00008.safetensors:   0%|          | 0.00/1.91G [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2025-05-07 22:09:49,798] [INFO] [accelerate.utils.modeling.get_balanced_memory:990] [PID:1336] We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).\n"
     ]
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "2860e3bb3baf4f7da058465850e800c5",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "fea1b70fb46745feb5111b3929175b5d",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "generation_config.json:   0%|          | 0.00/239 [00:00<?, ?B/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2025-05-07 22:11:37,521] [INFO] [axolotl.utils.models.load_model:1302] [PID:1336] [RANK:0] cuda memory usage after model load: 9.264GB (+1.721GB cache, +0.375GB misc)\u001b[39m\n",
      "[2025-05-07 22:11:37,532] [INFO] [axolotl.utils.models.prepare_model:1205] [PID:1336] [RANK:0] converting PEFT model w/ prepare_model_for_kbit_training\u001b[39m\n",
      "[2025-05-07 22:11:37,537] [INFO] [axolotl.utils.models.load_model:1341] [PID:1336] [RANK:0] Converting modules to torch.float16\u001b[39m\n",
      "trainable params: 128,450,560 || all params: 14,896,757,760 || trainable%: 0.8623\n",
      "[2025-05-07 22:11:40,170] [INFO] [axolotl.utils.models.load_model:1402] [PID:1336] [RANK:0] cuda memory usage after adapters: 9.743GB (+1.476GB cache, +0.375GB misc)\u001b[39m\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.11/dist-packages/axolotl/core/trainers/base.py:64: FutureWarning: `tokenizer` is deprecated and will be removed in version 5.0.0 for `AxolotlTrainer.__init__`. Use `processing_class` instead.\n",
      "  super().__init__(*_args, **kwargs)\n",
      "No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2025-05-07 22:11:41,755] [INFO] [axolotl.train.save_initial_configs:359] [PID:1336] [RANK:0] Pre-saving adapter config to ./outputs/qwen-sft-pirate-rrr...\u001b[39m\n",
      "[2025-05-07 22:11:41,756] [INFO] [axolotl.train.save_initial_configs:363] [PID:1336] [RANK:0] Pre-saving tokenizer to ./outputs/qwen-sft-pirate-rrr...\u001b[39m\n",
      "[2025-05-07 22:11:41,974] [INFO] [axolotl.train.save_initial_configs:366] [PID:1336] [RANK:0] Pre-saving model config to ./outputs/qwen-sft-pirate-rrr...\u001b[39m\n",
      "[2025-05-07 22:11:41,982] [INFO] [axolotl.train.execute_training:211] [PID:1336] [RANK:0] Starting trainer...\u001b[39m\n",
      "[2025-05-07 22:11:45,047] [INFO] [axolotl.utils.samplers.multipack.calc_min_len:411] [PID:1336] [RANK:0] gather_len_batches: [1540]\u001b[39m\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n",
      "You're using a Qwen2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
     ]
    },
    {
     "data": {
      "text/html": [
       "\n",
       "    <div>\n",
       "      \n",
       "      <progress value='25' max='25' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
       "      [25/25 09:25, Epoch 0/1]\n",
       "    </div>\n",
       "    <table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       " <tr style=\"text-align: left;\">\n",
       "      <th>Step</th>\n",
       "      <th>Training Loss</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <td>1</td>\n",
       "      <td>1.092300</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>2</td>\n",
       "      <td>1.554200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>3</td>\n",
       "      <td>1.041400</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>4</td>\n",
       "      <td>1.733800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>5</td>\n",
       "      <td>1.430000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>6</td>\n",
       "      <td>1.258500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>7</td>\n",
       "      <td>1.343600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>8</td>\n",
       "      <td>1.101700</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>9</td>\n",
       "      <td>1.086500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>10</td>\n",
       "      <td>0.813200</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>11</td>\n",
       "      <td>0.689600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>12</td>\n",
       "      <td>0.826700</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>13</td>\n",
       "      <td>1.541800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>14</td>\n",
       "      <td>0.948000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>15</td>\n",
       "      <td>1.357000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>16</td>\n",
       "      <td>1.085800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>17</td>\n",
       "      <td>1.516800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>18</td>\n",
       "      <td>1.146800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>19</td>\n",
       "      <td>0.834800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>20</td>\n",
       "      <td>0.968000</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>21</td>\n",
       "      <td>1.388800</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>22</td>\n",
       "      <td>1.511500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>23</td>\n",
       "      <td>1.338500</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>24</td>\n",
       "      <td>1.206600</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <td>25</td>\n",
       "      <td>1.504600</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table><p>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[2025-05-07 22:12:42,746] [INFO] [axolotl.callbacks.on_step_end:128] [PID:1336] [RANK:0] cuda memory usage while training: 9.768GB (+3.287GB cache, +0.646GB misc)\u001b[39m\n",
      "[2025-05-07 22:21:46,859] [INFO] [axolotl.train.save_trained_model:231] [PID:1336] [RANK:0] Training completed! Saving pre-trained model to ./outputs/qwen-sft-pirate-rrr.\u001b[39m\n"
     ]
    }
   ],
   "source": [
    "from axolotl.train import train\n",
    "\n",
    "# just train the first 25 steps for demo.\n",
    "# This is sufficient to align the model as we've used packing to maximize the trainable samples per step.\n",
    "cfg.max_steps = 25\n",
    "model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "j1b9ypF78eCb"
   },
   "source": [
    "# Inferencing the trained model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "r3_vHhif8YEs",
    "outputId": "e5050605-f6c9-421c-98f9-bde56a281eae"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Ahoy there, matey! Shiver me timbers, ye be lookin' for the Pythagorean theorem, eh? Well, hold yer horses and listen up, for I'll be tellin' ye all about it in me own special way.\n",
      "\n",
      "The Pythagorean theorem be a real gem of a mathematical trick that helps ye find the length of a side of a right triangle. Now, a right triangle be a triangle with a right angle, which be that little corner that looks like a square. \n",
      "\n",
      "The theorem be named after a clever fellow named Pythagoras, who be a mathematician from ancient Greece. He discovered that if ye have a right triangle, the square of the length of the hypotenuse (that be the side opposite the right angle) be equal to the sum of the squares of the other two sides. \n",
      "\n",
      "In other words, if ye have a triangle with sides of length a, b, and c (\n"
     ]
    }
   ],
   "source": [
    "from transformers import TextStreamer\n",
    "\n",
    "messages = [\n",
    "    {\n",
    "        \"role\": \"user\",\n",
    "        \"content\": \"Explain the Pythagorean theorem to me.\",\n",
    "    },\n",
    "]\n",
    "\n",
    "prompt = tokenizer.apply_chat_template(\n",
    "    messages,\n",
    "    add_generation_prompt=True,\n",
    "    tokenize=False,\n",
    "    enable_thinking=False,\n",
    ")\n",
    "\n",
    "outputs = model.generate(\n",
    "    **tokenizer(prompt, return_tensors=\"pt\").to(\"cuda\"),\n",
    "    max_new_tokens=192,\n",
    "    temperature=1.0,\n",
    "    top_p=0.8,\n",
    "    top_k=32,\n",
    "    streamer=TextStreamer(tokenizer, skip_prompt=True),\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "HoGwT2JRSIjA"
   },
   "source": [
    "# Saving your trained model\n",
    "\n",
    "Axolotl automatically saves checkpoints to the `output_dir` path.\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/"
    },
    "id": "5BmSbiy6NaaS",
    "outputId": "f5e1d913-7d55-42d2-8340-f9f1b0bc2b38"
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "total 506M\n",
      "-rw-r--r-- 1 root root  845 May  7 22:21 adapter_config.json\n",
      "-rw-r--r-- 1 root root 491M May  7 22:21 adapter_model.safetensors\n",
      "-rw-r--r-- 1 root root  707 May  7 22:11 added_tokens.json\n",
      "drwxr-xr-x 2 root root 4.0K May  7 22:17 checkpoint-13\n",
      "drwxr-xr-x 2 root root 4.0K May  7 22:21 checkpoint-25\n",
      "-rw-r--r-- 1 root root 1.2K May  7 22:11 config.json\n",
      "-rw-r--r-- 1 root root 1.6M May  7 22:11 merges.txt\n",
      "-rw-r--r-- 1 root root 2.6K May  7 22:21 README.md\n",
      "-rw-r--r-- 1 root root  613 May  7 22:11 special_tokens_map.json\n",
      "-rw-r--r-- 1 root root 9.5K May  7 22:11 tokenizer_config.json\n",
      "-rw-r--r-- 1 root root  11M May  7 22:11 tokenizer.json\n",
      "-rw-r--r-- 1 root root 2.7M May  7 22:11 vocab.json\n"
     ]
    }
   ],
   "source": [
    "# Show the saved checkpoints in the output_dir\n",
    "!ls -lh \"./outputs/qwen-sft-pirate-rrr\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "id": "_PCIFWxuOZd6"
   },
   "source": [
    "Setting `hub_model_id: ` in the original config would have automatically uploaded the model to HuggingFace Hub (e.g. `hub_model_id: username/model_id`)\n",
    "\n",
    "If you prefer to manually upload the training artifacts, we can still upload the entire final checkpoint to HuggingFace from the CLI."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "colab": {
     "base_uri": "https://localhost:8080/",
     "height": 955,
     "referenced_widgets": [
      "c12ea43372ac4d57bb9605f1a429b397",
      "86816687746246b4a6105e8010384e25",
      "6f05e9bebf7b40c9835808e77de6c236",
      "c7433acd3c4841e6958ae8f7e87b1808",
      "19c1e38389fa46c7b7e2152a56e1df34",
      "0e067d8db8ed48308a718d5f57683fd1",
      "131065f118274a1586ac38e39ed84ef0",
      "8640ac440fbc4644b9a3af7ba3ae7183",
      "5cea7996f02040b187ece0bb2d6a8d1f",
      "2e257c8be2da40b4bb67a9e4ab6811f3",
      "56e3768bef5a4b9db4168c5c17f509c2",
      "62c028fdef904dedb9cdeca2b3bda725",
      "a7cf477e80fc43e0ad82c7997b076dce",
      "835bcc28a5564fb9b3d651bc8e32dc46",
      "9f1c9a0695384bdaa6f8b847ef89bee8",
      "b1bea589efa14258a9982071b87938bf",
      "590eef89881545aa8bbef9a8bbe7fb00",
      "4b1f04ff63d14a118fdd15814dff50e4",
      "39789237703c4a418134243055c9cbf5",
      "a3a945817f684328b34651fe052393ec"
     ]
    },
    "id": "2yw8pLvlSMl8",
    "outputId": "6e489ab2-4abe-4e28-84ca-959f912433a4"
   },
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "c12ea43372ac4d57bb9605f1a429b397",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "VBox(children=(HTML(value='<center> <img\\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "It seems you are trying to upload a large folder at once. This might take some time and then fail if the folder is too large. For such cases, it is recommended to upload in smaller batches or to use `HfApi().upload_large_folder(...)`/`huggingface-cli upload-large-folder` instead. For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/upload#upload-a-large-folder.\n",
      "Start hashing 40 files.\n",
      "Finished hashing 40 files.\n",
      "Uploading files using Xet Storage..\n",
      "Uploading...:  87% 1.82G/2.10G [00:23<00:04, 67.3MB/s]Cancellation requested; stopping current tasks.\n",
      "Traceback (most recent call last):\n",
      "  File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/_commit_api.py\", line 598, in _upload_xet_files\n",
      "    upload_files(\n",
      "RuntimeError: Xet Runtime Error: Task cancelled; possible runtime shutdown in progress (task 9 was cancelled).\n",
      "\n",
      "During handling of the above exception, another exception occurred:\n",
      "\n",
      "Traceback (most recent call last):\n",
      "  File \"/usr/local/bin/huggingface-cli\", line 8, in <module>\n",
      "    sys.exit(main())\n",
      "             ^^^^^^\n",
      "  File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/huggingface_cli.py\", line 57, in main\n",
      "    service.run()\n",
      "  File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/upload.py\", line 207, in run\n",
      "    print(self._upload())\n",
      "          ^^^^^^^^^^^^^^\n",
      "  File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/commands/upload.py\", line 302, in _upload\n",
      "    return self.api.upload_folder(\n",
      "           ^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n",
      "    return fn(*args, **kwargs)\n",
      "           ^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 1633, in _inner\n",
      "    return fn(self, *args, **kwargs)\n",
      "           ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4942, in upload_folder\n",
      "    commit_info = self.create_commit(\n",
      "                  ^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n",
      "    return fn(*args, **kwargs)\n",
      "           ^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 1633, in _inner\n",
      "    return fn(self, *args, **kwargs)\n",
      "           ^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4202, in create_commit\n",
      "    self.preupload_lfs_files(\n",
      "  File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/hf_api.py\", line 4483, in preupload_lfs_files\n",
      "    _upload_xet_files(**upload_kwargs, create_pr=create_pr)  # type: ignore [arg-type]\n",
      "    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/utils/_validators.py\", line 114, in _inner_fn\n",
      "    return fn(*args, **kwargs)\n",
      "           ^^^^^^^^^^^^^^^^^^^\n",
      "  File \"/usr/local/lib/python3.11/dist-packages/huggingface_hub/_commit_api.py\", line 592, in _upload_xet_files\n",
      "    with progress_cm as progress:\n",
      "  File \"/usr/local/lib/python3.11/dist-packages/tqdm/std.py\", line 1138, in __exit__\n",
      "    def __exit__(self, exc_type, exc_value, traceback):\n",
      "\n",
      "KeyboardInterrupt\n",
      "^C\n"
     ]
    }
   ],
   "source": [
    "from huggingface_hub import notebook_login\n",
    "\n",
    "# remove the partial epoch checkpoints\n",
    "!rm -rf \"./outputs/qwen-sft-pirate-rrr/checkpoint-*\"\n",
    "\n",
    "# HF Notebook login widget\n",
    "notebook_login()\n",
    "\n",
    "# upload the LoRA adapter for your model to HF, remember to update the username/model-name below\n",
    "!huggingface-cli upload --repo-type=model winglian/pirate-qwen-14B \"./outputs/qwen-sft-pirate-rrr\""
   ]
  }
 ],
 "metadata": {
  "accelerator": "GPU",
  "colab": {
   "gpuType": "T4",
   "provenance": []
  },
  "kernelspec": {
   "display_name": "Python 3",
   "name": "python3"
  },
  "language_info": {
   "name": "python"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "00321cce58884f6f9b3855a21fcd9187": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "004d9177a6a14118a5930dc3cc13147b": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_a80410b919e442c49aea15acc1ce1a72",
       "IPY_MODEL_c6e00f5224364822bc4239b176686919",
       "IPY_MODEL_ec11d1e5ae7b42c883d9b1f38a65356e"
      ],
      "layout": "IPY_MODEL_734185351eb543fa9a00a881dcbb9fe7"
     }
    },
    "0077aedc3d174560bce924ee89e9c006": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "03a3c744d716431488163b4358b80f92": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "03b093d592ba4386aa61f7b8483da660": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_b8766a88716948cf968f4563531a76d9",
       "IPY_MODEL_6f3a28b912714c6e931003549664bfa3",
       "IPY_MODEL_16d1283741404b7bb319094c992fce01"
      ],
      "layout": "IPY_MODEL_2a5bb0e818ab47be8cf6465988328503"
     }
    },
    "042e091f75694c47aee761e760e76773": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "0546d04aae644dde846c58a4afb598a6": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "054c8dffadba48c6b895a6cc62448ecc": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "07fb3a2c8315494e97b447e672dfae06": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_12815f401eba44658caa7b2e490137a8",
      "placeholder": "​",
      "style": "IPY_MODEL_30e02aa2d0d241979369e598287f2639",
      "value": "Drop Samples with Zero Trainable Tokens (num_proc=2): 100%"
     }
    },
    "083f9cda8d754c168beee10d2f8955a2": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_a0a11e929edd4189b79723d618522c33",
      "max": 728,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_e87ea87fcff247b5bbcc331ba79a8dc2",
      "value": 728
     }
    },
    "09007681cf8d42aeb8c1d2f6a74e470a": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_b195f160ca20442fadd8b5aed0ee41af",
      "placeholder": "​",
      "style": "IPY_MODEL_ca65e32eb52f48c09a84b33cb18f22cd",
      "value": " 11.4M/11.4M [00:00&lt;00:00, 21.8MB/s]"
     }
    },
    "0a46ad75c198463d843fb35e813642cb": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_b8e39e4dddc3497fbc29ae45c66da759",
      "max": 11422654,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_63b4e563e85c4f03b1b72beda9577bcc",
      "value": 11422654
     }
    },
    "0aa8ab56b85f4171a79c3bc210594025": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "0b4c9753a7cb4354b8e5f187e6e1ad7c": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "0cd7efffbb3c4c4b972e63749f61ab97": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "0dea5caa27384f5689e3cab51f558727": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "0e067d8db8ed48308a718d5f57683fd1": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_b1bea589efa14258a9982071b87938bf",
      "placeholder": "​",
      "style": "IPY_MODEL_590eef89881545aa8bbef9a8bbe7fb00",
      "value": "\n<b>Pro Tip:</b> If you don't already have one, you can create a dedicated\n'notebooks' token with 'write' access, that you can then easily reuse for all\nnotebooks. </center>"
     }
    },
    "0e50870ed0c643e0b6c18cc5d7ddae7f": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_bfcdbba993b74972a9e3e575f86908ff",
      "placeholder": "​",
      "style": "IPY_MODEL_6ebb2ec171414e47a14765505f64bb3c",
      "value": " 3.84G/3.84G [00:09&lt;00:00, 664MB/s]"
     }
    },
    "0e936d9dbf9c4fdd86bbfe9730dedc47": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "0f417447a7bd4a33acca96fa37aec877": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "0f480e3a0b0a45d2a2d2dec3cad923f3": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "0f6907ebbc6242c8bde059cef1e1bd29": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_5bdfd87fc6cd4f9dabef7cfee29c8060",
       "IPY_MODEL_64f54d4a744a4627a07c3c0120276f3b",
       "IPY_MODEL_65b75b9b8bc143cf997796af68ff6668"
      ],
      "layout": "IPY_MODEL_d6fe74e4255444368f8f90a62157d869"
     }
    },
    "114dece49dba437c8572ef94b23c3b1e": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "12815f401eba44658caa7b2e490137a8": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "12b56912736849fea2ad8124456fdc5c": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_97e36007e1304e1583fd81bfb13f0edd",
      "max": 1671853,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_c65dc74c7d6f4bab8f7dd28455161dd8",
      "value": 1671853
     }
    },
    "131065f118274a1586ac38e39ed84ef0": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": "center",
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": "flex",
      "flex": null,
      "flex_flow": "column",
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": "50%"
     }
    },
    "158c8b85dbf34de6a94b4e35e2fc7d5a": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "16a188a0b06d45f980dcf3933509fe0a": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_349eee9f56d64f0cba6fc24ff2c50c9b",
      "placeholder": "​",
      "style": "IPY_MODEL_7e5d3774060e4589aa65982da5ea4ef4",
      "value": " 9985/9985 [00:04&lt;00:00, 2604.11 examples/s]"
     }
    },
    "16d1283741404b7bb319094c992fce01": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_a4e5789584564049b83df7c6c54a3e08",
      "placeholder": "​",
      "style": "IPY_MODEL_ff3a94b146a948b6907f5d80c7157f99",
      "value": " 9985/0 [00:00&lt;00:00, 50763.46 examples/s]"
     }
    },
    "1811cda0644e4190a9469d1774435d82": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "18357b321ce44d7b8bd9d1c886f69275": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_e366ae3fceec4566b9ed303d6c5f90af",
      "placeholder": "​",
      "style": "IPY_MODEL_5dd7d150dbe04f08b165ce7f2c27cd11",
      "value": "model-00008-of-00008.safetensors: 100%"
     }
    },
    "19127c7bb1554ccbac877059f9a82db0": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "danger",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_e400cbf14bcc446a9d33b210cd93550b",
      "max": 3963750880,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_71002199df6b40c9a1ac40df5fb27a1b",
      "value": 3963750502
     }
    },
    "19c1e38389fa46c7b7e2152a56e1df34": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ButtonModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ButtonModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ButtonView",
      "button_style": "",
      "description": "Login",
      "disabled": false,
      "icon": "",
      "layout": "IPY_MODEL_835bcc28a5564fb9b3d651bc8e32dc46",
      "style": "IPY_MODEL_9f1c9a0695384bdaa6f8b847ef89bee8",
      "tooltip": ""
     }
    },
    "1bec6297c90242a88672d195bc09d429": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "1c6f1f10667545aaab958016ba7e2c94": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "1d5117195d4b49eb8f1a73b18419f7ce": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_0dea5caa27384f5689e3cab51f558727",
      "placeholder": "​",
      "style": "IPY_MODEL_a6f48410b9964fefba0c3009a77dc838",
      "value": " 9.68k/9.68k [00:00&lt;00:00, 812kB/s]"
     }
    },
    "1f7d30f71bbd4547a9150d21da071055": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "200df5e79b9244849e589ecb0250a520": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_f4a1795dc7514a718f478245f521f0ba",
      "placeholder": "​",
      "style": "IPY_MODEL_5e746eb25bbe416fb585fa24e79f5177",
      "value": "model-00002-of-00008.safetensors: 100%"
     }
    },
    "20352e5f58d24bb8b1f3940efd14fe4a": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "253017b0d0534e54ab44e181f6d7c82d": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_1c6f1f10667545aaab958016ba7e2c94",
      "placeholder": "​",
      "style": "IPY_MODEL_e6e969610738449887259063967f82b0",
      "value": " 2.78M/2.78M [00:00&lt;00:00, 17.8MB/s]"
     }
    },
    "258b7c635c1045329d4669e48c46ccd5": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_6f68ed9889f54ad2ae8a3b95ac263a83",
       "IPY_MODEL_80366349d81e4dcc892db6cd56e384f3",
       "IPY_MODEL_c73055099c084dca996159e23e162d0b"
      ],
      "layout": "IPY_MODEL_977f799afaac4a55b2dc1cffa7d5b63b"
     }
    },
    "279937fe03bc4e4eb25b472d7e9df163": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "danger",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_b634bb73cfa743d09a5999101b840976",
      "max": 1912371880,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_742b1030acfd414bbd9d5327b7e3826d",
      "value": 1912371698
     }
    },
    "27beaf06e41b472abdb544a43c720c5a": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "2860e3bb3baf4f7da058465850e800c5": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_3efd18ea8eaa41918894883da9541bfa",
       "IPY_MODEL_e09f1bcbb9d94c09be53e5e1303642c2",
       "IPY_MODEL_82177df57a494de8900c14c2f5185175"
      ],
      "layout": "IPY_MODEL_ccfcdc95baf646f8aeb3d516742383f2"
     }
    },
    "2a51b36be41745468e4c2d7a21b1c0d2": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "2a5bb0e818ab47be8cf6465988328503": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "2b3a2659b12244bd8548320320016dbf": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "2e257c8be2da40b4bb67a9e4ab6811f3": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "2e2b0c1599c341a198f632f46a40c90e": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_be724f04b03942b2a033a7e8898bb4fd",
      "placeholder": "​",
      "style": "IPY_MODEL_fcbab4d8dced41a18dfccce81e3a45a0",
      "value": "model-00005-of-00008.safetensors: 100%"
     }
    },
    "3036608c71904ce9ae4bb2a9fa8802d9": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_5ca6be24acb548cea130bd58e9954c7c",
      "placeholder": "​",
      "style": "IPY_MODEL_5cfb02ee044b4011a378efa8b54a370f",
      "value": " 3.96G/3.96G [00:10&lt;00:00, 531MB/s]"
     }
    },
    "30a81da86f8043eca301e86a8651201a": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "30e02aa2d0d241979369e598287f2639": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "3225603166b54e7aab766b9964a2f660": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "33b3b1d0295646edaac7b4822761aeb0": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "349eee9f56d64f0cba6fc24ff2c50c9b": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "34c9c0137b504cd799c6bd6de69507c2": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "34cf3df51fbc41cabfdbba153c007f0e": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "35c811d2ae8e43f3b5cecbdd3cfa857f": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "35cc989ca3374e7dba0cb166febc4bde": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "366a343b62fa47d8985a3bd464d99f9e": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "37de928300e34184881039378bd75e7f": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "388f618924274d21a066f098f4f1e744": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_7c95f85a2b1f47a1bd846d110c47bb3c",
       "IPY_MODEL_083f9cda8d754c168beee10d2f8955a2",
       "IPY_MODEL_62e1a65582f446a78612eaa804e08a7d"
      ],
      "layout": "IPY_MODEL_487a177d020f4605834878b2fdc7afa3"
     }
    },
    "39789237703c4a418134243055c9cbf5": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "3aaecbf540f54a2db9ab0931e3b1fe57": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "3c21e4a511b4441192c03b7f1d0976e9": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "3efd18ea8eaa41918894883da9541bfa": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_8f5bd719974e41c3a8dd9a5b0d3d71e6",
      "placeholder": "​",
      "style": "IPY_MODEL_b87c84de30e84b3abf4871461fb9cbd3",
      "value": "Loading checkpoint shards: 100%"
     }
    },
    "41f3b32c2f6b4034ae7a3b9124e28bc7": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "4471ff62258549fba9514bb67050f965": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_9cd5211b5d8b457aa0002f1d17b80028",
       "IPY_MODEL_19127c7bb1554ccbac877059f9a82db0",
       "IPY_MODEL_f4667818b9d34a09891cd727a429a610"
      ],
      "layout": "IPY_MODEL_9ed02dc43412471a9ab47f3620ccf3a5"
     }
    },
    "4540927d98f54466b434ba4c0edf045d": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "487a177d020f4605834878b2fdc7afa3": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "4b1f04ff63d14a118fdd15814dff50e4": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "LabelModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "LabelModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "LabelView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_39789237703c4a418134243055c9cbf5",
      "placeholder": "​",
      "style": "IPY_MODEL_a3a945817f684328b34651fe052393ec",
      "value": "Connecting..."
     }
    },
    "4b27c267393640f28f6eae0875bd2ed9": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "4c727d40ef0443449afc31724ee79f0c": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "4d05314858354e729d76094b3b0ce761": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_c42acf646f344a88b8c11f81e67f7206",
       "IPY_MODEL_7be6f04c284e4326bb4ff3d301e7b3c6",
       "IPY_MODEL_ffdbb12a2f2c4d14911685e7683e0ef0"
      ],
      "layout": "IPY_MODEL_bee3501b2a17427784a717e50a85e7fa"
     }
    },
    "4d468f96ec924681ad65eb671674b93e": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "4f1977d7e4824ef1a14b65f0f42bba10": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "4fd114abe9f5494ab59858949f5055f1": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "500e272208a246089613bf788a165271": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_200df5e79b9244849e589ecb0250a520",
       "IPY_MODEL_cc94432d08464affa3e58b560bdad194",
       "IPY_MODEL_3036608c71904ce9ae4bb2a9fa8802d9"
      ],
      "layout": "IPY_MODEL_adacfdcc1b0140efac56918e9ccf064e"
     }
    },
    "519a7b154022443db6703f04a9142bae": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_d02274afd47b462291c745f261209d42",
      "max": 27341251,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_0f417447a7bd4a33acca96fa37aec877",
      "value": 27341251
     }
    },
    "56e3768bef5a4b9db4168c5c17f509c2": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "590eef89881545aa8bbef9a8bbe7fb00": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "598da69727bd4fb8b1caf465ac736d7a": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "5bdfd87fc6cd4f9dabef7cfee29c8060": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_4d468f96ec924681ad65eb671674b93e",
      "placeholder": "​",
      "style": "IPY_MODEL_ad7599de524549c48bf2d3124ad4b299",
      "value": "Dropping Long Sequences (num_proc=2): 100%"
     }
    },
    "5ca240f31e6b44e3882c5eb37cd5a309": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": "20px"
     }
    },
    "5ca6be24acb548cea130bd58e9954c7c": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "5cea7996f02040b187ece0bb2d6a8d1f": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "5cfb02ee044b4011a378efa8b54a370f": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "5dd7d150dbe04f08b165ce7f2c27cd11": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "5e18768f7ad6434ba8b8b8a2e853e204": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "5e5e15b0569b474c9620083b3ec6af55": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "5e746eb25bbe416fb585fa24e79f5177": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "5eb06edeb58e4930b1affef2a59eae81": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "5f86cd894de94c3280fadc1e2fd0ee13": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_a20927bf5f2c41f58c1e31ac858ab36c",
       "IPY_MODEL_0a46ad75c198463d843fb35e813642cb",
       "IPY_MODEL_09007681cf8d42aeb8c1d2f6a74e470a"
      ],
      "layout": "IPY_MODEL_ebc80d1a55fa47f4a5ea2756588569ec"
     }
    },
    "60c1a0d765c14a1d888317e6a507e4ea": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "62c028fdef904dedb9cdeca2b3bda725": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "62e1a65582f446a78612eaa804e08a7d": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_5e18768f7ad6434ba8b8b8a2e853e204",
      "placeholder": "​",
      "style": "IPY_MODEL_bb33aec33a6447078c31bfd728942994",
      "value": " 728/728 [00:00&lt;00:00, 20.3kB/s]"
     }
    },
    "62e302ebdad64aada0ffe64ae1c873f3": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "63580b6fb30642479fe3000915bf551a": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "63b4e563e85c4f03b1b72beda9577bcc": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "64f54d4a744a4627a07c3c0120276f3b": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_0546d04aae644dde846c58a4afb598a6",
      "max": 9985,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_897b77a56c09479bb11d7f2a30997e55",
      "value": 9985
     }
    },
    "65b75b9b8bc143cf997796af68ff6668": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_81c3db71ac704280ad030072655f1537",
      "placeholder": "​",
      "style": "IPY_MODEL_042e091f75694c47aee761e760e76773",
      "value": " 9985/9985 [00:02&lt;00:00, 3977.47 examples/s]"
     }
    },
    "67da6c4260574869aa24c3cbc1bc1654": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "6932489232ec4ab18a160b1e7fbcdfe1": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "6ebb2ec171414e47a14765505f64bb3c": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "6f05e9bebf7b40c9835808e77de6c236": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "PasswordModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "PasswordModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "PasswordView",
      "continuous_update": true,
      "description": "Token:",
      "description_tooltip": null,
      "disabled": false,
      "layout": "IPY_MODEL_2e257c8be2da40b4bb67a9e4ab6811f3",
      "placeholder": "​",
      "style": "IPY_MODEL_56e3768bef5a4b9db4168c5c17f509c2",
      "value": ""
     }
    },
    "6f3a28b912714c6e931003549664bfa3": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_5ca240f31e6b44e3882c5eb37cd5a309",
      "max": 1,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_5eb06edeb58e4930b1affef2a59eae81",
      "value": 1
     }
    },
    "6f68ed9889f54ad2ae8a3b95ac263a83": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_41f3b32c2f6b4034ae7a3b9124e28bc7",
      "placeholder": "​",
      "style": "IPY_MODEL_a10d0a76010f4e508c65a9b69ebc5156",
      "value": "Tokenizing Prompts (num_proc=2): 100%"
     }
    },
    "704f2f5a9b1c49d5a75a0025a5dda11b": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "71002199df6b40c9a1ac40df5fb27a1b": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "71c8af139cd248b1b51101fd46a93f35": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_d0e9dce55cec4c1ca619a0ccf209d924",
      "max": 9675,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_4c727d40ef0443449afc31724ee79f0c",
      "value": 9675
     }
    },
    "734185351eb543fa9a00a881dcbb9fe7": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "735d4f225b24414294fc1b213c61223c": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "742b1030acfd414bbd9d5327b7e3826d": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "77304d1a46b3468a98483e02ec0ac4a4": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "7baeab52d6694c32b1efd1ea1a0a7782": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_93a44a11aa4846fa8efc6c1413ef1627",
      "placeholder": "​",
      "style": "IPY_MODEL_a55060adc3564407ac81ad7297d34aaa",
      "value": "train.jsonl: 100%"
     }
    },
    "7be6f04c284e4326bb4ff3d301e7b3c6": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "danger",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_9503a45960984adc97b58e16c50662e0",
      "max": 3963750880,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_da6e93f3e4984780b930fe7a706983ea",
      "value": 3963750502
     }
    },
    "7c2485c6cdfe463da6fdb35982a1070d": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_ad1236893754446881e153adc9d5c962",
       "IPY_MODEL_daee63fd167e4441a32324b51b00ad2b",
       "IPY_MODEL_fe41858c6bd04c58840112b67c19a336"
      ],
      "layout": "IPY_MODEL_d262c82138024169b9f3aa034ca756fa"
     }
    },
    "7c95f85a2b1f47a1bd846d110c47bb3c": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_7fd44cf9ca6e4726bfd7ac21846d6a14",
      "placeholder": "​",
      "style": "IPY_MODEL_366a343b62fa47d8985a3bd464d99f9e",
      "value": "config.json: 100%"
     }
    },
    "7cd0b85ebd204b7aba908417811ce4e0": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_7baeab52d6694c32b1efd1ea1a0a7782",
       "IPY_MODEL_519a7b154022443db6703f04a9142bae",
       "IPY_MODEL_d4183e9715f34d249942b8271cca3bdf"
      ],
      "layout": "IPY_MODEL_da2347ac94764a3fa2743343cf0d3cd2"
     }
    },
    "7e5d3774060e4589aa65982da5ea4ef4": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "7fd44cf9ca6e4726bfd7ac21846d6a14": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "80366349d81e4dcc892db6cd56e384f3": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_f8ef805b776145c3bfa9ba8d90972058",
      "max": 9985,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_cc587493c33c4f118d1b1170f85be24c",
      "value": 9985
     }
    },
    "813621384dc748b0ad06775e22761c0b": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "81c3db71ac704280ad030072655f1537": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "82177df57a494de8900c14c2f5185175": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_67da6c4260574869aa24c3cbc1bc1654",
      "placeholder": "​",
      "style": "IPY_MODEL_94b9088614464f60a203de39dbcae853",
      "value": " 8/8 [01:47&lt;00:00, 11.64s/it]"
     }
    },
    "823f1c78f15043e38bbd4dca3932a86a": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_03a3c744d716431488163b4358b80f92",
      "max": 239,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_a5434ee714f9498d83870544b67c0cb7",
      "value": 239
     }
    },
    "835bcc28a5564fb9b3d651bc8e32dc46": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "8640ac440fbc4644b9a3af7ba3ae7183": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "86816687746246b4a6105e8010384e25": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_8640ac440fbc4644b9a3af7ba3ae7183",
      "placeholder": "​",
      "style": "IPY_MODEL_5cea7996f02040b187ece0bb2d6a8d1f",
      "value": "<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.svg\nalt='Hugging Face'> <br> Copy a token from <a\nhref=\"https://huggingface.co/settings/tokens\" target=\"_blank\">your Hugging Face\ntokens page</a> and paste it below. <br> Immediately click login after copying\nyour token or it might be stored in plain text in this notebook file. </center>"
     }
    },
    "879c8ab5873847a8833bd74123be90a4": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_ef223e8504b64e3592589880326aaf41",
      "placeholder": "​",
      "style": "IPY_MODEL_598da69727bd4fb8b1caf465ac736d7a",
      "value": " 1.67M/1.67M [00:00&lt;00:00, 19.0MB/s]"
     }
    },
    "897b77a56c09479bb11d7f2a30997e55": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "8bc9d8ba866c442b9118d9630009939c": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "8c4d4fc5a30f4e7cb3be53fe2adda33d": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "8f5bd719974e41c3a8dd9a5b0d3d71e6": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "8f726dbfb45d4528afa33e36a6313267": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "9327977822be4b1294f80e876552e305": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_37de928300e34184881039378bd75e7f",
      "placeholder": "​",
      "style": "IPY_MODEL_0e936d9dbf9c4fdd86bbfe9730dedc47",
      "value": " 3.96G/3.96G [00:13&lt;00:00, 273MB/s]"
     }
    },
    "936d04b5fe1b4c63bf0b080e423d051b": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "93a44a11aa4846fa8efc6c1413ef1627": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "94b9088614464f60a203de39dbcae853": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "9503a45960984adc97b58e16c50662e0": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "95caff42f08a4c2aa14c867b8f37f231": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_de7c37ee83e24f0c889e84d07279c2ec",
       "IPY_MODEL_9d4897eefb5f48259ffb2d23e332f752",
       "IPY_MODEL_253017b0d0534e54ab44e181f6d7c82d"
      ],
      "layout": "IPY_MODEL_27beaf06e41b472abdb544a43c720c5a"
     }
    },
    "977f799afaac4a55b2dc1cffa7d5b63b": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "97e36007e1304e1583fd81bfb13f0edd": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "9858cb74a09748a39e8149baac96702c": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "9b42e08b3c9548818488268768a118b1": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_d955dcaa0e944e719f3a06139dd54a03",
      "placeholder": "​",
      "style": "IPY_MODEL_d3de2662c7964f1ba96e58da382af720",
      "value": "merges.txt: 100%"
     }
    },
    "9cd5211b5d8b457aa0002f1d17b80028": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_6932489232ec4ab18a160b1e7fbcdfe1",
      "placeholder": "​",
      "style": "IPY_MODEL_4540927d98f54466b434ba4c0edf045d",
      "value": "model-00007-of-00008.safetensors: 100%"
     }
    },
    "9d4897eefb5f48259ffb2d23e332f752": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_30a81da86f8043eca301e86a8651201a",
      "max": 2776833,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_e8b7a81040904c1e89e58978223b1737",
      "value": 2776833
     }
    },
    "9e333ed3b5014069ac1dd969255dd591": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "9ed02dc43412471a9ab47f3620ccf3a5": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "9f1c9a0695384bdaa6f8b847ef89bee8": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ButtonStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ButtonStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "button_color": null,
      "font_weight": ""
     }
    },
    "9f56a2d9979c4bd8928c644c22c3ecdf": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "a0a11e929edd4189b79723d618522c33": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "a10d0a76010f4e508c65a9b69ebc5156": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "a138859f19b74fc0928dc236ab5359db": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_9b42e08b3c9548818488268768a118b1",
       "IPY_MODEL_12b56912736849fea2ad8124456fdc5c",
       "IPY_MODEL_879c8ab5873847a8833bd74123be90a4"
      ],
      "layout": "IPY_MODEL_20352e5f58d24bb8b1f3940efd14fe4a"
     }
    },
    "a1959759c5424da9961fb2a308d4dee4": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_3aaecbf540f54a2db9ab0931e3b1fe57",
      "placeholder": "​",
      "style": "IPY_MODEL_9e333ed3b5014069ac1dd969255dd591",
      "value": " 239/239 [00:00&lt;00:00, 30.9kB/s]"
     }
    },
    "a20927bf5f2c41f58c1e31ac858ab36c": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_1811cda0644e4190a9469d1774435d82",
      "placeholder": "​",
      "style": "IPY_MODEL_35c811d2ae8e43f3b5cecbdd3cfa857f",
      "value": "tokenizer.json: 100%"
     }
    },
    "a3a945817f684328b34651fe052393ec": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "a44f630e099e43899f20a77084ae60cd": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_ed5ca967ad5342929e578ac6aa4dc4c0",
      "placeholder": "​",
      "style": "IPY_MODEL_af401d117d5047629d3a6e2361757b62",
      "value": "model-00001-of-00008.safetensors: 100%"
     }
    },
    "a4e5789584564049b83df7c6c54a3e08": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "a5434ee714f9498d83870544b67c0cb7": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "a55060adc3564407ac81ad7297d34aaa": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "a6f48410b9964fefba0c3009a77dc838": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "a7cf477e80fc43e0ad82c7997b076dce": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "a80410b919e442c49aea15acc1ce1a72": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_fa1282ccc7544e4f818e2f03ccffe4a5",
      "placeholder": "​",
      "style": "IPY_MODEL_bbbf575d2a4b4c6ea8389be79b2a6039",
      "value": "model.safetensors.index.json: 100%"
     }
    },
    "ab93eabd7cea4b94b4b7a387f101e8a1": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "ac764024cf1c4e08ba7749afd2cd20ac": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "ad1236893754446881e153adc9d5c962": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_62e302ebdad64aada0ffe64ae1c873f3",
      "placeholder": "​",
      "style": "IPY_MODEL_bd1b0dfed6d34d16af33a4a58330f5ec",
      "value": "Saving the dataset (1/1 shards): 100%"
     }
    },
    "ad7599de524549c48bf2d3124ad4b299": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "adacfdcc1b0140efac56918e9ccf064e": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "af401d117d5047629d3a6e2361757b62": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "b191ac001a2e4962bc9a245fcdf26e6b": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "b195f160ca20442fadd8b5aed0ee41af": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "b1bea589efa14258a9982071b87938bf": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "b5b65414154544aa8a71b1a39164aad7": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "b634bb73cfa743d09a5999101b840976": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "b82aa8c57f7c422a9a9c90f333ed2a99": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_c0991cf63ee6458b96e9a75e7a88b61a",
       "IPY_MODEL_71c8af139cd248b1b51101fd46a93f35",
       "IPY_MODEL_1d5117195d4b49eb8f1a73b18419f7ce"
      ],
      "layout": "IPY_MODEL_3c21e4a511b4441192c03b7f1d0976e9"
     }
    },
    "b8766a88716948cf968f4563531a76d9": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_2b3a2659b12244bd8548320320016dbf",
      "placeholder": "​",
      "style": "IPY_MODEL_0cd7efffbb3c4c4b972e63749f61ab97",
      "value": "Generating train split: "
     }
    },
    "b87c84de30e84b3abf4871461fb9cbd3": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "b8e39e4dddc3497fbc29ae45c66da759": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "bb33aec33a6447078c31bfd728942994": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "bbbf575d2a4b4c6ea8389be79b2a6039": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "bca2c7185b6749fd899c06a2ba4c5e46": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_0f480e3a0b0a45d2a2d2dec3cad923f3",
      "placeholder": "​",
      "style": "IPY_MODEL_fcb30372e7404c5d8a1ad4df91e6c7b2",
      "value": " 1.91G/1.91G [00:05&lt;00:00, 444MB/s]"
     }
    },
    "bd1b0dfed6d34d16af33a4a58330f5ec": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "be724f04b03942b2a033a7e8898bb4fd": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "bed8726b8069434687c75452e21f19e5": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_fa864b41586f4a7aa56aeafd1d84eb75",
      "max": 9985,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_3225603166b54e7aab766b9964a2f660",
      "value": 9985
     }
    },
    "bee3501b2a17427784a717e50a85e7fa": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "bfcdbba993b74972a9e3e575f86908ff": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "bff139df987d4a62abec6456cb27f3d4": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "danger",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_c1f9c267ba3f40039cdb5eb3267e8043",
      "max": 3963750880,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_33b3b1d0295646edaac7b4822761aeb0",
      "value": 3963750502
     }
    },
    "c0892a1881de4eb4bfabc6a68f87ae99": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_158c8b85dbf34de6a94b4e35e2fc7d5a",
      "placeholder": "​",
      "style": "IPY_MODEL_0b4c9753a7cb4354b8e5f187e6e1ad7c",
      "value": " 3.96G/3.96G [00:15&lt;00:00, 564MB/s]"
     }
    },
    "c0991cf63ee6458b96e9a75e7a88b61a": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_ed28e2e0410d4e0b855467e798e53d66",
      "placeholder": "​",
      "style": "IPY_MODEL_d93f134f802b4b69b575bdaf07dbd27c",
      "value": "tokenizer_config.json: 100%"
     }
    },
    "c12ea43372ac4d57bb9605f1a429b397": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "VBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "VBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "VBoxView",
      "box_style": "",
      "children": [],
      "layout": "IPY_MODEL_131065f118274a1586ac38e39ed84ef0"
     }
    },
    "c1314f241a434c41b45d84dc4d3b30f8": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "c1f9c267ba3f40039cdb5eb3267e8043": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "c33ced495f70464aa4a3a91922090853": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "c3725c7f79fe415fbd1ea336f0cc9cf1": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "danger",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_b191ac001a2e4962bc9a245fcdf26e6b",
      "max": 3841788544,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_054c8dffadba48c6b895a6cc62448ecc",
      "value": 3841788178
     }
    },
    "c3be9109d63c485d9c0ef4f9bc0f9218": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "c42acf646f344a88b8c11f81e67f7206": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_8bc9d8ba866c442b9118d9630009939c",
      "placeholder": "​",
      "style": "IPY_MODEL_9f56a2d9979c4bd8928c644c22c3ecdf",
      "value": "model-00003-of-00008.safetensors: 100%"
     }
    },
    "c6164e05a1914ae48083db9ad7f4ef7c": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "c65dc74c7d6f4bab8f7dd28455161dd8": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "c6e00f5224364822bc4239b176686919": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_2a51b36be41745468e4c2d7a21b1c0d2",
      "max": 36514,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_4fd114abe9f5494ab59858949f5055f1",
      "value": 36514
     }
    },
    "c73055099c084dca996159e23e162d0b": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_e40d1c1ac9494b3bade9858324e7ffdf",
      "placeholder": "​",
      "style": "IPY_MODEL_d65b6b060d9845779299491ac5599c31",
      "value": " 9985/9985 [01:04&lt;00:00, 189.08 examples/s]"
     }
    },
    "c7433acd3c4841e6958ae8f7e87b1808": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "CheckboxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "CheckboxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "CheckboxView",
      "description": "Add token as git credential?",
      "description_tooltip": null,
      "disabled": false,
      "indent": true,
      "layout": "IPY_MODEL_62c028fdef904dedb9cdeca2b3bda725",
      "style": "IPY_MODEL_a7cf477e80fc43e0ad82c7997b076dce",
      "value": false
     }
    },
    "c84cc07789be48aebb322c23d355289e": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_0077aedc3d174560bce924ee89e9c006",
      "placeholder": "​",
      "style": "IPY_MODEL_00321cce58884f6f9b3855a21fcd9187",
      "value": "Add position_id column (Sample Packing) (num_proc=2): 100%"
     }
    },
    "ca65e32eb52f48c09a84b33cb18f22cd": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "cc587493c33c4f118d1b1170f85be24c": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "cc94432d08464affa3e58b560bdad194": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "danger",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_b5b65414154544aa8a71b1a39164aad7",
      "max": 3963750816,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_f0a58fbd0fca4340890041f99fa2f8c8",
      "value": 3963750438
     }
    },
    "ccfcdc95baf646f8aeb3d516742383f2": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "cdebbc55a1164c018546c2ac6f8c620c": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_a44f630e099e43899f20a77084ae60cd",
       "IPY_MODEL_c3725c7f79fe415fbd1ea336f0cc9cf1",
       "IPY_MODEL_0e50870ed0c643e0b6c18cc5d7ddae7f"
      ],
      "layout": "IPY_MODEL_c33ced495f70464aa4a3a91922090853"
     }
    },
    "d02274afd47b462291c745f261209d42": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "d07c8b97d3314f1c852e44bdd40f61ed": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "d0e9dce55cec4c1ca619a0ccf209d924": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "d1f9b10c130542f094c8fd3d1e23b5e9": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "d262c82138024169b9f3aa034ca756fa": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "d3de2662c7964f1ba96e58da382af720": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "d4183e9715f34d249942b8271cca3bdf": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_63580b6fb30642479fe3000915bf551a",
      "placeholder": "​",
      "style": "IPY_MODEL_8f726dbfb45d4528afa33e36a6313267",
      "value": " 27.3M/27.3M [00:00&lt;00:00, 31.0MB/s]"
     }
    },
    "d43c6df07ddb466587807d6dbe1ff614": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_8c4d4fc5a30f4e7cb3be53fe2adda33d",
      "placeholder": "​",
      "style": "IPY_MODEL_e90658f4bcb642baa78426012f863152",
      "value": "model-00004-of-00008.safetensors: 100%"
     }
    },
    "d65b6b060d9845779299491ac5599c31": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "d6fe74e4255444368f8f90a62157d869": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "d93f134f802b4b69b575bdaf07dbd27c": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "d955dcaa0e944e719f3a06139dd54a03": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "da2347ac94764a3fa2743343cf0d3cd2": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "da6e93f3e4984780b930fe7a706983ea": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "daee63fd167e4441a32324b51b00ad2b": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_d07c8b97d3314f1c852e44bdd40f61ed",
      "max": 9985,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_ebb69a2c3d0a4299a484698287b3087c",
      "value": 9985
     }
    },
    "dc892a596f6942d7973c616c38f0eebb": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_c84cc07789be48aebb322c23d355289e",
       "IPY_MODEL_bed8726b8069434687c75452e21f19e5",
       "IPY_MODEL_16a188a0b06d45f980dcf3933509fe0a"
      ],
      "layout": "IPY_MODEL_60c1a0d765c14a1d888317e6a507e4ea"
     }
    },
    "dd0e646fad3f4a89ba23b39d162bd8d9": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_d43c6df07ddb466587807d6dbe1ff614",
       "IPY_MODEL_e0e8b840b8ea4d0d9db09afe99fa287d",
       "IPY_MODEL_9327977822be4b1294f80e876552e305"
      ],
      "layout": "IPY_MODEL_77304d1a46b3468a98483e02ec0ac4a4"
     }
    },
    "de7c37ee83e24f0c889e84d07279c2ec": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_34cf3df51fbc41cabfdbba153c007f0e",
      "placeholder": "​",
      "style": "IPY_MODEL_ac764024cf1c4e08ba7749afd2cd20ac",
      "value": "vocab.json: 100%"
     }
    },
    "dfd2a2649b8341ef913207526708aff1": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "e09f1bcbb9d94c09be53e5e1303642c2": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_e7d8e4fe58384e93a106de546068c65e",
      "max": 8,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_0aa8ab56b85f4171a79c3bc210594025",
      "value": 8
     }
    },
    "e0e8b840b8ea4d0d9db09afe99fa287d": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "danger",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_f7434f3e03124a1c938a39af79d7fa59",
      "max": 3963750880,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_c1314f241a434c41b45d84dc4d3b30f8",
      "value": 3963750502
     }
    },
    "e21e180307e5485cbbe908672fd6639a": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_2e2b0c1599c341a198f632f46a40c90e",
       "IPY_MODEL_bff139df987d4a62abec6456cb27f3d4",
       "IPY_MODEL_ebe1cc366d324ad59b264c8b3c431441"
      ],
      "layout": "IPY_MODEL_114dece49dba437c8572ef94b23c3b1e"
     }
    },
    "e366ae3fceec4566b9ed303d6c5f90af": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "e3fb3fc6afe04b3c9b7ac61809ce78fa": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_c6164e05a1914ae48083db9ad7f4ef7c",
      "placeholder": "​",
      "style": "IPY_MODEL_813621384dc748b0ad06775e22761c0b",
      "value": " 9985/9985 [00:03&lt;00:00, 3622.89 examples/s]"
     }
    },
    "e400cbf14bcc446a9d33b210cd93550b": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "e40d1c1ac9494b3bade9858324e7ffdf": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "e575d87a7efe4ec7b1efde489839d4a6": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "e5a82df528bb4e408797a3b6c2758f4a": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "e6e969610738449887259063967f82b0": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "e7d8e4fe58384e93a106de546068c65e": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "e87ea87fcff247b5bbcc331ba79a8dc2": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "e8b7a81040904c1e89e58978223b1737": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "e90658f4bcb642baa78426012f863152": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "eb1c9535e6a546098b760528b2ea387c": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_18357b321ce44d7b8bd9d1c886f69275",
       "IPY_MODEL_279937fe03bc4e4eb25b472d7e9df163",
       "IPY_MODEL_bca2c7185b6749fd899c06a2ba4c5e46"
      ],
      "layout": "IPY_MODEL_1f7d30f71bbd4547a9150d21da071055"
     }
    },
    "ebb69a2c3d0a4299a484698287b3087c": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "ebc80d1a55fa47f4a5ea2756588569ec": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "ebe1cc366d324ad59b264c8b3c431441": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_fba7aa824b38467ab3061b226114cdec",
      "placeholder": "​",
      "style": "IPY_MODEL_f3075dccbd2747b4a7913b66f44f2596",
      "value": " 3.96G/3.96G [00:13&lt;00:00, 398MB/s]"
     }
    },
    "ec030fc3c346426f9abc3a89892258d3": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "success",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_dfd2a2649b8341ef913207526708aff1",
      "max": 9985,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_4f1977d7e4824ef1a14b65f0f42bba10",
      "value": 9985
     }
    },
    "ec11d1e5ae7b42c883d9b1f38a65356e": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_936d04b5fe1b4c63bf0b080e423d051b",
      "placeholder": "​",
      "style": "IPY_MODEL_f1cef8e8dc2646fb9fd09f3b09081074",
      "value": " 36.5k/36.5k [00:00&lt;00:00, 4.32MB/s]"
     }
    },
    "ed28e2e0410d4e0b855467e798e53d66": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "ed5ca967ad5342929e578ac6aa4dc4c0": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "edc99591b9c747b689b94d0052fec14c": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "ef0a3c7a6f14460fb4da096928ae249e": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_07fb3a2c8315494e97b447e672dfae06",
       "IPY_MODEL_ec030fc3c346426f9abc3a89892258d3",
       "IPY_MODEL_e3fb3fc6afe04b3c9b7ac61809ce78fa"
      ],
      "layout": "IPY_MODEL_c3be9109d63c485d9c0ef4f9bc0f9218"
     }
    },
    "ef223e8504b64e3592589880326aaf41": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "f0a58fbd0fca4340890041f99fa2f8c8": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "ProgressStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "ProgressStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "bar_color": null,
      "description_width": ""
     }
    },
    "f113ebd8c1c34806bea4dd7ed3035173": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "f1cef8e8dc2646fb9fd09f3b09081074": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "f3075dccbd2747b4a7913b66f44f2596": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "f365820a3d3c42b2948abfe32065de14": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_735d4f225b24414294fc1b213c61223c",
      "placeholder": "​",
      "style": "IPY_MODEL_5e5e15b0569b474c9620083b3ec6af55",
      "value": "generation_config.json: 100%"
     }
    },
    "f4667818b9d34a09891cd727a429a610": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_4b27c267393640f28f6eae0875bd2ed9",
      "placeholder": "​",
      "style": "IPY_MODEL_9858cb74a09748a39e8149baac96702c",
      "value": " 3.96G/3.96G [00:11&lt;00:00, 457MB/s]"
     }
    },
    "f4a1795dc7514a718f478245f521f0ba": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "f60a2bdb6b6b4e0e8c3508580e247132": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "FloatProgressModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "FloatProgressModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "ProgressView",
      "bar_style": "danger",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_edc99591b9c747b689b94d0052fec14c",
      "max": 3963750880,
      "min": 0,
      "orientation": "horizontal",
      "style": "IPY_MODEL_35cc989ca3374e7dba0cb166febc4bde",
      "value": 3963750502
     }
    },
    "f7434f3e03124a1c938a39af79d7fa59": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "f8ef805b776145c3bfa9ba8d90972058": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "fa1282ccc7544e4f818e2f03ccffe4a5": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "fa864b41586f4a7aa56aeafd1d84eb75": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "fba7aa824b38467ab3061b226114cdec": {
     "model_module": "@jupyter-widgets/base",
     "model_module_version": "1.2.0",
     "model_name": "LayoutModel",
     "state": {
      "_model_module": "@jupyter-widgets/base",
      "_model_module_version": "1.2.0",
      "_model_name": "LayoutModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "LayoutView",
      "align_content": null,
      "align_items": null,
      "align_self": null,
      "border": null,
      "bottom": null,
      "display": null,
      "flex": null,
      "flex_flow": null,
      "grid_area": null,
      "grid_auto_columns": null,
      "grid_auto_flow": null,
      "grid_auto_rows": null,
      "grid_column": null,
      "grid_gap": null,
      "grid_row": null,
      "grid_template_areas": null,
      "grid_template_columns": null,
      "grid_template_rows": null,
      "height": null,
      "justify_content": null,
      "justify_items": null,
      "left": null,
      "margin": null,
      "max_height": null,
      "max_width": null,
      "min_height": null,
      "min_width": null,
      "object_fit": null,
      "object_position": null,
      "order": null,
      "overflow": null,
      "overflow_x": null,
      "overflow_y": null,
      "padding": null,
      "right": null,
      "top": null,
      "visibility": null,
      "width": null
     }
    },
    "fcb30372e7404c5d8a1ad4df91e6c7b2": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "fcbab4d8dced41a18dfccce81e3a45a0": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "fd4f333f7ece4450b04e1a9af1f9d2f6": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_d1f9b10c130542f094c8fd3d1e23b5e9",
      "placeholder": "​",
      "style": "IPY_MODEL_e575d87a7efe4ec7b1efde489839d4a6",
      "value": "model-00006-of-00008.safetensors: 100%"
     }
    },
    "fe18bba7f3fb4c31bf840541f36b3425": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_fd4f333f7ece4450b04e1a9af1f9d2f6",
       "IPY_MODEL_f60a2bdb6b6b4e0e8c3508580e247132",
       "IPY_MODEL_c0892a1881de4eb4bfabc6a68f87ae99"
      ],
      "layout": "IPY_MODEL_1bec6297c90242a88672d195bc09d429"
     }
    },
    "fe41858c6bd04c58840112b67c19a336": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_e5a82df528bb4e408797a3b6c2758f4a",
      "placeholder": "​",
      "style": "IPY_MODEL_f113ebd8c1c34806bea4dd7ed3035173",
      "value": " 9985/9985 [00:00&lt;00:00, 44264.88 examples/s]"
     }
    },
    "fea1b70fb46745feb5111b3929175b5d": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HBoxModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HBoxModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HBoxView",
      "box_style": "",
      "children": [
       "IPY_MODEL_f365820a3d3c42b2948abfe32065de14",
       "IPY_MODEL_823f1c78f15043e38bbd4dca3932a86a",
       "IPY_MODEL_a1959759c5424da9961fb2a308d4dee4"
      ],
      "layout": "IPY_MODEL_34c9c0137b504cd799c6bd6de69507c2"
     }
    },
    "ff3a94b146a948b6907f5d80c7157f99": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "DescriptionStyleModel",
     "state": {
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "DescriptionStyleModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/base",
      "_view_module_version": "1.2.0",
      "_view_name": "StyleView",
      "description_width": ""
     }
    },
    "ffdbb12a2f2c4d14911685e7683e0ef0": {
     "model_module": "@jupyter-widgets/controls",
     "model_module_version": "1.5.0",
     "model_name": "HTMLModel",
     "state": {
      "_dom_classes": [],
      "_model_module": "@jupyter-widgets/controls",
      "_model_module_version": "1.5.0",
      "_model_name": "HTMLModel",
      "_view_count": null,
      "_view_module": "@jupyter-widgets/controls",
      "_view_module_version": "1.5.0",
      "_view_name": "HTMLView",
      "description": "",
      "description_tooltip": null,
      "layout": "IPY_MODEL_ab93eabd7cea4b94b4b7a387f101e8a1",
      "placeholder": "​",
      "style": "IPY_MODEL_704f2f5a9b1c49d5a75a0025a5dda11b",
      "value": " 3.96G/3.96G [00:12&lt;00:00, 656MB/s]"
     }
    }
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
}


================================================
FILE: examples/deepcogito/cogito-v1-preview-llama-3B-lora.yml
================================================
base_model: deepcogito/cogito-v1-preview-llama-3B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false
strict: false

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true
eval_sample_packing: false


adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/deepcogito/cogito-v1-preview-qwen-14B-lora.yml
================================================
base_model: deepcogito/cogito-v1-preview-qwen-14B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false
strict: false

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true
eval_sample_packing: false


adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/deepseek-v2/fft-fsdp-16b.yaml
================================================
base_model: deepseek-ai/DeepSeek-V2-Lite
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 2048
sample_packing: true


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5

bf16: auto
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 2
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/deepseek-v2/qlora-fsdp-2_5.yaml
================================================
base_model: axolotl-quants/DeepSeek-V2.5-bnb-nf4-bf16
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false
load_in_4bit: true


plugins:
  - axolotl.integrations.liger.LigerPlugin
liger_rms_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true

chat_template: deepseek_v2
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 4096
sample_packing: true


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

adapter: qlora
lora_r: 256
lora_alpha: 256
lora_target_linear: true
peft_use_rslora: true

gradient_accumulation_steps: 1
micro_batch_size: 8
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5

bf16: auto
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 2
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: DeepseekV2DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/devstral/README.md
================================================
# Finetune Devstral with Axolotl

Devstral Small is a 24B parameter opensource model from MistralAI found on HuggingFace [Devstral-Small-2505](https://huggingface.co/mistralai/Devstral-Small-2505) and [Devstral-Small-2507](https://huggingface.co/mistralai/Devstral-Small-2507). `Devstral-Small-2507` is the latest version of the model and has [function calling](https://mistralai.github.io/mistral-common/usage/tools/) support.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations with proper masking.

The model was fine-tuned ontop of [Mistral-Small-3.1](https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Base-2503) without the vision layer and has a context of up to 128k tokens.

Thanks to the team at MistralAI for giving us early access to prepare for this release.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

    Here is an example of how to install from pip:

```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
```

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage

```bash
python scripts/cutcrossentropy_install.py | sh
```

3. Run the finetuning example:

```bash
axolotl train examples/devstral/devstral-small-qlora.yml
```

This config uses about 21GB VRAM.

Let us know how it goes. Happy finetuning! 🚀

### TIPS

- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
- Learn how to use function calling with Axolotl at [docs](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#using-tool-use).

## Optimization Guides

- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)
- [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy)
- [Liger Kernel](https://docs.axolotl.ai/docs/custom_integrations.html#liger-kernels)

## Limitations

We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only.

In addition, we do not support overriding tokens yet.

## Related Resources

- [MistralAI Devstral Blog](https://mistral.ai/news/devstral)
- [MistralAI Devstral 1.1 Blog](https://mistral.ai/news/devstral-2507)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


## Future Work

- Add parity to Preference Tuning, RL, Multi-modal, etc.
- Add parity to other tokenizer configs like overriding tokens.


================================================
FILE: examples/devstral/devstral-small-qlora.yml
================================================
base_model: mistralai/Devstral-Small-2507

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true

load_in_8bit: false
load_in_4bit: true

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/qlora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true


lora_r: 32
lora_alpha: 16
lora_dropout: 0
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
scaling_softmax: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.05
evals_per_epoch: 4
saves_per_epoch: 1

weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/distributed-parallel/README.md
================================================
# ND Parallelism Examples

This directory contains example configurations for training models using ND Parallelism in Axolotl. These examples demonstrate how to compose different parallelism strategies (FSDP, TP, CP, HSDP) for efficient multi-GPU training.

## Quick Start

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Run the command below:

```bash
# Train Qwen3 8B with FSDP + TP + CP on a single 8-GPU node
axolotl train examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml

# Train Llama 3.1 8B with HSDP + TP on 2 nodes (16 GPUs total)
axolotl train examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml
```

## Example Configurations

### Single Node (8 GPUs)

**Qwen3 8B with FSDP + TP + CP** ([qwen3-8b-fsdp-tp-cp.yaml](./qwen3-8b-fsdp-tp-cp.yaml))
- Uses all 3 parallelism dimensions on a single node
- Ideal for: when model weights, activations, and/or context are too large to fit on single GPU

```yaml
dp_shard_size: 2         # FSDP across 2 GPUs
tensor_parallel_size: 2  # TP across 2 GPUs
context_parallel_size: 2 # CP across 2 GPUs
# Total: 2 × 2 × 2 = 8 GPUs
```

### Multi-Node

**Llama 3.1 8B with HSDP + TP** ([llama-3_1-8b-hsdp-tp.yaml](./llama-3_1-8b-hsdp-tp.yaml))
- FSDP & TP within nodes, DDP across nodes to minimize inter-node communication
- Ideal for: Scaling to multiple nodes while maintaining training efficiency

```yaml
dp_shard_size: 4        # FSDP within each 4-GPU group
tensor_parallel_size: 2 # TP within each node
dp_replicate_size: 2    # DDP across 2 groups
# Total: (4 × 2) × 2 = 16 GPUs (2 nodes)
```

## Learn More

- [ND Parallelism Documentation](https://docs.axolotl.ai/docs/nd_parallelism.html)
- [Blog: Accelerate ND-Parallel Guide](https://huggingface.co/blog/accelerate-nd-parallel)
- [Multi-GPU Training Guide](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/distributed-parallel/llama-3_1-8b-hsdp-tp.yaml
================================================
base_model: meta-llama/Llama-3.1-8B

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

dp_shard_size: 4
dp_replicate_size: 2
tensor_parallel_size: 2
# context_parallel_size: 2

dataset_prepared_path: last_run_prepared

special_tokens:
  pad_token: <|end_of_text|>

fsdp_version: 2
fsdp_config:
  offload_params: false
  state_dict_type: FULL_STATE_DICT
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: LlamaDecoderLayer
  reshard_after_forward: true

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca

output_dir: ./outputs/ndp-out/

sequence_len: 2048
sample_packing: true
flash_attention: true

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 2
optimizer: adamw_torch_fused
lr_scheduler: constant_with_warmup
learning_rate: 2e-6

bf16: true
tf32: true

logging_steps: 1
saves_per_epoch: 1

warmup_ratio: 0.1


================================================
FILE: examples/distributed-parallel/qwen3-8b-fsdp-tp-cp.yaml
================================================
base_model: Qwen/Qwen3-8B

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

dp_shard_size: 2
# dp_replicate_size: 1
context_parallel_size: 2
tensor_parallel_size: 2

dataset_prepared_path: last_run_prepared

fsdp_version: 2
fsdp_config:
  offload_params: false
  state_dict_type: FULL_STATE_DICT
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen3DecoderLayer
  reshard_after_forward: true

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca

output_dir: ./outputs/ndp-out/

sequence_len: 8192
sample_packing: true
flash_attention: true

gradient_accumulation_steps: 1
micro_batch_size: 1  # must be 1 when using context parallel
num_epochs: 2
optimizer: adamw_torch_fused
lr_scheduler: constant_with_warmup
learning_rate: 2e-6

bf16: true
tf32: true

logging_steps: 1
saves_per_epoch: 1

warmup_ratio: 0.1

special_tokens:


================================================
FILE: examples/eaft/eaft-example.yml
================================================
base_model: google/gemma-3-1b-it

model_type: Gemma3ForCausalLM
cls_model_config: Gemma3TextConfig

# gemma3 doesn't seem to play nice with ddp
ddp_find_unused_parameters: true

chat_template: gemma3
eot_tokens:
  - <end_of_turn>

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path:
val_set_size: 0
output_dir: ./outputs/eaft-gemma-3-1b

use_eaft: true
eaft_alpha: 1.0
eaft_k: 20

sequence_len: 1024
sample_packing: false

adapter:
lora_model_dir:

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
eval_batch_size: 1
max_steps: 1000
evaluation_strategy: "no"
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 5e-5

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false

early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_ratio: 0.1
weight_decay: 0.0
debug:
deepspeed:
fsdp:
fsdp_config:
special_tokens:


================================================
FILE: examples/falcon-h1/falcon-h1-1b-deep-qlora.yaml
================================================
base_model: tiiuae/Falcon-H1-1.5B-Deep-Base
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

# huggingface repo
chat_template: falcon_h1
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

val_set_size: 0.0
output_dir: ./outputs/out

adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - in_proj
  - gate_proj
  - up_proj
  - down_proj

sequence_len: 2048
sample_packing: false
eval_sample_packing: false


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:


gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/falcon-h1/falcon-h1-1b-qlora.yaml
================================================
base_model: tiiuae/Falcon-H1-1.5B-Base
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

# huggingface repo
chat_template: falcon_h1
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

val_set_size: 0.0
output_dir: ./outputs/out

adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - in_proj
  - gate_proj
  - up_proj
  - down_proj

sequence_len: 2048
sample_packing: false
eval_sample_packing: false


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/falcon-h1/falcon-h1-34b-qlora.yaml
================================================
base_model: tiiuae/Falcon-H1-34B-Base
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

# huggingface repo
chat_template: falcon_h1
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

val_set_size: 0.0
output_dir: ./outputs/out

adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - in_proj
  - gate_proj
  - up_proj
  - down_proj

sequence_len: 2048
sample_packing: false
eval_sample_packing: false


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:


gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/falcon-h1/falcon-h1-3b-qlora.yaml
================================================
base_model: tiiuae/Falcon-H1-3B-Base
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

# huggingface repo
chat_template: falcon_h1
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

val_set_size: 0.0
output_dir: ./outputs/out

adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - in_proj
  - gate_proj
  - up_proj
  - down_proj

sequence_len: 2048
sample_packing: false
eval_sample_packing: false


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:


gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/falcon-h1/falcon-h1-500m-qlora.yaml
================================================
base_model: tiiuae/Falcon-H1-0.5B-Instruct
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

# huggingface repo
chat_template: falcon_h1
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

val_set_size: 0.0
output_dir: ./outputs/out

adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - in_proj
  - gate_proj
  - up_proj
  - down_proj

sequence_len: 2048
sample_packing: false
eval_sample_packing: false


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:


gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/falcon-h1/falcon-h1-7b-qlora.yaml
================================================
base_model: tiiuae/Falcon-H1-7B-Base
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

# huggingface repo
chat_template: falcon_h1
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

val_set_size: 0.0
output_dir: ./outputs/out

adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - in_proj
  - gate_proj
  - up_proj
  - down_proj

sequence_len: 2048
sample_packing: false
eval_sample_packing: false


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:


gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/gemma2/qlora.yml
================================================
base_model: google/gemma-2-9b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

# huggingface repo
chat_template: gemma
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    drop_system_message: true
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

val_set_size: 0.0
output_dir: ./outputs/out

adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

sequence_len: 2048
sample_packing: true
eval_sample_packing: false


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:


gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/gemma2/reward-model.yaml
================================================
base_model: google/gemma-2-2b
# optionally might have model_type or tokenizer_type
model_type: AutoModelForSequenceClassification
num_labels: 1
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

reward_model: true
chat_template: gemma
datasets:
  - path: argilla/distilabel-intel-orca-dpo-pairs
    type: bradley_terry.chat_template
val_set_size: 0.0
output_dir: ./outputs/out
remove_unused_columns: false

sequence_len: 2048
sample_packing: false
eval_sample_packing: false


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:


gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/gemma3/gemma-3-1b-qlora.yml
================================================
base_model: google/gemma-3-1b-it

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# gemma3 doesn't seem to play nice with ddp
ddp_find_unused_parameters: true

load_in_8bit: false
load_in_4bit: true

# huggingface repo
chat_template: gemma3
eot_tokens:
  - <end_of_turn>
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

val_set_size: 0.0
output_dir: ./outputs/out

# Freeze vision tower
unfrozen_parameters:
  - ^model\.language_model\..*
  - ^lm_head\..*

adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0
lora_target_linear: true

sequence_len: 2048
sample_packing: true
eval_sample_packing: false


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:


gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/gemma3/gemma-3-270m-qlora.yml
================================================
base_model: google/gemma-3-270m-it

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# gemma3 doesn't seem to play nice with ddp
ddp_find_unused_parameters: true

load_in_8bit: false
load_in_4bit: true

# huggingface repo
chat_template: gemma3
eot_tokens:
  - <end_of_turn>
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

val_set_size: 0.0
output_dir: ./outputs/out

# Freeze vision tower
unfrozen_parameters:
  - ^model\.language_model\..*
  - ^lm_head\..*

adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0
lora_target_linear: true

sequence_len: 2048
sample_packing: true
eval_sample_packing: false


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:


gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/gemma3/gemma-3-4b-qlora.yml
================================================
base_model: google/gemma-3-4b-it

load_in_4bit: true

# gemma3 doesn't seem to play nice with ddp
ddp_find_unused_parameters: true

chat_template: gemma3
eot_tokens:
  - <end_of_turn>
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

# Freeze vision tower
unfrozen_parameters:
  - ^model\.language_model\..*
  - ^lm_head\..*

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true


lora_r: 32
lora_alpha: 16
lora_dropout: 0
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
logging_steps: 1
flash_attention: true
eager_attention:

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/gemma3/gemma-3-4b-vision-qlora.yml
================================================
base_model: google/gemma-3-4b-it
processor_type: AutoProcessor

load_in_4bit: true

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

# gemma3 doesn't seem to play nice with ddp
ddp_find_unused_parameters: true

chat_template: gemma3
eot_tokens:
  - <end_of_turn>
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]

dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

adapter: qlora
lora_model_dir:

sequence_len: 2048
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
logging_steps: 1
flash_attention: true
eager_attention:

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/gemma3n/README.md
================================================
# Finetune Gemma-3n with Axolotl

Gemma-3n is a family of multimodal models from Google found on [HuggingFace](https://huggingface.co/collections/google/gemma-3n-685065323f5984ef315c93f4). This guide shows how to fine-tune it with Axolotl.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

    Here is an example of how to install from pip:

```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
```

2. In addition to Axolotl's requirements, Gemma-3n requires:

```bash
pip3 install timm==1.0.17

# for loading audio data
pip3 install librosa==0.11.0
```

3. Download sample dataset files

```bash
# for text + vision + audio only
wget https://huggingface.co/datasets/Nanobit/text-vision-audio-2k-test/resolve/main/African_elephant.jpg
wget https://huggingface.co/datasets/Nanobit/text-vision-audio-2k-test/resolve/main/En-us-African_elephant.oga
```

4. Run the finetuning example:

```bash
# text only
axolotl train examples/gemma3n/gemma-3n-e2b-qlora.yml

# text + vision
axolotl train examples/gemma3n/gemma-3n-e2b-vision-qlora.yml

# text + vision + audio
axolotl train examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml
```

Let us know how it goes. Happy finetuning! 🚀

WARNING: The loss and grad norm will be much higher than normal. We suspect this to be inherent to the model as of the moment. If anyone would like to submit a fix for this, we are happy to take a look.

### TIPS

- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
- The multimodal dataset format follows the OpenAI multi-content Messages format as seen [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).

## Optimization Guides

- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)

## Related Resources

- [Gemma 3n Blog](https://ai.google.dev/gemma/docs/gemma-3n)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/gemma3n/gemma-3n-e2b-qlora.yml
================================================
base_model: google/gemma-3n-E2B-it

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true

load_in_8bit: false
load_in_4bit: true

# for use with fft to only train on language model layers
# unfrozen_parameters:
  # - model.language_model.*
  # - lm_head
  # - embed_tokens


chat_template: gemma3n
eot_tokens:
  - <end_of_turn>
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    split: train[:1%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

val_set_size: 0.0
output_dir: ./outputs/out

adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
# lora_target_linear: # Does not work with gemma3n currently
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'

sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
# flash_attention: true  # Any attention impl does not work with gemma3n now

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/gemma3n/gemma-3n-e2b-vision-audio-qlora.yml
================================================
base_model: google/gemma-3n-E2B-it
processor_type: AutoProcessor

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true

# for use with fft to only train on language model layers
# unfrozen_parameters:
  # - model.language_model.*
  # - lm_head
  # - embed_tokens

load_in_4bit: true

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

# gemma3 doesn't seem to play nice with ddp
ddp_find_unused_parameters: true

chat_template: gemma3n
eot_tokens:
  - <end_of_turn>

# sample dataset below requires downloading audio/image in advance
# wget https://huggingface.co/datasets/Nanobit/text-vision-audio-2k-test/resolve/main/African_elephant.jpg
# wget https://huggingface.co/datasets/Nanobit/text-vision-audio-2k-test/resolve/main/En-us-African_elephant.oga
datasets:
  - path: Nanobit/text-vision-audio-2k-test
    type: chat_template
dataset_prepared_path:
val_set_size: 0.01
output_dir: ./outputs/out

adapter: qlora
lora_model_dir:

sequence_len: 2048
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
logging_steps: 1
# flash_attention: true  # Any attention impl does not work with gemma3n now

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0


================================================
FILE: examples/gemma3n/gemma-3n-e2b-vision-qlora.yml
================================================
base_model: google/gemma-3n-E2B-it
processor_type: AutoProcessor

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
cut_cross_entropy: true

# for use with fft to only train on language model layers
# unfrozen_parameters:
  # - model.language_model.*
  # - lm_head
  # - embed_tokens

load_in_4bit: true

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

# gemma3 doesn't seem to play nice with ddp
ddp_find_unused_parameters: true

chat_template: gemma3n
eot_tokens:
  - <end_of_turn>
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]
dataset_prepared_path:
val_set_size: 0.01
output_dir: ./outputs/out

adapter: qlora
lora_model_dir:

sequence_len: 2048
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
logging_steps: 1
# flash_attention: true  # Any attention impl does not work with gemma3n now

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0


================================================
FILE: examples/glm4/qlora-32b.yaml
================================================
base_model: THUDM/GLM-4-32B-0414
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_4bit: true

datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/qlora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true
eval_sample_packing: true


lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/glm45/README.md
================================================
# Finetune Z.ai's GLM-4.5-Air with Axolotl

[GLM-4.5-Air](https://huggingface.co/zai-org/GLM-4.5-Air) is a MoE model by Z.ai.

This guide shows how to fine-tune it with Axolotl.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Run the finetuning example:

```bash
# QLoRA (1x80GB @ ~63.4GiB/GPU)
axolotl train examples/glm45/glm-45-air-qlora.yaml
```

### Dataset

In addition to the standard OpenAI Messages format, GLM-4.5 supports an extra parameter for thinking in the assistant section.

```json
{
    "role": "assistant",
    "reasoning_content": "...",  // or have </think>...</think> in `content`
    "content": "..."
}
```

Make sure you set the below extra attributes if needed:

```yaml
datasets:
  - path: ...
    type: chat_template
    message_property_mappings:
      role: role
      content: content

    #   tool_calls: tool_calls  # uncomment if using tools
    #   reasoning_content: reasoning_content  # uncomment if have reasoning

# Uncomment if training on tool role (you would rarely if ever need this)
# eot_tokens:
#   - <|observation|>
```

### Tips

- The role name for tools in this template is `tool`.
- You will see this Axolotl WARNING — this is expected as the template does not use EOS:
  ```
  EOS token '<|endoftext|>' not found in chat_template. Please check if your template/EOS token is correct.
  ```
- You can run a full finetuning by removing `adapter: qlora`, `load_in_4bit: true`, and `quantize_moe_experts: true` from the config.
- **LoRA kernels**: Incompatible with this model. Must be explicitly disabled (`lora_*_kernel: false`).
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [GLM-4.5-Air on HuggingFace](https://huggingface.co/zai-org/GLM-4.5-Air)
- [GLM-4.5 Blog](https://z.ai/blog/glm-4.5)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/glm45/glm-45-air-qlora.yaml
================================================
base_model: zai-org/GLM-4.5-Air

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

quantize_moe_experts: true # important

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 16
lora_alpha: 8
lora_dropout: 0
lora_target_modules:
  - q_proj
  - v_proj
  - k_proj
  - o_proj

# lora_target_parameters:
#   - mlp.experts.gate_up_proj
#   - mlp.experts.down_proj

lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/glm46v/README.md
================================================
# Finetune GLM-4.6V with Axolotl

GLM-4.6V is a family of vision-language models from ZhipuAI found on [HuggingFace](https://huggingface.co/zai-org/GLM-4.6V). This guide shows how to fine-tune it with Axolotl for vision-language tasks.


## Getting started

1. Install Axolotl from source following the [installation guide](https://docs.axolotl.ai/docs/installation.html#sec-edge-build).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.


3. Run the fine-tuning:

    glm-4-6v-flash(9B)
    ```bash
    axolotl train examples/glm46v/glm-4-6v-flash-qlora.yaml
    ```

Let us know how it goes. Happy finetuning! 🚀

## Tips

- Vision datasets should follow the format described in the [multimodal docs](https://docs.axolotl.ai/docs/multimodal.html#dataset-format)
- You can run a **full finetuning** by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset in the [dataset loading docs](https://docs.axolotl.ai/docs/dataset_loading.html).

## Supported Models

- **GLM-4.6V**: Full vision-language model (`zai-org/GLM-4.6V`)
- **GLM-4.6V-Flash**: Faster variant (`zai-org/GLM-4.6V-Flash`)

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [ZhipuAI GLM-4.6V](https://huggingface.co/zai-org/GLM-4.6V)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/glm46v/glm-4-6v-flash-ddp.yaml
================================================
base_model: zai-org/GLM-4.6V-Flash
trust_remote_code: true

processor_type: AutoProcessor
load_in_4bit: true

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false
ddp_find_unused_parameters: true

output_dir: ./outputs/glm-4-6v-flash-qlora
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]

adapter: qlora
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

sequence_len: 2048

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
logging_steps: 1
sdp_attention: true

warmup_ratio: 0.1
evals_per_epoch: 0
saves_per_epoch: 1
weight_decay: 0.0


================================================
FILE: examples/glm46v/glm-4-6v-flash-qlora.yaml
================================================
base_model: zai-org/GLM-4.6V-Flash
trust_remote_code: true

processor_type: AutoProcessor
load_in_4bit: true

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

output_dir: ./outputs/glm-4-6v-flash-qlora
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]

adapter: qlora
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

sequence_len: 2048

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
logging_steps: 1
sdp_attention: true

warmup_ratio: 0.1
evals_per_epoch: 0
saves_per_epoch: 1
weight_decay: 0.0


================================================
FILE: examples/glm47-flash/README.md
================================================
# Finetune Z.ai's GLM-4.7-Flash with Axolotl

[GLM-4.7-Flash](https://huggingface.co/zai-org/GLM-4.7-Flash) is a 30B-A3B MoE model by Z.ai.

This guide shows how to fine-tune it with Axolotl.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Run the finetuning example:

```bash
# QLoRA
# - no target experts (1x48GB @ ~24GiB/GPU)
# - target experts (1x48GB @ ~34GiB/GPU)
axolotl train examples/glm47-flash/qlora.yaml

# QLoRA FSDP2 no target experts (2x48GB @ ~29GiB/GPU)
axolotl train examples/glm47-flash/qlora_fsdp.yaml
```

```bash
# LoRA
# - no target experts (1x48GB @ ~35GiB/GPU)
# - target experts (1x48GB @ OOM. Projected ~45-50GiB/GPU)
axolotl train examples/glm47-flash/lora.yaml

# LoRA FSDP2 no target experts (2x48GB @ ~43GiB/GPU)
axolotl train examples/glm47-flash/lora_fsdp.yaml
```

### MoE Expert Quantization & Expert LoRA

This model quantize expert weights on load. To learn about expert quantization, expert LoRA targeting, and related limitations, see the [MoE Expert Quantization](https://docs.axolotl.ai/docs/expert_quantization.html) docs.

## Limitations

- **lora_target_linear**: Incompatible for this model.
- **LoRA kernels**: Incompatible with this model due to non-standard attention projections (DSA). Must be explicitly disabled (`lora_*_kernel: false`).


### TIPS

- For inference, the official Z.ai team recommends these default settings (most tasks):
  - `temperature: 1.0`
  - `top_p: 0.95`
  - `max_new_tokens: 131072`
- You can run a full finetuning by removing `adapter: qlora`, `load_in_4bit: true`, and `quantize_moe_experts: true` from the config. This is heavy, so we have not tested this.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [GLM-4.7-Flash on HuggingFace](https://huggingface.co/zai-org/GLM-4.7-Flash)
- [GLM-4.7 Blog](https://z.ai/blog/glm-4.7)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/glm47-flash/lora.yaml
================================================
base_model: zai-org/GLM-4.7-Flash

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: true
quantize_moe_experts: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/glm4.7-flash-lora-8bit-out

adapter: lora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0
lora_target_modules:
  - q_proj
  - v_proj
  - k_proj
  - o_proj

# Uncomment to also target MoE expert weights:
# lora_target_parameters:
#   - mlp.experts.gate_up_proj
#   - mlp.experts.down_proj

# LoRA kernels incompatible with DSA attention
lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1


================================================
FILE: examples/glm47-flash/lora_fsdp.yaml
================================================
base_model: zai-org/GLM-4.7-Flash

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: true
quantize_moe_experts: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/glm4.7-flash-lora-8bit-fsdp-out

adapter: lora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0
lora_target_modules:
  - q_proj
  - v_proj
  - k_proj
  - o_proj

# Uncomment to also target MoE expert weights:
# lora_target_parameters:
#   - mlp.experts.gate_up_proj
#   - mlp.experts.down_proj

# LoRA kernels incompatible with DSA attention
lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

fsdp_config:
  fsdp_version: 2
  offload_params: false
  cpu_ram_efficient_loading: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Glm4MoeLiteDecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true


================================================
FILE: examples/glm47-flash/qlora.yaml
================================================
base_model: zai-org/GLM-4.7-Flash

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_4bit: true
quantize_moe_experts: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/glm4.7-flash-qlora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0
lora_target_modules:
  - q_proj
  - v_proj
  - k_proj
  - o_proj

# Uncomment to also target MoE expert weights:
# lora_target_parameters:
#   - mlp.experts.gate_up_proj
#   - mlp.experts.down_proj

# LoRA kernels incompatible with DSA attention
lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1


================================================
FILE: examples/glm47-flash/qlora_fsdp.yaml
================================================
base_model: zai-org/GLM-4.7-Flash

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_4bit: true
quantize_moe_experts: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/glm4.7-flash-qlora-fsdp-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0
lora_target_modules:
  - q_proj
  - v_proj
  - k_proj
  - o_proj

# Uncomment to also target MoE expert weights:
# lora_target_parameters:
#   - mlp.experts.gate_up_proj
#   - mlp.experts.down_proj

# LoRA kernels incompatible with DSA attention
lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

fsdp_config:
  fsdp_version: 2
  offload_params: false
  cpu_ram_efficient_loading: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Glm4MoeLiteDecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true


================================================
FILE: examples/gpt-oss/README.md
================================================
# Finetune OpenAI's GPT-OSS with Axolotl

[GPT-OSS](https://huggingface.co/collections/openai/gpt-oss-68911959590a1634ba11c7a4) are a family of open-weight MoE models trained by OpenAI, released in August 2025. There are two variants: 20B and 120B.

In October 2025, OpenAI released safeguard models built upon GPT-OSS called [GPT-OSS-Safeguard](https://huggingface.co/collections/openai/gpt-oss-safeguard). They use the same architecture, so the same examples below can be re-used.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

    Here is an example of how to install from pip:

```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
```

2. Choose one of the following configs below for training the 20B model. (for 120B, see [below](#training-120b))

```bash
# LoRA SFT linear layers (1x48GB @ ~44GiB)
axolotl train examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml

# FFT SFT with offloading (2x24GB @ ~21GiB/GPU)
axolotl train examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml

# FFT SFT (8x48GB @ ~36GiB/GPU or 4x80GB @ ~46GiB/GPU)
axolotl train examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml
```

Note: Memory usage taken from `device_mem_reserved(gib)` from logs.

### Training 120B

On 8xH100s, make sure you have ~3TB of free disk space. With each checkpoint clocking in at ~720GB, along with the base
model, and final model output, you may need at least 3TB of free disk space to keep at least 2 checkpoints.

```bash
# FFT SFT with offloading (8x80GB @ ~49GiB/GPU)
axolotl train examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml
```

To simplify fine-tuning across 2 nodes × 8x H100 (80GB) GPUs, we've partnered with [Baseten](https://baseten.co) to showcase multi-node
training of the 120B model using Baseten Truss. You can read more about this recipe on
[Baseten's blog](https://www.baseten.co/blog/how-to-fine-tune-gpt-oss-120b-with-baseten-and-axolotl/). The recipe can
be found on their
[GitHub](https://github.com/basetenlabs/ml-cookbook/tree/main/examples/oss-gpt-120b-axolotl/training).

ERRATA: Transformers saves the model Architecture prefixed with `FSDP` which needs to be manually renamed in `config.json`.
See https://github.com/huggingface/transformers/pull/40207 for the status of this issue.

```bash
sed -i 's/FSDPGptOssForCausalLM/GptOssForCausalLM/g' ./outputs/gpt-oss-out/config.json
```

When using SHARDED_STATE_DICT with FSDP, the final checkpoint should automatically merge the sharded weights to your
configured `output_dir`. However, if that step fails due to a disk space error, you can take an additional step to
merge the sharded weights.  This step will automatically determine the last checkpoint directory and merge the sharded
weights to `{output_dir}/merged`.

```bash
axolotl merge-sharded-fsdp-weights examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml
mv ./outputs/gpt-oss-out/merged/* ./outputs/gpt-oss-out/
```

### How to set reasoning_effort in template?

The harmony template has a feature to set the `reasoning_effort` during prompt building. The default is `medium`. If you would like to adjust this, you can add the following to your config:

```yaml
chat_template_kwargs:
  reasoning_effort: "high"  # low | medium | high
```

Currently, this applies globally. There is no method to apply per sample yet. If you are interested in adding this, please feel free to create an Issue to discuss.

### Inferencing your fine-tuned model

#### vLLM

GPT-OSS support in vLLM does not exist in a stable release yet. See https://x.com/MaziyarPanahi/status/1955741905515323425
for more information about using a special vllm-openai docker image for inferencing with vLLM.

Optionally, vLLM can be installed from nightly:

```bash
pip install --no-build-isolation --pre -U vllm --extra-index-url https://wheels.vllm.ai/nightly
```
and the vLLM server can be started with the following command (modify `--tensor-parallel-size 8` to match your environment):
```bash
vllm serve ./outputs/gpt-oss-out/ --served-model-name axolotl/gpt-oss-20b --host 0.0.0.0 --port 8888  --tensor-parallel-size 8
```

#### SGLang

SGLang has 0-day support in main, see https://github.com/sgl-project/sglang/issues/8833 for infomation on installing
SGLang from source. Once you've installed SGLang, run the following command to launch a SGLang server:

```bash
python3 -m sglang.launch_server --model ./outputs/gpt-oss-out/ --served-model-name axolotl/gpt-oss-120b --host 0.0.0.0 --port 8888 --tp 8
```

### Tool use

GPT-OSS has a comprehensive tool understanding. Axolotl supports tool calling datasets for Supervised Fine-tuning.

Here is an example dataset config:
```yaml
datasets:
  - path: Nanobit/text-tools-2k-test
    type: chat_template
```

See [Nanobit/text-tools-2k-test](https://huggingface.co/datasets/Nanobit/text-tools-2k-test) for the sample dataset.

Refer to [our docs](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#using-tool-use) for more info.

### Thinking and chat_template masking conflict

OpenAI’s Harmony template hides `thinking` in all non-final turns, which conflicts with Axolotl’s `chat_template` masking.

If your dataset has `thinking` content mid-turn, there are two paths we recommend:

- Train only on the last turn. This can be accomplished via chat_template's [train on last doc](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#training-on-last-message).

- Adjust your dataset to only have `thinking` content in the last turn.

### TIPS

- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

## Optimization Guides

- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)

## Related Resources

- [GPT-OSS Blog](https://openai.com/index/introducing-gpt-oss/)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/gpt-oss/gpt-oss-120b-fft-fsdp2-offload.yaml
================================================
# the original mxfp4 quantized model is not supported with FSDP cpu_ram_efficient_loading
# FSDP cpu_ram_efficient_loading is used to reduce the initial CPU memory usage when loading the model
base_model: axolotl-ai-co/gpt-oss-120b-dequantized

use_kernels: false

dp_shard_size: 16  # requires 2x8xH100 nodes

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

experimental_skip_move_to_device: true  # prevent OOM by NOT putting model to GPU before sharding

datasets:
  - path: HuggingFaceH4/Multilingual-Thinking
    type: chat_template
    field_thinking: thinking
    template_thinking_key: thinking

dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/gpt-oss-out/
save_total_limit: 2  # the 120B model can use up to 720GB of disk space per checkpoint, so let's only keep the last 2

sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

trackio_project_name:
trackio_run_name:
trackio_space_id:

gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1

optimizer: adamw_torch_fused  # 8bit optimizers do not work with FSDP2 offload
lr_scheduler: constant_with_warmup
learning_rate: 2e-5

bf16: true
tf32: true

flash_attention: true
attn_implementation: kernels-community/vllm-flash-attn3  # this is not needed if using flash_attn >= 2.8.3

gradient_checkpointing: true
activation_offloading: true

logging_steps: 1
saves_per_epoch: 1

warmup_ratio: 0.03

special_tokens:
eot_tokens:
  - "<|end|>"

fsdp_version: 2
fsdp_config:
  offload_params: true
  state_dict_type: SHARDED_STATE_DICT
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: GptOssDecoderLayer
  reshard_after_forward: true
  cpu_ram_efficient_loading: true


================================================
FILE: examples/gpt-oss/gpt-oss-20b-fft-deepspeed-zero3.yaml
================================================
base_model: openai/gpt-oss-20b
use_kernels: false
model_quantization_config: Mxfp4Config
model_quantization_config_kwargs:
  dequantize: true

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

experimental_skip_move_to_device: true  # prevent OOM by NOT putting model to GPU before sharding

datasets:
  - path: HuggingFaceH4/Multilingual-Thinking
    type: chat_template
    field_thinking: thinking
    template_thinking_key: thinking

dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/gpt-oss-out/

sequence_len: 4096
sample_packing: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

trackio_project_name:
trackio_run_name:
trackio_space_id:

gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1

optimizer: adamw_torch_8bit
lr_scheduler: constant_with_warmup
learning_rate: 2e-5

bf16: true
tf32: true

flash_attention: true
attn_implementation: kernels-community/vllm-flash-attn3  # this is not needed if using flash_attn >= 2.8.3

gradient_checkpointing: true
activation_offloading: true

logging_steps: 1
saves_per_epoch: 1

warmup_ratio: 0.03

special_tokens:
eot_tokens:
  - "<|end|>"

# choose the zero3 configuration that best fits your system capabilities
deepspeed: deepspeed_configs/zero3_bf16.json


================================================
FILE: examples/gpt-oss/gpt-oss-20b-fft-fsdp2-offload.yaml
================================================
base_model: openai/gpt-oss-20b
use_kernels: true
model_quantization_config: Mxfp4Config
model_quantization_config_kwargs:
  dequantize: true

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

experimental_skip_move_to_device: true  # prevent OOM by NOT putting model to GPU before sharding

datasets:
  - path: HuggingFaceH4/Multilingual-Thinking
    type: chat_template
    field_thinking: thinking
    template_thinking_key: thinking

dataset_prepared_path: ./outputs/last_run_prepared
val_set_size: 0
output_dir: ./outputs/gpt-oss-out/

sequence_len: 4096
sample_packing: true
pad_to_sequence_len: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

trackio_project_name:
trackio_run_name:
trackio_space_id:

gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1

optimizer: adamw_torch_fused  # 8bit optimizers do not work with FSDP2 offload
lr_scheduler: constant_with_warmup
learning_rate: 2e-5

bf16: true
tf32: true

flash_attention: true
attn_implementation: kernels-community/vllm-flash-attn3  # this is not needed if using flash_attn >= 2.8.3

gradient_checkpointing: true
activation_offloading: true

logging_steps: 1
saves_per_epoch: 1

warmup_ratio: 0.03

special_tokens:
eot_tokens:
  - "<|end|>"

fsdp_version: 2
fsdp_config:
  offload_params: true
  state_dict_type: SHARDED_STATE_DICT
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: GptOssDecoderLayer
  reshard_after_forward: true
  #  cpu_ram_efficient_loading: true

# cpu_ram_efficient_loading cannot be used with MXFP4 model quantization.
# It can only be used with a dequantized model like `axolotl-ai-co/gpt-oss-120b-dequantized`


================================================
FILE: examples/gpt-oss/gpt-oss-20b-fft-fsdp2.yaml
================================================
base_model: openai/gpt-oss-20b
use_kernels: false
model_quantization_config: Mxfp4Config
model_quantization_config_kwargs:
  dequantize: true

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

experimental_skip_move_to_device: true  # prevent OOM by NOT putting model to GPU before sharding

datasets:
  - path: HuggingFaceH4/Multilingual-Thinking
    type: chat_template
    field_thinking: thinking
    template_thinking_key: thinking

dataset_prepared_path: ./outputs/last_run_prepared
val_set_size: 0
output_dir: ./outputs/gpt-oss-out/

sequence_len: 4096
sample_packing: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

trackio_project_name:
trackio_run_name:
trackio_space_id:

gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1

optimizer: adamw_torch_8bit
lr_scheduler: constant_with_warmup
learning_rate: 2e-5

bf16: true
tf32: true

flash_attention: true
attn_implementation: kernels-community/vllm-flash-attn3  # this is not needed if using flash_attn >= 2.8.3

gradient_checkpointing: true
activation_offloading: true

logging_steps: 1
saves_per_epoch: 1

warmup_ratio: 0.03

special_tokens:
eot_tokens:
  - "<|end|>"

fsdp_version: 2
fsdp_config:
  offload_params: false
  state_dict_type: SHARDED_STATE_DICT
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: GptOssDecoderLayer
  reshard_after_forward: true
#  cpu_ram_efficient_loading: true


================================================
FILE: examples/gpt-oss/gpt-oss-20b-sft-lora-singlegpu.yaml
================================================
base_model: openai/gpt-oss-20b
use_kernels: true
model_quantization_config: Mxfp4Config
model_quantization_config_kwargs:
  dequantize: true

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

experimental_skip_move_to_device: true  # prevent OOM by not putting model to GPU before sharding

datasets:
  - path: HuggingFaceH4/Multilingual-Thinking
    type: chat_template
    field_thinking: thinking
    template_thinking_key: thinking

dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/gpt-oss-out/

sequence_len: 4096
sample_packing: true

adapter: lora
lora_r: 8
lora_alpha: 16
lora_dropout: 0.0  # dropout not supported when using LoRA over expert parameters
lora_target_linear: true

# TODO: not supported for now, see peft#2710
#lora_target_parameters:  # target the experts in the last two layers
#  - "22._checkpoint_wrapped_module.mlp.experts.gate_up_proj"
#  - "22._checkpoint_wrapped_module.mlp.experts.down_proj"
#  - "23._checkpoint_wrapped_module.mlp.experts.gate_up_proj"
#  - "23._checkpoint_wrapped_module.mlp.experts.down_proj"

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

trackio_project_name:
trackio_run_name:
trackio_space_id:

gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 1

optimizer: adamw_torch_8bit
lr_scheduler: constant_with_warmup
learning_rate: 2e-4

bf16: true
tf32: true

flash_attention: true
attn_implementation: kernels-community/vllm-flash-attn3  # this is not needed if using flash_attn >= 2.8.3

gradient_checkpointing: true
activation_offloading: true

logging_steps: 1
saves_per_epoch: 1
warmup_ratio: 0.1

special_tokens:
eot_tokens:
  - "<|end|>"


================================================
FILE: examples/gpt-oss/gpt-oss-safeguard-20b-sft-lora-singlegpu.yaml
================================================
base_model: openai/gpt-oss-safeguard-20b
use_kernels: true
model_quantization_config: Mxfp4Config
model_quantization_config_kwargs:
  dequantize: true

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

experimental_skip_move_to_device: true  # prevent OOM by not putting model to GPU before sharding

datasets:
  - path: HuggingFaceH4/Multilingual-Thinking
    type: chat_template
    field_thinking: thinking
    template_thinking_key: thinking

dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/gpt-oss-safeguard-out/

sequence_len: 4096
sample_packing: true

adapter: lora
lora_r: 8
lora_alpha: 16
lora_dropout: 0.0  # dropout not supported when using LoRA over expert parameters
lora_target_linear: true

# TODO: not supported for now, see peft#2710
#lora_target_parameters:  # target the experts in the last two layers
#  - "22._checkpoint_wrapped_module.mlp.experts.gate_up_proj"
#  - "22._checkpoint_wrapped_module.mlp.experts.down_proj"
#  - "23._checkpoint_wrapped_module.mlp.experts.gate_up_proj"
#  - "23._checkpoint_wrapped_module.mlp.experts.down_proj"

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

trackio_project_name:
trackio_run_name:
trackio_space_id:

gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 1

optimizer: adamw_torch_8bit
lr_scheduler: constant_with_warmup
learning_rate: 2e-4

bf16: true
tf32: true

flash_attention: true
attn_implementation: kernels-community/vllm-flash-attn3  # this is not needed if using flash_attn >= 2.8.3

gradient_checkpointing: true
activation_offloading: true

logging_steps: 1
saves_per_epoch: 1
warmup_ratio: 0.1

special_tokens:
eot_tokens:
  - "<|end|>"


================================================
FILE: examples/granite4/README.md
================================================
# Finetune IBM's Granite 4.0 with Axolotl

[Granite 4.0](https://huggingface.co/collections/ibm-granite/granite-40-language-models) are a family of open source models trained by IBM Research.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as Granite4 is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).

    Here is an example of how to install from main for pip:

```bash
# Ensure you have Pytorch installed (Pytorch 2.7.1 min)
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl

pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn]'

# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
python scripts/cutcrossentropy_install.py | sh
```

2. Run the finetuning example:

```bash
axolotl train examples/granite4/granite-4.0-tiny-fft.yaml
```

This config uses about 40.8GiB VRAM.

Let us know how it goes. Happy finetuning! 🚀

### TIPS

- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

### Limitation

Adapter finetuning does not work at the moment. It would error with

```bash
RuntimeError: mat1 and mat2 shapes cannot be multiplied (4096x3072 and 1x1179648)
```

In addition, if adapter training works, `lora_target_linear: true` will not work due to:
```bash
ValueError: Target module GraniteMoeHybridParallelExperts() is not supported.
```

## Optimization Guides

- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)

## Related Resources

- [Granite Docs](https://www.ibm.com/granite/docs/models/granite)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/granite4/granite-4.0-tiny-fft.yaml
================================================
base_model: ibm-granite/granite-4.0-tiny-preview

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/model-out

sequence_len: 2048
sample_packing: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/hunyuan/README.md
================================================
# Finetune HunYuan with Axolotl

Tencent released a family of opensource models called HunYuan with varying parameter scales of 0.5B, 1.8B, 4B, and 7B scale for both Pre-trained and Instruct variants. The models can be found at [HuggingFace](https://huggingface.co/collections/tencent/hunyuan-dense-model-6890632cda26b19119c9c5e7). This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html). You need to install from main as HunYuan is only on nightly or use our latest [Docker images](https://docs.axolotl.ai/docs/docker.html).

    Here is an example of how to install from main for pip:

```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl

pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation -e '.[flash-attn]'

# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
python scripts/cutcrossentropy_install.py | sh
```

2. Run the finetuning example:

```bash
axolotl train examples/hunyuan/hunyuan-v1-dense-qlora.yaml
```

This config uses about 4.7 GB VRAM.

Let us know how it goes. Happy finetuning! 🚀

### Dataset

HunYuan Instruct models can choose to enter a slow think or fast think pattern. For best performance on fine-tuning their Instruct models, your dataset should be adjusted to match their pattern.

```python
# fast think pattern
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "/no_think What color is the sun?" },
    {"role": "assistant", "content": "<think>\n\n</think>\n<answer>\nThe sun is yellow.\n</answer>"}
]

# slow think pattern
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "/no_think What color is the sun?" },
    {"role": "assistant", "content": "<think>\nThe user is asking about the color of the sun. I need to ...\n</think>\n<answer>\nThe sun is yellow.\n</answer>"}
]
```

### TIPS

- For inference, the official Tencent team recommends

```json

{
  "do_sample": true,
  "top_k": 20,
  "top_p": 0.8,
  "repetition_penalty": 1.05,
  "temperature": 0.7
}

```

- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

## Optimization Guides

- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)

## Related Resources

- [Tencent HunYuan Blog](https://hunyuan.tencent.com/)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/hunyuan/hunyuan-v1-dense-qlora.yaml
================================================
base_model: tencent/Hunyuan-0.5B-Instruct

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/internvl3_5/README.md
================================================
# Finetune OpenGV's InternVL with Axolotl

[InternVL 3.5](https://huggingface.co/OpenGVLab/InternVL3_5-8B-HF) is a family of powerful vision-language models supporting dynamic resolution and multi-image understanding by OpenGV. It features a ViT-style vision encoder and strong language model backbone for tasks like visual question answering, OCR, and scene text understanding.

This guide shows how to fine-tune it with Axolotl.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install `timm` for vision model support:

    ```bash
    pip install timm==1.0.19
    ```

3. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

4. Run the finetuning example:

    ```bash
    axolotl train examples/internvl3_5/internvl3_5-8b-qlora.yml
    ```

This config uses about 8.21 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀

### Tips

- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the multi-modal format as seen [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [InternVL Paper](https://huggingface.co/papers/2508.18265)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/internvl3_5/internvl3_5-8b-qlora.yml
================================================
base_model: OpenGVLab/InternVL3_5-8B-HF
processor_type: AutoProcessor

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_4bit: true

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]
    field_messages: messages

dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

adapter: qlora
lora_model_dir:

sequence_len: 2048

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
logging_steps: 1
flash_attention: true
eager_attention:

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/jamba/README.md
================================================
# Jamba

- ✅ qlora w/ deepspeed Zero-2 needs at least 2x GPUs and
  - 35GiB VRAM per GPU w minimal context length
  - 56GiB VRAM per GPU (w multipack enabled)
- ✅ qlora w/ deepspeed Zero-3 needs at least 2x GPUs and 67GiB VRAM (wtf?)
- ✅ qlora single-gpu, ~51GiB VRAM
- ✅ multipack
- ✅ FSDP
- ❓ 8-bit LoRA


================================================
FILE: examples/jamba/qlora.yaml
================================================
base_model: ai21labs/Jamba-v0.1
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 4096
sample_packing: false
pad_to_sequence_len: false
eval_sample_packing: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

adapter: qlora
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

low_cpu_mem_usage: true
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 2
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.00001

bf16: auto
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/jamba/qlora_deepspeed.yaml
================================================
base_model: ai21labs/Jamba-v0.1
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name
trust_remote_code: true

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 4096
sample_packing: false
pad_to_sequence_len: false
eval_sample_packing: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

adapter: qlora
lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

low_cpu_mem_usage: true
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 2
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.00001

bf16: auto
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1

deepspeed: deepspeed_configs/zero2.json
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/jamba/qlora_fsdp_large.yaml
================================================
base_model: ai21labs/AI21-Jamba-1.5-Large
# optionally might have model_type or tokenizer_type
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_4bit: true
use_tensorboard: true
chat_template: jamba
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    drop_system_message: true
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: jamba-large-fsdp-qlora-ft
adapter: qlora
sequence_len: 2048
sample_packing: true


lora_r: 16
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: [down_proj,gate_proj,in_proj,k_proj,o_proj,out_proj,q_proj,up_proj,v_proj,x_proj]
lora_target_linear: false

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 2
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.00001

bf16: true
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: false
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: JambaAttentionDecoderLayer,JambaMambaDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/kimi-linear/README.md
================================================
# Finetune MoonshotAI's Kimi Linear with Axolotl

[Kimi Linear](https://huggingface.co/collections/moonshotai/kimi-linear-a3b) is a MoE model (48B total, 3B active) by MoonshotAI using a hybrid linear attention architecture to achieve a 1M token context length. It uses Kimi Delta Attention (KDA), a refined version of Gated DeltaNet that reduces KV cache size by up to 75% and boosts decoding throughput by up to 6x for long contexts.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

**Note:** Axolotl uses experimental training code for Kimi Linear as their original modeling code is inference-only.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install CCE via [docs](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy)

3. Run the finetuning example:

    ```bash
    axolotl train examples/kimi-linear/kimi-48b-lora.yaml
    ```

This config uses about 98.7GiB VRAM.

Let us know how it goes. Happy finetuning!

### TIPS

- Kimi Linear requires `trust_remote_code: true`.
- You can run a full finetuning by removing the `adapter: lora` and `load_in_8bit: true`.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html)
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template)

## Optimization Guides

See 👉 [docs](https://docs.axolotl.ai/docs/optimizations.html).

## Limitations

This is not yet compatible with MoE kernels from transformers v5.

## Related Resources

- [Kimi Linear Paper](https://huggingface.co/papers/2510.26692)
- [Kimi Linear GitHub](https://github.com/MoonshotAI/Kimi-Linear)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/kimi-linear/kimi-48b-lora.yaml
================================================
base_model: moonshotai/Kimi-Linear-48B-A3B-Instruct

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: true
load_in_4bit: false
strict: false

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template
    split: train

dataset_prepared_path: last_run_prepared
val_set_size: 0.2
output_dir: ./outputs/lora-out

adapter: lora
lora_model_dir:

sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true

lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_fan_in_fan_out:
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
early_stopping_patience:
resume_from_checkpoint:
local_rank:
logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 2
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:


================================================
FILE: examples/llama-2/README.md
================================================
# Overview

This is an example of a llama-2 configuration for 7b and 13b. The yaml file contains configuration for the 7b variant, but you can just aswell use the same settings for 13b.

The 7b variant fits on any 24GB VRAM GPU and will take up about 17 GB of VRAM during training if using qlora and 20 GB if using lora. On a RTX 4090 it trains 3 epochs of the default dataset in about 15 minutes.

The 13b variant will fit if you change these settings to these values:
gradient_accumulation_steps: 2
micro_batch_size: 1

```shell
accelerate launch -m axolotl.cli.train examples/llama-2/qlora.yml
```
or

```shell
accelerate launch -m axolotl.cli.train examples/llama-2/lora.yml
```

To launch a full finetuning with 16-bit precision:

```shell
accelerate launch -m axolotl.cli.train examples/llama-2/fft_optimized.yml
```


================================================
FILE: examples/llama-2/fft_optimized.yml
================================================
base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out

sequence_len: 4096
sample_packing: true


adapter:
lora_model_dir:
lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true
flash_attn_fuse_mlp: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1

deepspeed: #deepspeed_configs/zero2.json # multi-gpu only
weight_decay: 0.1
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-2/gptq-lora.yml
================================================
base_model: TheBloke/Llama-2-7B-GPTQ
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

gptq: true
gptq_disable_exllama: true

tokenizer_use_fast: true
tokenizer_legacy: true
push_dataset_to_hub:
hf_use_auth_token: true
datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
adapter: lora
lora_model_dir:
sequence_len: 4096
sample_packing:
lora_r: 8
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
  - k_proj
  - o_proj
  - q_proj
  - v_proj
lora_target_linear:
wandb_project:
wandb_watch:
wandb_name:
wandb_log_model:
output_dir: ./outputs/model-out
gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_torch_fused
adam_beta2: 0.95
adam_eps: 0.00001
max_grad_norm: 1.0
torchdistx_path:
lr_scheduler: cosine
lr_quadratic_warmup: true
learning_rate: 0.000017
bf16: false
fp16: false
float16: true
tf32: true
gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention:
sdp_attention:
flash_optimum:
warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-2/lisa.yml
================================================
base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/lisa-out

sequence_len: 4096
sample_packing: true


adapter:
lora_model_dir:
lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:

lisa_n_layers: 4
lisa_step_interval: 20
lisa_layers_attribute: model.layers

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 5e-5 # recommendation from lisa paper for 7b

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
flash_attn_cross_entropy: false
flash_attn_rms_norm: true
flash_attn_fuse_mlp: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-2/loftq.yml
================================================
base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true


adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
peft:
  loftq_config:
    loftq_bits: 4

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-2/lora.yml
================================================
base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true


adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-2/qlora-fsdp.yml
================================================
base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: yahma/alpaca-cleaned
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/qlora-out

adapter: qlora
lora_model_dir:

sequence_len: 512
sample_packing: false


lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 4
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.00001

bf16: auto
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  # fsdp_cpu_offload_pin_memory: false  # uncomment to enable swap memory usage when RAM is insufficient
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-2/qlora.yml
================================================
base_model: NousResearch/Llama-2-7b-hf
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/qlora-out

adapter: qlora
lora_model_dir:

sequence_len: 4096
sample_packing: true


lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: paged_adamw_32bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-2/relora.yml
================================================
base_model: NousResearch/Llama-2-7b-hf
model_type: LlamaForCausalLM
tokenizer_type: LlamaTokenizer


load_in_8bit: false
load_in_4bit: true

datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/relora-out

adapter: qlora
lora_model_dir:

sequence_len: 4096
sample_packing: true


lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

relora: true
relora_prune_ratio: 0.9
relora_cpu_offload: false
jagged_restart_steps: 150
jagged_restart_warmup_steps: 10
jagged_restart_anneal_steps: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  bos_token: "<s>"
  eos_token: "</s>"
  unk_token: "<unk>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/3b-fp8-fsdp2.yaml
================================================
base_model: meta-llama/Llama-3.2-3B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true

datasets:
  - path: yahma/alpaca-cleaned
    type: alpaca

output_dir: ./outputs/fp8_out/

sample_packing: true
pad_to_sequence_len: true
sequence_len: 512

flex_attention: true
flex_attn_compile_kwargs:
  dynamic: false
  mode: max-autotune-no-cudagraphs
torch_compile: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 16
num_epochs: 1
optimizer: adamw_torch_fused

cosine_constant_lr_ratio: 0
cosine_min_lr_ratio: 1.0
learning_rate: 2e-5
save_only_model: true

fp8: true
fp8_enable_fsdp_float8_all_gather: true

resume_from_checkpoint:
logging_steps: 1

evals_per_epoch: 1
saves_per_epoch: 1

warmup_steps: 10
weight_decay: 0.0

fsdp_version: 2
fsdp_config:
  offload_params: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: LlamaDecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: false

special_tokens:
  pad_token: <|end_of_text|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/3b-qat-fsdp2.yaml
================================================
base_model: meta-llama/Llama-3.2-3B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true


datasets:
  - path: yahma/alpaca-cleaned
    type: alpaca
    split: train[:95%]

output_dir: ./outputs/qat_out/
dataset_prepared_path: ./outputs/qat_out/dataset_prepared

sample_packing: false
sequence_len: 8192
flash_attention: true

qat:
  activation_dtype: int8
  weight_dtype: int4
  group_size: 32

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 16
num_epochs: 1
optimizer: adamw_torch_fused

cosine_constant_lr_ratio: 0
cosine_min_lr_ratio: 1.0
learning_rate: 2e-5
save_only_model: true
bf16: true

resume_from_checkpoint:
logging_steps: 1

evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap

fsdp_config:
  fsdp_version: 2
  fsdp_offload_params: false
  fsdp_cpu_ram_efficient_loading: false
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_reshard_after_forward: true
  fsdp_activation_checkpointing: true

special_tokens:
  pad_token: <|finetune_right_pad_id|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/3b-qat-mxfp4.yaml
================================================
base_model: meta-llama/Llama-3.2-3B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true

datasets:
  - path: yahma/alpaca-cleaned
    type: alpaca
    split: train[:95%]

output_dir: ./outputs/qat_out/
dataset_prepared_path: ./outputs/dataset_prepared

sequence_len: 2048
flash_attention: true

qat:
  activation_dtype: mxfp4
  weight_dtype: mxfp4
  group_size: 32

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_checkpointing: true
activation_offloading: true
gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_8bit

cosine_constant_lr_ratio: 0
cosine_min_lr_ratio: 1.0
learning_rate: 2e-5
save_only_model: true
bf16: true

resume_from_checkpoint:
logging_steps: 1

evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0

special_tokens:
  pad_token: <|finetune_right_pad_id|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/3b-qat-nvfp4.yaml
================================================
base_model: meta-llama/Llama-3.2-3B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true

datasets:
  - path: yahma/alpaca-cleaned
    type: alpaca
    split: train[:95%]

output_dir: ./outputs/qat_out/
dataset_prepared_path: ./outputs/dataset_prepared

sequence_len: 8192
flash_attention: true

qat:
  activation_dtype: nvfp4
  weight_dtype: nvfp4
  group_size: 16 # only group_size of 16 is supported with nvfp4

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_checkpointing: true
gradient_accumulation_steps: 1
micro_batch_size: 64
num_epochs: 1
optimizer: adamw_torch_fused

cosine_constant_lr_ratio: 0
cosine_min_lr_ratio: 1.0
learning_rate: 2e-5
save_only_model: true
bf16: true

resume_from_checkpoint:
logging_steps: 1

evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0

special_tokens:
  pad_token: <|finetune_right_pad_id|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/README.md
================================================
# Llama-3

https://llama.meta.com/llama3/

[8B Base Model](https://huggingface.co/meta-llama/Meta-Llama-3-8B)
 - [Full Fine Tune](./fft-8b.yaml)
   - Single GPU @ 48GB VRAM
 - [LoRA](./lora-8b.yml)
   - Single GPU @ 11GB VRAM

[70B Base Model](https://huggingface.co/meta-llama/Meta-Llama-3-70B)
 - [QLORA+FSDP](./qlora-fsdp-70b.yaml)
   - Dual GPU @ 21GB VRAM


================================================
FILE: examples/llama-3/diffusion/pretrain-1b.yaml
================================================
base_model: meta-llama/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

pretraining_dataset:
  - path: wikitext
    name: wikitext-103-raw-v1
    type: completion
    field: text

plugins:
  - axolotl.integrations.diffusion.DiffusionPlugin

diffusion:
  noise_schedule: cosine
  min_mask_ratio: 0.15
  max_mask_ratio: 0.85
  num_diffusion_steps: 128
  eps: 5e-4
  importance_weighting: true
  mask_token_id: 128002
  generate_samples: true
  generation_interval: 250

output_dir: ./outputs/model-out

sequence_len: 512
sample_packing: true

gradient_accumulation_steps: 8
micro_batch_size: 4
max_steps: 10000
warmup_ratio: 0.1

optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 3e-4
sdp_attention: true

bf16: auto
tf32: true

logging_steps: 1
save_strategy: steps
save_steps: 1000

special_tokens:
  pad_token: "<|end_of_text|>"

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/diffusion/sft-1b.yaml
================================================
base_model: meta-llama/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
val_set_size: 0.05

plugins:
  - axolotl.integrations.diffusion.DiffusionPlugin

diffusion:
  noise_schedule: cosine
  min_mask_ratio: 0.1
  max_mask_ratio: 0.9
  num_diffusion_steps: 128
  eps: 1e-3
  importance_weighting: true
  mask_token_id: 128002
  generate_samples: true
  generation_interval: 250

output_dir: ./outputs/model-out

sequence_len: 512
sample_packing: true
eval_sample_packing: true

gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 1
warmup_steps: 0.1

optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 1e-5

bf16: auto
tf32: true

gradient_checkpointing: true
resume_from_checkpoint:
sdp_attention: true

logging_steps: 1
save_strategy: best
eval_strategy: epoch

special_tokens:
  pad_token: "<|end_of_text|>"

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/fft-8b-liger-fsdp.yaml
================================================
base_model: NousResearch/Meta-Llama-3.1-8B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true


chat_template: llama3
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: last_run_prepared
val_set_size: 0.02
output_dir: ./outputs/out

sequence_len: 4096
sample_packing: true


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5

bf16: auto
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 2
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_backward_prefetch: BACKWARD_PRE
special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot_id|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/fft-8b.yaml
================================================
base_model: NousResearch/Meta-Llama-3.1-8B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out

sequence_len: 8192
sample_packing: true


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 1
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 2e-5

bf16: auto
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 2
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  pad_token: <|end_of_text|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/instruct-dpo-lora-8b.yml
================================================
base_model: meta-llama/Meta-Llama-3-8B-Instruct
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot_id|>

load_in_8bit: true
load_in_4bit: false

chat_template: llama3
rl: dpo
datasets:
  - path: fozziethebeat/alpaca_messages_2k_dpo_test
    type: chat_template.default
    field_messages: conversation
    field_chosen: chosen
    field_rejected: rejected
    message_property_mappings:
      role: role
      content: content
    roles:
      system:
        - system
      user:
        - user
      assistant:
        - assistant

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: false


adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/instruct-lora-8b.yml
================================================
base_model: NousResearch/Meta-Llama-3-8B-Instruct
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

chat_template: llama3
datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: false


adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
   pad_token: <|end_of_text|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/lora-1b-deduplicate-dpo.yml
================================================
base_model: meta-llama/Llama-3.2-1B
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

chat_template: llama3
rl: dpo
datasets:
  - path: fozziethebeat/alpaca_messages_2k_dpo_test
    type: chat_template.default
    field_messages: conversation
    field_chosen: chosen
    field_rejected: rejected
    message_property_mappings:
      role: role
      content: content
    roles:
      system:
        - system
      user:
        - user
      assistant:
        - assistant
  - path: fozziethebeat/alpaca_messages_2k_dpo_test
    type: chat_template.default
    field_messages: conversation
    field_chosen: chosen
    field_rejected: rejected
    message_property_mappings:
      role: role
      content: content
    roles:
      system:
        - system
      user:
        - user
      assistant:
        - assistant

dataset_exact_deduplication: true
dataset_prepared_path:
val_set_size: 0
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: false


adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/lora-1b-deduplicate-sft.yml
================================================
base_model: meta-llama/Llama-3.2-1B
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/lora-out

dataset_exact_deduplication: true

sequence_len: 4096
sample_packing: true
eval_sample_packing: false


adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_modules_to_save:
  - embed_tokens
  - lm_head

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
   pad_token: <|end_of_text|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/lora-1b-kernels.yml
================================================
base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: lora
lora_model_dir:

sequence_len: 2048
sample_packing: true


lora_r: 16
lora_alpha: 32
# Currently, we don't support dropout with our custom Triton kernels
# lora_dropout: 0.05
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

# These options enable our custom Triton kernels / autograd
# functions for MLP and attention calculations
lora_mlp_kernel: true
lora_qkv_kernel: true
lora_o_kernel: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  pad_token: "<|end_of_text|>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/lora-1b-ray.yml
================================================
base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: lora
lora_model_dir:

sequence_len: 2048
sample_packing: true
eval_sample_packing: true


lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1

deepspeed: deepspeed_configs/zero3.json
weight_decay: 0.0
special_tokens:
  pad_token: "<|end_of_text|>"

use_ray: true
ray_num_workers: 4

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/lora-1b-sample-packing-sequentially.yml
================================================
base_model: meta-llama/Llama-3.2-1B
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/lora-out

test_value: true

sequence_len: 4096
sample_packing: true
sample_packing_sequentially: true
curriculum_sampling: true
eval_sample_packing: false


adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_modules_to_save:
  - embed_tokens
  - lm_head

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  pad_token: <|end_of_text|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/lora-1b.yml
================================================
base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca

val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: lora
lora_model_dir:

sequence_len: 2048
sample_packing: true
eval_sample_packing: true


lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1

optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  pad_token: "<|end_of_text|>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/lora-8b.yml
================================================
base_model: NousResearch/Meta-Llama-3-8B
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: true
eval_sample_packing: false


adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_modules_to_save:
  - embed_tokens
  - lm_head

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
   pad_token: <|end_of_text|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/opentelemetry-qlora.yml
================================================
base_model: NousResearch/Llama-3.2-1B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

load_in_4bit: true

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca

output_dir: ./outputs/opentelemetry-example

adapter: qlora
sequence_len: 512
sample_packing: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

# OpenTelemetry Configuration
use_otel_metrics: true
otel_metrics_host: "localhost"
otel_metrics_port: 8000

# Disable WandB
use_wandb: false

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: paged_adamw_32bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
logging_steps: 1
flash_attention: false

warmup_ratio: 0.1
evals_per_epoch: 2
saves_per_epoch: 1
weight_decay: 0.0

special_tokens:
  pad_token: "<|end_of_text|>"


================================================
FILE: examples/llama-3/qlora-1b-gdpo.yaml
================================================
base_model: meta-llama/Llama-3.2-1B-Instruct

chat_template: llama3

rl: gdpo

trl:
  beta: 0.001
  max_completion_length: 128
  num_generations: 2
  temperature: 0.7
  top_p: 0.95

  use_vllm: false


  multi_objective_aggregation: normalize_then_sum

  reward_funcs:
    - rwd.format_reward
    - rwd.correctness_reward
  reward_weights: [1.0, 2.0]

  log_completions: true
  num_completions_to_print: 3
  scale_rewards: true

datasets:
  - path: openai/gsm8k
    name: main
    split: train[:1000]
    type: rwd.gsm8k_transform

val_set_size: 0.0
output_dir: ./outputs/llama3-gdpo-out

sequence_len: 512
sample_packing: false
pad_to_sequence_len: false

gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 1
max_steps: 100

optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 5e-5
weight_decay: 0.01
warmup_steps: 10

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false

flash_attention: true
logging_steps: 1
save_steps: 50
save_safetensors: true

special_tokens:
  pad_token: "<|end_of_text|>"


seed: 42


================================================
FILE: examples/llama-3/qlora-1b-kto.yaml
================================================
base_model: meta-llama/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

rl: kto
rl_beta: 0.5
kto_desirable_weight: 0.2

datasets:
  - path: argilla/ultrafeedback-binarized-preferences-cleaned-kto
    type: llama3.ultra
    split: train
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/qlora-out

remove_unused_columns: false

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: false  # not supported with kto
eval_sample_packing: false
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 64
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  pad_token: "<|end_of_text|>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/qlora-1b.yml
================================================
base_model: NousResearch/Llama-3.2-1B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/qlora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true
eval_sample_packing: true


lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  pad_token: "<|end_of_text|>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/qlora-fsdp-405b.yaml
================================================
base_model: hugging-quants/Meta-Llama-3.1-405B-BNB-NF4-BF16
# optionally might have model_type or tokenizer_type
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_4bit: true

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out/qlora-llama3_1-405b

adapter: qlora

sequence_len: 2048
sample_packing: true


lora_r: 16
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 2
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.00001

bf16: true
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
special_tokens:
  pad_token: <|finetune_right_pad_id|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/qlora-fsdp-70b.yaml
================================================
base_model: casperhansen/llama-3-70b-fp16
# optionally might have model_type or tokenizer_type
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer  # PreTrainedTokenizerFast
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out/qlora-llama3-70b

adapter: qlora
lora_model_dir:

sequence_len: 512
sample_packing: false


lora_r: 8
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.00001

bf16: auto
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: LlamaDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
special_tokens:
  pad_token: <|end_of_text|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/qlora.yml
================================================
base_model: NousResearch/Meta-Llama-3-8B
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: aaditya/alpaca_subset_1
    type: alpaca
dataset_prepared_path:
val_set_size: 0
output_dir: ./outputs/qlora-out

adapter: qlora
lora_model_dir:

sequence_len: 4096
sample_packing: true


lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: paged_adamw_32bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  pad_token: "<|end_of_text|>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3/sparse-finetuning.yaml
================================================
base_model: neuralmagic/Sparse-Llama-3.1-8B-2of4

plugins:
  - axolotl.integrations.llm_compressor.LLMCompressorPlugin

load_in_8bit: false
load_in_4bit: false
strict: false

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out

sequence_len: 4096
sample_packing: true

eval_sample_packing: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 1
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 2e-5

train_on_inputs: false
group_by_length: false
bf16: auto
fp16:
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
early_stopping_patience:
resume_from_checkpoint:
logging_steps: 1
xformers_attention:
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 2
eval_table_size:
saves_per_epoch: 1
debug:
deepspeed:
weight_decay: 0.0
fsdp:
fsdp_config:
special_tokens:
  pad_token: <|end_of_text|>

llmcompressor:
  recipe:
    finetuning_stage:
      finetuning_modifiers:
        ConstantPruningModifier:
          targets: [
            're:.*q_proj.weight',
            're:.*k_proj.weight',
            're:.*v_proj.weight',
            're:.*o_proj.weight',
            're:.*gate_proj.weight',
            're:.*up_proj.weight',
            're:.*down_proj.weight',
          ]
          start: 0
  save_compressed: true

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-3-vision/lora-11b.yaml
================================================
base_model: alpindale/Llama-3.2-11B-Vision-Instruct
# optionally might have model_type or tokenizer_type or processor_type
processor_type: AutoProcessor
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name


# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

chat_template: llama3_2_vision
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/out

adapter: lora
lora_model_dir:

sequence_len: 8192
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
logging_steps: 1
# flash_attention: true  # use for text-only mode
sdp_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-4/README.md
================================================
# Llama 4 by Meta AI

## Flash Attention vs Flex Attention

While Flash Attention to support is "enabled" for Llama-4, the upstream implementation is not correct and usage of Flex Attention is recommended.

## Available Examples

### Llama 4 Scout 17Bx16Experts (109B)

Flex Attention
- [Text Single GPU (H100) QLoRA](./scout-qlora-single-h100-flex.yaml)
- [Text Multi GPU QLoRA w/ FSDP2](./scout-qlora-flexattn-fsdp2.yaml)

[//]: # (Flash Attention &#40;Do not use&#41;)

[//]: # (- [Multi-Modal/Vision QLoRA w/ FSDP1]&#40;./scout-vision-qlora-fsdp.yaml&#41;)

[//]: # (- [Text Single GPU &#40;H100&#41; QLoRA]&#40;./scout-qlora-single-h100.yaml&#41;)

[//]: # (- [Text Multi GPU QLoRA w/ FSDP1]&#40;./scout-qlora-fsdp1.yaml&#41;)

Our Single H100 implementation for Llama 4 Scout uses only 64.5GB VRAM for post-training with 4k context length @ 519 tokens/second. [WandB logs here](https://wandb.ai/axolotl-ai/llama4-flexattn-qlora/runs/wpie7dkj)
Multi-GPU (4xH100) for Llama 4 Scout uses 62.8GB VRAM/GPU @ 4k contenxt length @ 280tps/gpu, [WandB logs here](https://wandb.ai/axolotl-ai/llama4-flexattn-qlora/runs/2lkezdj8)

### Llama 4 Maverick 17Bx128Experts (400B)

Coming Soon

## Delinearized Llama 4 Models

We provide a script to delinearize Llama 4 linearized models into regular HuggingFace Llama 4 models.

```bash
axolotl delinearize-llama4 --model path/to/model_dir --output path/to/output_dir
```

Note: This only works with the non-quantized linearized model. If you have an adapter, merge it with the *non-quantized linearized* model before delinearizing.


================================================
FILE: examples/llama-4/do-no-use-fa2/maverick-qlora-fsdp1.yaml
================================================
base_model: axolotl-quants/Llama-4-Maverick-17B-128E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name


plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true

llama4_linearized_experts: true
load_in_4bit: true
adapter: qlora
lora_r: 32
lora_alpha: 64
lora_target_modules:
  - self_attn.q_proj
  - self_attn.k_proj
  - self_attn.v_proj
  - self_attn.o_proj
  - shared_expert.gate_proj
  - shared_expert.up_proj
  - shared_expert.down_proj
  # - experts.gate_projs.[0-9]+$
  # - experts.up_projs.[0-9]+$
  # - experts.down_projs.[0-9]+$
lora_modules_to_save:
# - lm_head
# - embed_tokens

chat_template: llama4
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 4096
sample_packing: true


gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 1e-4

bf16: true
tf32: true

logging_steps: 1
flash_attention: true

gradient_checkpointing: offload
gradient_checkpointing_kwargs:
  use_reentrant: false

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
  - auto_wrap
  - full_shard
fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
special_tokens:
  pad_token: <|finetune_right_pad|>
  eos_token: <|eot|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-4/do-no-use-fa2/scout-qlora-fsdp1.yaml
================================================
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
  # Automatically upload checkpoint and final model to HF
  # hub_model_id: username/custom_model_name


# torch_compile: true
plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true

llama4_linearized_experts: true
load_in_4bit: true
adapter: qlora
lora_r: 32
lora_alpha: 64
lora_target_modules:
  - self_attn.q_proj
  - self_attn.k_proj
  - self_attn.v_proj
  - self_attn.o_proj
  - shared_expert.gate_proj
  - shared_expert.up_proj
  - shared_expert.down_proj
    # - experts.gate_projs.[0-9]+$
    # - experts.up_projs.[0-9]+$
    # - experts.down_projs.[0-9]+$
lora_modules_to_save:
  - lm_head
  - embed_tokens

chat_template: llama4
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 4096
sample_packing: true


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5

bf16: true
tf32: true

logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
  - auto_wrap
  - full_shard
fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_activation_checkpointing: true
special_tokens:
  pad_token: <|finetune_right_pad|>
  eos_token: <|eot|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-4/do-no-use-fa2/scout-qlora-single-h100.yaml
================================================
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name


plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true

llama4_linearized_experts: true
load_in_4bit: true
adapter: qlora
lora_r: 32
lora_alpha: 64
lora_target_modules:
  - self_attn.q_proj
  - self_attn.k_proj
  - self_attn.v_proj
  - self_attn.o_proj
  - shared_expert.gate_proj
  - shared_expert.up_proj
  - shared_expert.down_proj
  # - experts.gate_projs.[0-9]+$
  # - experts.up_projs.[0-9]+$
  # - experts.down_projs.[0-9]+$
lora_modules_to_save:
  # - lm_head
  # - embed_tokens

lora_mlp_kernel: true
lora_qkv_kernel: true
lora_o_kernel: true

chat_template: llama4
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 4096  # up to 8k will work on a single H100
sample_packing: true


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 1e-4

bf16: true
tf32: true

logging_steps: 1
flash_attention: true

gradient_checkpointing: offload
gradient_checkpointing_kwargs:
  use_reentrant: false

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  pad_token: <|finetune_right_pad|>
  eos_token: <|eot|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-4/do-no-use-fa2/scout-vision-qlora-fsdp.yaml
================================================
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
processor_type: Llama4Processor
  # Automatically upload checkpoint and final model to HF
  # hub_model_id: username/custom_model_name


# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

sequence_len: 4096

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true

llama4_linearized_experts: true  # use Axolotl's customized model
load_in_4bit: true
adapter: qlora
lora_r: 32
lora_alpha: 64
lora_target_modules:
  - self_attn.q_proj
  - self_attn.k_proj
  - self_attn.v_proj
  - self_attn.o_proj
  - shared_expert.gate_proj
  - shared_expert.up_proj
  - shared_expert.down_proj
  - vision_adapter.mlp.fc1
  - vision_adapter.mlp.fc2
  # - experts.gate_projs.[0-9]+$
  # - experts.up_projs.[0-9]+$
  # - experts.down_projs.[0-9]+$
lora_modules_to_save:
  - lm_head
  - embed_tokens

chat_template: llama4
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 2e-5

bf16: true
tf32: true

logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
  - auto_wrap
  - full_shard
fsdp_config:
  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_activation_checkpointing: true
special_tokens:
  pad_token: <|finetune_right_pad|>
  eos_token: <|eot|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-4/scout-qlora-flexattn-fsdp2.yaml
================================================
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true

llama4_linearized_experts: true
load_in_4bit: true
adapter: qlora
lora_r: 32
lora_alpha: 64
lora_target_modules:
  - self_attn.q_proj
  - self_attn.k_proj
  - self_attn.v_proj
  - self_attn.o_proj
  - shared_expert.gate_proj
  - shared_expert.up_proj
  - shared_expert.down_proj
  # - experts.gate_projs.[0-9]+$
  # - experts.up_projs.[0-9]+$
  # - experts.down_projs.[0-9]+$
lora_modules_to_save:
  # - lm_head
  # - embed_tokens

chat_template: llama4
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 4096
sample_packing: true


gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 3
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 1e-4

bf16: true
tf32: true

logging_steps: 1
flex_attention: true
flex_attn_compile_kwargs:
  dynamic: false
  mode: max-autotune-no-cudagraphs

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
  - auto_wrap
  - full_shard
fsdp_config:
  fsdp_version: 2
  fsdp_offload_params: false
  # fsdp_cpu_ram_efficient_loading: true # does not work with load_in_8bit/4bit
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_reshard_after_forward: true
  fsdp_activation_checkpointing: true
special_tokens:
  pad_token: <|finetune_right_pad|>
  eos_token: <|eot|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-4/scout-qlora-single-h100-flex.yaml
================================================
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.liger.LigerPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true

llama4_linearized_experts: true  # needed with custom linearized experts model
load_in_4bit: true
adapter: qlora
lora_r: 32
lora_alpha: 64
lora_target_modules:
  - self_attn.q_proj
  - self_attn.k_proj
  - self_attn.v_proj
  - self_attn.o_proj
  - shared_expert.gate_proj
  - shared_expert.up_proj
  - shared_expert.down_proj
  # - experts.gate_projs.[0-9]+$  # optionally train the moe experts
  # - experts.up_projs.[0-9]+$
  # - experts.down_projs.[0-9]+$
lora_modules_to_save:
  # - lm_head  # needed if modifying vocabulary
  # - embed_tokens

lora_mlp_kernel: true
lora_qkv_kernel: true
lora_o_kernel: true

chat_template: llama4
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 4096  # up to 8k will work on a single H100
sample_packing: true


gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 1e-4

bf16: true
tf32: true

torch_compile: true
flex_attention: true
flex_attn_compile_kwargs:
  dynamic: false
  mode: max-autotune-no-cudagraphs

gradient_checkpointing: offload
gradient_checkpointing_kwargs:
  use_reentrant: false

logging_steps: 1
warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

weight_decay: 0.0
special_tokens:
  pad_token: <|finetune_right_pad|>
  eos_token: <|eot|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llama-4/scout-vision-qlora-fsdp2-flex.yaml
================================================
base_model: axolotl-quants/Llama-4-Scout-17B-16E-Linearized-bnb-nf4-bf16
model_type: Llama4ForConditionalGeneration
processor_type: Llama4Processor
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

sequence_len: 4096

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_glu_activation: true
liger_rms_norm: true
liger_layer_norm: true

llama4_linearized_experts: true  # use Axolotl's customized model
load_in_4bit: true
adapter: qlora
lora_r: 32
lora_alpha: 64
lora_target_modules:
  - self_attn.q_proj
  - self_attn.k_proj
  - self_attn.v_proj
  - self_attn.o_proj
  - shared_expert.gate_proj
  - shared_expert.up_proj
  - shared_expert.down_proj
  - vision_adapter.mlp.fc1
  - vision_adapter.mlp.fc2
  # - experts.gate_projs.[0-9]+$
  # - experts.up_projs.[0-9]+$
  # - experts.down_projs.[0-9]+$
lora_modules_to_save:
  - lm_head
  - embed_tokens

chat_template: llama4
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 1e-4

bf16: true
tf32: true

logging_steps: 1
flex_attention: true
flex_attn_compile_kwargs:
  dynamic: false
  mode: max-autotune-no-cudagraphs

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
  - auto_wrap
  - full_shard
fsdp_config:
  fsdp_version: 2
  fsdp_offload_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: Llama4TextDecoderLayer
  fsdp_state_dict_type: SHARDED_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_reshard_after_forward: true
  fsdp_activation_checkpointing: true
special_tokens:
  pad_token: <|finetune_right_pad|>
  eos_token: <|eot|>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/llava/lora-7b.yaml
================================================
base_model: llava-hf/llava-1.5-7b-hf
processor_type: AutoProcessor

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

chat_template: llava
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/out

adapter: lora
lora_model_dir:

sequence_len: 8192
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
logging_steps: 1
flash_attention: true
eager_attention:

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/magistral/README.md
================================================
# Finetune Magistral Small with Axolotl

Magistral Small is a 24B parameter opensource model from MistralAI found on HuggingFace at [2506](https://huggingface.co/mistralai/Magistral-Small-2506), [2507](https://huggingface.co/mistralai/Magistral-Small-2507) (see [Thinking](#thinking)), and [2509](https://huggingface.co/mistralai/Magistral-Small-2509) (see [Vision](#vision)). This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

MistralAI has also released a proprietary medium-sized version called Magistral Medium.

Thanks to the team at MistralAI for giving us early access to prepare for these releases.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

    Here is an example of how to install from pip:

```bash
# Ensure you have Pytorch installed (Pytorch 2.7.0 min)
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
```

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage

```bash
python scripts/cutcrossentropy_install.py | sh
```

3. Run the finetuning example:

```bash
axolotl train examples/magistral/magistral-small-qlora.yaml
```

This config uses about 24GB VRAM.

Let us know how it goes. Happy finetuning! 🚀

### Thinking

MistralAI has released their [2507](https://huggingface.co/mistralai/Magistral-Small-2507) model with thinking capabilities, enabling Chain-of-Thought reasoning with explicit thinking steps.

📚 **[See the Thinking fine-tuning guide →](./think/README.md)**

### Vision

MistralAI has released their [2509](https://huggingface.co/mistralai/Magistral-Small-2509) model with vision capabilities.

📚 **[See the Vision fine-tuning guide →](./vision/README.md)**

### Tips

- We recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo's files titled `SYSTEM_PROMPT.txt`.
- For inference, the official MistralAI team recommends `top_p: 0.95` and `temperature: 0.7` with `max_tokens: 40960`.
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

## Optimization Guides

- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)

## Limitations

We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only.

In addition, we do not support overriding tokens yet.

## Related Resources

- [MistralAI Magistral Blog](https://mistral.ai/news/magistral/)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


## Future Work

- Add parity to Preference Tuning, RL, etc.
- Add parity to other tokenizer configs like overriding tokens.


================================================
FILE: examples/magistral/magistral-small-fsdp-qlora.yaml
================================================
base_model: mistralai/Magistral-Small-2506

# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true
eval_sample_packing: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing:
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer
  fsdp_activation_checkpointing: true

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/magistral/magistral-small-qlora.yaml
================================================
base_model: mistralai/Magistral-Small-2506

# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/magistral/think/README.md
================================================
# Magistral Small Thinking Fine-tuning

This guide covers fine-tuning [Magistral Small 2507](https://huggingface.co/mistralai/Magistral-Small-2507) with thinking capabilities using Axolotl. The thinking model enables explicit Chain-of-Thought reasoning with separate thinking and response sections.

## Prerequisites

Before starting, ensure you have:

- Installed Axolotl (see [main README](../README.md))

## Getting Started

Run the thinking model fine-tuning:

```bash
axolotl train examples/magistral/think/magistral-small-think-qlora.yaml
```

This config uses about 19.1 GiB VRAM.

### Tips

- Dataset uses multi-content format with `type: thinking` support. See [Dataset Format](#dataset-format) below.
- You cannot mix `content: str` and `content: list[dict]`, otherwise, dataset loading will fail. Keep it consistent.

## Dataset Format

The thinking model requires the multi-content dataset format with support for an extra `role: thinking` within system and assistant messages.

Example format:

```json
{
    "messages": [
        {
            "role": "system",
            "content": [
                { "type": "text", "text": "{SYSTEM_PROMPT}"}
            ]
        },
        {
            "role": "user",
            "content": [
                { "type": "text", "text": "Solve this step by step: What is 15% of 240?"}
            ]
        },
        {
            "role": "assistant",
            "content": [
                {
                    "type": "thinking",
                    "thinking": "I need to calculate 15% of 240. First, I'll convert 15% to decimal: 0.15. Then multiply: 0.15 × 240 = 36."
                },
                {
                    "type": "text",
                    "text": "To find 15% of 240, I'll multiply 240 by 0.15:\n\n240 × 0.15 = 36\n\nTherefore, 15% of 240 is 36."
                }
            ]
        }
    ]
}
```

### Advanced Options

The `thinking` section supports an optional `closed` parameter:

```json
{
    "type": "thinking",
    "thinking": "Internal reasoning here...",
    "closed": true  // Default: true, controls adding the closing [/THINK] tag
}
```


================================================
FILE: examples/magistral/think/magistral-small-think-qlora.yaml
================================================
base_model: mistralai/Magistral-Small-2507

# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: Nanobit/text-think-2k-test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/magistral/vision/README.md
================================================
# Magistral Small Vision Fine-tuning

This guide covers fine-tuning [Magistral Small 2509](https://huggingface.co/mistralai/Magistral-Small-2509) with vision capabilities using Axolotl.

## Prerequisites

Before starting, ensure you have:

- Installed Axolotl from source (see [main README](../README.md))

## Getting started

1. Install the required vision lib:
    ```bash
    pip install 'mistral-common[opencv]==1.8.5'
    ```

2. Download the example dataset image:
   ```bash
   wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg
   ```

3. Run the fine-tuning:
   ```bash
   axolotl train examples/magistral/vision/magistral-small-vision-24B-qlora.yml
   ```

This config uses about 17GiB VRAM.

WARNING: The loss and grad norm will be much higher than normal at first. We suspect this to be inherent to the model as of the moment. If anyone would like to submit a fix for this, we are happy to take a look.

### Tips

Key differences from text-only model:
- `max_tokens: 131072` for inference
- Multi-modal dataset format required
- Sample packing not supported

## Dataset Format

The vision model requires multi-modal dataset format as documented [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).

One exception is that, passing `"image": PIL.Image` is not supported. MistralTokenizer only supports `path`, `url`, and `base64` for now.

Example:
```json
{
    "messages": [
        {"role": "system", "content": [{ "type": "text", "text": "{SYSTEM_PROMPT}"}]},
        {"role": "user", "content": [
            { "type": "text", "text": "What's in this image?"},
            {"type": "image", "path": "path/to/image.jpg" }
        ]},
        {"role": "assistant", "content": [{ "type": "text", "text": "..." }]},
    ],
}
```

## Limitations

- Sample Packing is not supported for multi-modality training currently.


================================================
FILE: examples/magistral/vision/magistral-small-vision-24B-qlora.yml
================================================
base_model: mistralai/Magistral-Small-2509
processor_type: AutoProcessor

# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_4bit: true

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

# sample dataset below requires downloading image in advance
# wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg
datasets:
  - path: Nanobit/text-vision-2k-test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

adapter: qlora
lora_model_dir:

sequence_len: 2048

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/mamba/config.yml
================================================
base_model: state-spaces/mamba-2.8b
# optionally might have model_type or tokenizer_type or tokenizer_config
model_type: MambaLMHeadModel
tokenizer_type: AutoTokenizer
tokenizer_config: EleutherAI/gpt-neox-20b
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 2048
sample_packing: false
pad_to_sequence_len: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 2
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 5e-5

train_on_inputs: false
group_by_length: true

bf16: auto
tf32: true

gradient_checkpointing: false
resume_from_checkpoint:
logging_steps: 1
flash_attention:

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/mimo/README.md
================================================
# Finetune Xiaomi's MiMo with Axolotl

[MiMo](https://huggingface.co/XiaomiMiMo/MiMo-7B-RL) is a family of models trained from scratch for reasoning tasks, incorporating **Multiple-Token Prediction (MTP)** as an additional training objective for enhanced performance and faster inference. Pre-trained on ~25T tokens with a three-stage data mixture strategy and optimized reasoning pattern density.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Run the finetuning example:

    ```bash
    axolotl train examples/mimo/mimo-7b-qlora.yaml
    ```

This config uses about 17.2 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀

### Tips

- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Limitations

**Cut Cross Entropy (CCE)**: Currently not supported. We plan to include CCE support for MiMo in the near future.

## Related Resources

- [MiMo Paper](https://arxiv.org/abs/2505.07608)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/mimo/mimo-7b-qlora.yaml
================================================
base_model: XiaomiMiMo/MiMo-7B-RL
trust_remote_code: true
revision_of_model: 6299b5a

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# CCE - N/A as of now
# plugins:
#   - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/ministral/README.md
================================================
# Finetune Ministral with Axolotl

Ministral is a family of openweight models from MistralAI found on [HuggingFace](mistralai/Ministral-8B-Instruct-2410). This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Run the finetuning example:

    ```bash
    axolotl train examples/ministral/ministral-small-qlora.yaml
    ```

This config uses about 8.76 GiB VRAM.

Let us know how it goes. Happy finetuning! 🚀

### Tips

- We recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo's files titled `SYSTEM_PROMPT.txt`.
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Limitations

We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only.

In addition, we do not support overriding tokens yet.

## Related Resources

- [MistralAI Ministral Blog](https://mistral.ai/news/ministraux)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


## Future Work

- Add parity to Preference Tuning, RL, etc.
- Add parity to other tokenizer configs like overriding tokens.


================================================
FILE: examples/ministral/ministral-small-qlora.yaml
================================================
base_model: mistralai/Ministral-8B-Instruct-2410

# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/ministral3/README.md
================================================
# Finetune Ministral3 with Axolotl

Ministral3 is a family of open-weight models from MistralAI found on [HuggingFace](https://huggingface.co/collections/mistralai/ministral-3). This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

Please see [Thinking](#thinking) and [Vision](#vision) for their respective fine-tuning.

Thanks to the team at MistralAI for giving us early access to prepare for these releases.

Note: This is still experimental given it is based on transformers v5 RC.

## Getting started

1. Install Axolotl from source following the [installation guide](https://docs.axolotl.ai/docs/installation.html#sec-edge-build).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Swap to the Axolotl transformers v5 branch

    ```bash
    cp examples/ministral3/ministral3-3b-qlora.yaml ministral3-3b-qlora.yaml

    git fetch
    git checkout transformers-v5

    # Install packages for transformers v5
    pip install -e .
    ```

4. Run the fine-tuning:

    ```bash
    axolotl train ministral3-3b-qlora.yaml
    ```

Let us know how it goes. Happy finetuning! 🚀


### Tips

- We recommend adding the same/similar SystemPrompt that the model is tuned for. You can find this within the repo's files titled `SYSTEM_PROMPT.txt`.
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

### Thinking

Ministral3 2512 model supports thinking capabilities, enabling Chain-of-Thought reasoning with explicit thinking steps.

📚 **[See the Thinking fine-tuning guide →](./think/README.md)**

### Vision

Ministral3 2512 model also supports vision capabilities.

📚 **[See the Vision fine-tuning guide →](./vision/README.md)**

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Limitations

We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only.

In addition, we do not support overriding tokens yet.

## Related Resources

- [MistralAI Mistral3 Blog](https://mistral.ai/news/mistral-3)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


## Future Work

- Add parity to Preference Tuning, RL, etc.
- Add parity to other tokenizer configs like overriding tokens.


================================================
FILE: examples/ministral3/ministral3-3b-qlora.yaml
================================================
base_model: mistralai/Ministral-3-3B-Reasoning-2512

# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true
scaling_softmax: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/ministral3/think/README.md
================================================
# Ministral3 2512 Thinking Fine-tuning

This guide covers fine-tuning [Ministral3 2512](https://huggingface.co/collections/mistralai/ministral-3) with thinking capabilities using Axolotl. The thinking model enables explicit Chain-of-Thought reasoning with separate thinking and response sections.

## Prerequisites

Before starting, ensure you have:

- Installed Axolotl (see [main README](../README.md))

## Getting Started

Run the thinking model fine-tuning:

```bash
axolotl train examples/ministral3/think/ministral3-3b-think-qlora.yaml
```

This config uses about 4.76 GiB VRAM.

### Tips

- Dataset uses multi-content format with `type: thinking` support. See [Dataset Format](#dataset-format) below.
- You cannot mix `content: str` and `content: list[dict]`, otherwise, dataset loading will fail. Keep it consistent.

## Dataset Format

The thinking model requires the multi-content dataset format with support for an extra `role: thinking` within system and assistant messages.

Example format:

```json
{
    "messages": [
        {
            "role": "system",
            "content": [
                { "type": "text", "text": "{SYSTEM_PROMPT}"}
            ]
        },
        {
            "role": "user",
            "content": [
                { "type": "text", "text": "Solve this step by step: What is 15% of 240?"}
            ]
        },
        {
            "role": "assistant",
            "content": [
                {
                    "type": "thinking",
                    "thinking": "I need to calculate 15% of 240. First, I'll convert 15% to decimal: 0.15. Then multiply: 0.15 × 240 = 36."
                },
                {
                    "type": "text",
                    "text": "To find 15% of 240, I'll multiply 240 by 0.15:\n\n240 × 0.15 = 36\n\nTherefore, 15% of 240 is 36."
                }
            ]
        }
    ]
}
```

### Advanced Options

The `thinking` section supports an optional `closed` parameter:

```json
{
    "type": "thinking",
    "thinking": "Internal reasoning here...",
    "closed": true  // Default: true, controls adding the closing [/THINK] tag
}
```


================================================
FILE: examples/ministral3/think/ministral3-3b-think-qlora.yaml
================================================
base_model: mistralai/Ministral-3-3B-Reasoning-2512

# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: Nanobit/text-think-2k-test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/ministral3/vision/README.md
================================================
# Ministral3 2512 Vision Fine-tuning

This guide covers fine-tuning [Ministral3 2512](https://huggingface.co/collections/mistralai/ministral-3) with vision capabilities using Axolotl.

## Prerequisites

Before starting, ensure you have:

- Installed Axolotl from source (see [main README](../README.md))

## Getting started

1. Install the required vision lib:
    ```bash
    pip install 'mistral-common[opencv]==1.8.6'
    ```

2. Download the example dataset image:
   ```bash
   wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg
   ```

3. Run the fine-tuning:
   ```bash
   axolotl train examples/ministral3/vision/ministral3-3b-vision-qlora.yml
   ```

WARNING: The loss and grad norm will be much higher than normal at first. We suspect this to be inherent to the model as of the moment. If anyone would like to submit a fix for this, we are happy to take a look.

### Tips

Key differences from text-only model:
- Multi-modal dataset format required
- Sample packing not supported

## Dataset Format

The vision model requires multi-modal dataset format as documented [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).

One exception is that, passing `"image": PIL.Image` is not supported. MistralTokenizer only supports `path`, `url`, and `base64` for now.

Example:
```json
{
    "messages": [
        {"role": "system", "content": [{ "type": "text", "text": "{SYSTEM_PROMPT}"}]},
        {"role": "user", "content": [
            { "type": "text", "text": "What's in this image?"},
            {"type": "image", "path": "path/to/image.jpg" }
        ]},
        {"role": "assistant", "content": [{ "type": "text", "text": "..." }]},
    ],
}
```

## Limitations

- Sample Packing is not supported for multi-modality training currently.


================================================
FILE: examples/ministral3/vision/ministral3-3b-vision-qlora.yml
================================================
base_model: mistralai/Ministral-3-3B-Reasoning-2512
processor_type: AutoProcessor

# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_4bit: true

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

# sample dataset below requires downloading image in advance
# wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg
datasets:
  - path: Nanobit/text-vision-2k-test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

adapter: qlora
lora_model_dir:

sequence_len: 2048

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/mistral/README.md
================================================
**Mistral 7B** is a language model with a total of 7.3 billion parameters, showcasing a notable performance across a variety of benchmarks.

Fine Tune:
```shell
accelerate launch -m axolotl.cli.train examples/mistral/config.yml

```

If you run into CUDA OOM, use deepspeed with config zero2.json:
```shell
accelerate launch -m axolotl.cli.train examples/mistral/config.yml --deepspeed deepspeed_configs/zero2.json
```


================================================
FILE: examples/mistral/bigstral/bigstral-ds-zero3.yaml
================================================
base_model: mistral-community/Mixtral-8x22B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

unfrozen_parameters:
  - ^lm_head.weight$
  - ^model.embed_tokens.weight$
  - model.layers.4[4-9]+.block_sparse_moe.gate
  - model.layers.4[4-9]+.block_sparse_moe.experts
  - model.layers.5[0-5]+.block_sparse_moe.gate
  - model.layers.5[0-5]+.block_sparse_moe.experts

model_config:
  output_router_logits: true

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.05
output_dir: ./outputs/out

sequence_len: 2048
sample_packing: true


gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 3
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0001

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

save_total_limit: 1
save_steps:

deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_params.json
weight_decay: 0.0
special_tokens:
  eos_token: "<|im_end|>"
tokens:
  - "<|im_start|>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/mistral/config.yml
================================================
base_model: mistralai/Mistral-7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/out

sequence_len: 8192
sample_packing: true

eval_sample_packing: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.000005

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/mistral/dpo/mistral-dpo-qlora.yml
================================================
#Note that we are switching from the regular chat template to chatml.
#If you experience problems with the special tokens, training for more epochs can help.
#After training, merge the model before inference otherwise you might
#face problems with the special tokens.

base_model: mistralai/Mistral-7B-Instruct-v0.2
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

chat_template: chatml
rl: dpo
datasets:
  - path: olivermolenschot/alpaca_messages_dpo_test
    type: chat_template.default
    field_messages: conversation
    field_chosen: chosen
    field_rejected: rejected
    message_property_mappings:
      role: role
      content: content

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/dpo-qlora

sequence_len: 2048
sample_packing: false


adapter: qlora
lora_model_dir:
lora_r: 8
lora_alpha: 16
lora_dropout: 0.2
lora_target_linear: true

lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj
lora_modules_to_save:
 - embed_tokens
 - lm_head

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 16
num_epochs: 6
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0001

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: false

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  bos_token: "<|im_start|>"
  eos_token: "<|im_end|>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/mistral/lora.yml
================================================
base_model: mistralai/Mistral-7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: lora
lora_model_dir:

sequence_len: 8192
sample_packing: true


lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/mistral/mistral-qlora-fsdp.yml
================================================
base_model: mistralai/Mixtral-8x7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.02
output_dir: ./outputs/qlora-out

model_config:
  output_router_logits: true

adapter: qlora
lora_model_dir:

sequence_len: 1024
sample_packing: false
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: paged_adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1

weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: false
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: false
  fsdp_transformer_layer_cls_to_wrap: MistralDecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/mistral/mixtral/mixtral-8x22b-qlora-fsdp.yml
================================================
base_model: mistral-community/Mixtral-8x22B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.02
output_dir: ./outputs/qlora-out

model_config:
  output_router_logits: true

adapter: qlora
lora_model_dir:

sequence_len: 1024
sample_packing: false
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1

weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/mistral/mixtral/mixtral-qlora-fsdp.yml
================================================
base_model: mistralai/Mixtral-8x7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.02
output_dir: ./outputs/qlora-out

model_config:
  output_router_logits: true

adapter: qlora
lora_model_dir:

sequence_len: 1024
sample_packing: false
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1

weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_transformer_layer_cls_to_wrap: MixtralSparseMoeBlock
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_forward_prefetch: false
  fsdp_backward_prefetch: BACKWARD_PRE
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/mistral/mixtral/mixtral.yml
================================================
base_model: mistralai/Mixtral-8x7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/qlora-out

## You can optionally freeze the entire model and unfreeze a subset of parameters
unfrozen_parameters:
#  - ^lm_head.weight$
#  - ^model.embed_tokens.weight$[:32000]
#  - model.layers.2[0-9]+.block_sparse_moe.gate
#  - model.layers.2[0-9]+.block_sparse_moe.experts
#  - model.layers.3[0-9]+.block_sparse_moe.gate
#  - model.layers.3[0-9]+.block_sparse_moe.experts

model_config:
  output_router_logits: true

adapter: qlora
lora_model_dir:

sequence_len: 4096
sample_packing: true


lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
#lora_target_modules:
#  - gate
#  - q_proj
#  - k_proj
#  - v_proj
#  - o_proj
#  - w1
#  - w2
#  - w3

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1

deepspeed: deepspeed_configs/zero2.json
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/mistral/mixtral/mixtral_22.yml
================================================
base_model: mistral-community/Mixtral-8x22B-v0.1
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

unfrozen_parameters:
  - ^lm_head.weight$
  - ^model.embed_tokens.weight$
  - model.layers.4[4-9]+.block_sparse_moe.gate
  - model.layers.4[4-9]+.block_sparse_moe.experts
  - model.layers.5[0-5]+.block_sparse_moe.gate
  - model.layers.5[0-5]+.block_sparse_moe.experts

model_config:
  output_router_logits: true

datasets:
  - path: yahma/alpaca-cleaned
    type: alpaca
output_dir: ./outputs/out

sequence_len: 8000
sample_packing: true


gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 3
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0001

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

save_total_limit: 1
save_steps:

deepspeed: deepspeed_configs/zero3_bf16_cpuoffload_all.json
weight_decay: 0.0
special_tokens:
  eos_token: "<|im_end|>"
tokens:
  - "<|im_start|>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/mistral/mps/lora-mps.yml
================================================
base_model: mistralai/Mistral-7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0
output_dir: ./outputs/lora-out
eval_sample_packing: false

adapter: lora
lora_model_dir:

sequence_len: 4096
sample_packing: true


lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 8
micro_batch_size: 1
num_epochs: 2
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
fp16: false
tf32: true

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: false
sdp_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/mistral/orpo/mistral-qlora-orpo.yml
================================================
base_model: mistralai/Mistral-7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

rl: orpo
orpo_alpha: 0.1
remove_unused_columns: false

chat_template: chatml
datasets:
  - path: argilla/ultrafeedback-binarized-preferences-cleaned
    type: chat_template.argilla
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/mistral-qlora-orpo-out

adapter: qlora
lora_model_dir:

sequence_len: 4096
sample_packing: false


lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/mistral/qlora.yml
================================================
base_model: mistralai/Mistral-7B-v0.1
# optionally might have model_type or tokenizer_type
model_type: MistralForCausalLM
tokenizer_type: LlamaTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/qlora-out

adapter: qlora
lora_model_dir:

sequence_len: 8192
sample_packing: true


lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/mistral-small/README.md
================================================
# Mistral Small 3.1/3.2 Fine-tuning

This guide covers fine-tuning [Mistral Small 3.1](mistralai/Mistral-Small-3.1-24B-Instruct-2503) and [Mistral Small 3.2](mistralai/Mistral-Small-3.2-24B-Instruct-2506) with vision capabilities using Axolotl.

## Prerequisites

Before starting, ensure you have:

- Installed Axolotl (see [Installation docs](https://docs.axolotl.ai/docs/installation.html))

## Getting Started

1. Install the required vision lib:
    ```bash
    pip install 'mistral-common[opencv]==1.8.5'
    ```

2. Download the example dataset image:
   ```bash
   wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg
   ```

3. Run the fine-tuning:
   ```bash
   axolotl train examples/mistral/mistral-small/mistral-small-3.1-24B-lora.yml
   ```

This config uses about 29.4 GiB VRAM.

## Dataset Format

The vision model requires multi-modal dataset format as documented [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).

One exception is that, passing `"image": PIL.Image` is not supported. MistralTokenizer only supports `path`, `url`, and `base64` for now.

Example:
```json
{
    "messages": [
        {"role": "system", "content": [{ "type": "text", "text": "{SYSTEM_PROMPT}"}]},
        {"role": "user", "content": [
            { "type": "text", "text": "What's in this image?"},
            {"type": "image", "path": "path/to/image.jpg" }
        ]},
        {"role": "assistant", "content": [{ "type": "text", "text": "..." }]},
    ],
}
```

## Limitations

- Sample Packing is not supported for multi-modality training currently.


================================================
FILE: examples/mistral-small/mistral-small-3.1-24B-lora.yml
================================================
base_model: mistralai/Mistral-Small-3.1-24B-Instruct-2503
processor_type: AutoProcessor

# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true

load_in_8bit: true

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

# sample dataset below requires downloading image in advance
# wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg
datasets:
  - path: Nanobit/text-vision-2k-test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

adapter: lora
lora_model_dir:

sequence_len: 2048
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/mistral4/README.md
================================================
# Finetune Mistral Small 4 with Axolotl

Mistral Small 4 is a 119B parameter (6.5B active) multimodal MoE model from MistralAI that unifies instruct, reasoning, and coding capabilities into a single model. It is available on HuggingFace at [Mistral-Small-4-119B-2603](https://huggingface.co/mistralai/Mistral-Small-4-119B-2603).

Thanks to the team at MistralAI for giving us early access to prepare for this release.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage

3. Install transformers from main

  ```bash
  pip install git+https://github.com/huggingface/transformers.git
  ```

4. Run one of the example configs:

  ```bash
  # text-only
  axolotl train examples/mistral4/qlora-text.yml  # no experts ~69 GiB, experts ~93 GiB
  axolotl train examples/mistral4/fft-text.yml

  # text + vision
  # run: wget https://huggingface.co/datasets/Nanobit/text-vision-2k-test/resolve/main/African_elephant.jpg
  axolotl train examples/mistral4/qlora-vision.yml  # no experts ~68 GiB
  axolotl train examples/mistral4/fft-vision.yml
  ```

Note: FFT configs provided as reference. Please adjust hyperparameters as needed.

## Reasoning Effort

The chat template supports a `reasoning_effort` variable to control the model's reasoning depth:

- `"none"` — instruct mode (default)
- `"high"` — reasoning mode with explicit thinking steps

Pass it via `chat_template_kwargs` under your dataset config:

```yaml
datasets:
  - path: your/dataset
    type: chat_template
    chat_template_kwargs:
      reasoning_effort: high
```

## Thinking Support

The chat template supports a `thinking` content type in assistant messages for training on reasoning traces (rendered as `[THINK]...[/THINK]` blocks).

To use thinking datasets, add the `thinking` mapping via `message_property_mappings`:

```yaml
datasets:
  - path: your/thinking-dataset
    type: chat_template
    message_property_mappings:
      role: role
      content: content
      thinking: thinking
    chat_template_kwargs:
      reasoning_effort: high
```

See the [Magistral thinking guide](../magistral/think/README.md) for dataset format details.

## Tips

- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
- The vision model requires multi-modal dataset format as documented [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).

## Related Resources

- [MistralAI Mistral Small 4 Blog](https://mistral.ai/news/mistral-small-4)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/mistral4/fft-text.yml
================================================
base_model: axolotl-ai-co/Mistral-Small-4-119B-2603-BF16

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
  - axolotl.integrations.kernels.KernelsPlugin
use_kernels: true
use_sonicmoe: true

# only train language model layers, freeze vision tower
unfrozen_parameters:
  - model.language_model.*
  - lm_head
  - embed_tokens

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

sequence_len: 2048
sample_packing: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 2e-5

bf16: true
tf32: true

logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

fsdp_version: 2
fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: false
  state_dict_type: FULL_STATE_DICT
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Mistral4DecoderLayer
  reshard_after_forward: true
  activation_checkpointing: true


================================================
FILE: examples/mistral4/fft-vision.yml
================================================
base_model: axolotl-ai-co/Mistral-Small-4-119B-2603-BF16
processor_type: AutoProcessor

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
  - axolotl.integrations.kernels.KernelsPlugin
use_kernels: true
use_sonicmoe: true

# vision requirements
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

datasets:
  - path: Nanobit/text-vision-2k-test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

sequence_len: 2048

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 2e-5

bf16: true
tf32: true

logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

fsdp_version: 2
fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: false
  state_dict_type: FULL_STATE_DICT
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Mistral4DecoderLayer
  reshard_after_forward: true
  activation_checkpointing: true


================================================
FILE: examples/mistral4/qlora-text.yml
================================================
base_model: axolotl-ai-co/Mistral-Small-4-119B-2603-BF16

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_4bit: true
quantize_moe_experts: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

adapter: qlora

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

# uncomment to train on expert layers
# lora_target_parameters:
#   - mlp.experts.gate_up_proj
#   - mlp.experts.down_proj
# lora_mlp_kernel: false
# lora_qkv_kernel: false
# lora_o_kernel: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
tf32: true

gradient_checkpointing: true
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0


================================================
FILE: examples/mistral4/qlora-vision.yml
================================================
base_model: axolotl-ai-co/Mistral-Small-4-119B-2603-BF16
processor_type: AutoProcessor

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_4bit: true
quantize_moe_experts: true

# vision chat template requirements
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

datasets:
  - path: Nanobit/text-vision-2k-test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

adapter: qlora

sequence_len: 2048

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

# uncomment to train on expert layers
# lora_target_parameters:
#   - mlp.experts.gate_up_proj
#   - mlp.experts.down_proj
# lora_mlp_kernel: false
# lora_qkv_kernel: false
# lora_o_kernel: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
tf32: true

gradient_checkpointing: true
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0


================================================
FILE: examples/nemotron/nemotron-mini-4b-qlora.yaml
================================================
base_model: nvidia/Nemotron-Mini-4B-Instruct

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/nemotron-mini-4b-qlora

adapter: qlora
lora_model_dir:

sequence_len: 4096
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - up_proj
  - down_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

special_tokens:


================================================
FILE: examples/olmo3/README.md
================================================
# Finetune Allenai's Olmo 3 with Axolotl

[Olmo 3](https://huggingface.co/collections/allenai/olmo-3) are a family of 7B and 32B models open source models trained by The Allen Institute for Artificial Intelligence.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Run the finetuning example:

    ```bash
    axolotl train examples/olmo3/olmo3-7b-qlora.yaml
    ```

This uses about 11.3 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀

### TIPS

- The example config can be re-used for Olmo and Olmo 2.
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [Olmo 3 Blog](https://allenai.org/blog/olmo3)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/olmo3/olmo3-7b-qlora.yaml
================================================
base_model: allenai/Olmo-3-7B-Instruct-SFT

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/orpheus/README.md
================================================
# Finetuning LLMs to output audio

In this example, we finetune Orpcanopylabs/orpheus-tts-0.1-pretrained (a LLaMA 3.2 3b model) to output audio.

The `finetune.yml` withe current settings will run on any Nvidia GPU with 45GB VRAM or more. If you adjust the batch size it can easily run on any GPU under 24GB.

## Dataset pre-processing for pre-training
If you are adding another voice in English, please jump ahead to finetuning pre-processing.

For this to work, we need to preprocess our dataset. Since we are expecting to output audio, we will need to add tokens to the tokenizer.

Using this code, it will download the SNAC model and add the correct tokens and upload the final dataset.

```python
import torch
from snac import SNAC
from datasets import load_dataset
from huggingface_hub import snapshot_download
from datasets import load_dataset
import random
import torchaudio.transforms as T
from transformers import AutoTokenizer
import os

my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"

dsn = my_original_dataset_name

snapshot_download(
    repo_id=dsn,
    repo_type="dataset",
    revision="main",
    max_workers=64,
)


ds = load_dataset(dsn, split="train")
ds_sample_rate = ds[0]["audio"]["sampling_rate"]

model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
model = model.to("mps")

def tokenise_audio(waveform):
  waveform = torch.from_numpy(waveform).unsqueeze(0)
  waveform = waveform.to(dtype=torch.float32)
  resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
  waveform = resample_transform(waveform)

  waveform = waveform.unsqueeze(0).to("cuda")

  #generate the codes from snac
  with torch.inference_mode():
    codes = model.encode(waveform)

  all_codes = []
  for i in range(codes[0].shape[1]):
    all_codes.append(codes[0][0][i].item()+128266)
    all_codes.append(codes[1][0][2*i].item()+128266+4096)
    all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))
    all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))
    all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))
    all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
    all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))


  return all_codes

def add_codes(example):
    # Always initialize codes_list to None
    codes_list = None

    try:
        answer_audio = example.get("audio")
        # If there's a valid audio array, tokenise it
        if answer_audio and "array" in answer_audio:
            audio_array = answer_audio["array"]
            codes_list = tokenise_audio(audio_array)
    except Exception as e:
        print(f"Skipping row due to error: {e}")
        # Keep codes_list as None if we fail
    example["codes_list"] = codes_list

    return example

ds = ds.map(add_codes, remove_columns=["audio"])

#@title Load Tokenizer
tokeniser_length = 128256
start_of_text = 128000
end_of_text = 128009

start_of_speech = tokeniser_length + 1
end_of_speech = tokeniser_length + 2

start_of_human = tokeniser_length + 3
end_of_human = tokeniser_length + 4

start_of_ai = tokeniser_length + 5
end_of_ai =  tokeniser_length + 6
pad_token = tokeniser_length + 7

audio_tokens_start = tokeniser_length + 10

tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"


tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
num_proc = os.cpu_count() - 2

ds = ds.filter(lambda x: x["codes_list"] is not None)
ds = ds.filter(lambda x: len(x["codes_list"]) > 0)

#@title Create Input Ids
def remove_duplicate_frames(example):
    vals = example["codes_list"]
    if len(vals) % 7 != 0:
        raise ValueError("Input list length must be divisible by 7")

    result = vals[:7]

    removed_frames = 0

    for i in range(7, len(vals), 7):
        current_first = vals[i]
        previous_first = result[-7]

        if current_first != previous_first:
            result.extend(vals[i:i+7])
        else:
            removed_frames += 1

    example["codes_list"] = result

    return example

ds = ds.map(remove_duplicate_frames, num_proc=num_proc)


def create_input_ids(example):
    text_ids = tokenizer.encode({example['text']},  add_special_tokens=True)
    text_ids.append(end_of_text)
    example["text_tokens"] = text_ids
    input_ids = (
        [start_of_human]
        + example["text_tokens"]
        + [end_of_human]
        + [start_of_ai]
        + [start_of_speech]
        + example["codes_list"]
        + [end_of_speech]
        + [end_of_ai]
    )
    example["input_ids"] = input_ids
    example["labels"] = input_ids
    example["attention_mask"] = [1] * len(input_ids)

    return example

ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])

#@title Remove unnecessary columns
columns_to_keep = ["input_ids", "labels", "attention_mask"]
columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]

ds = ds.remove_columns(columns_to_remove)

ds.push_to_hub(name_to_push_dataset_to)
```


## Finetune pre-processing
Use this code to add a new voice.

```python
import torch
from snac import SNAC
from datasets import load_dataset
from huggingface_hub import snapshot_download
from datasets import load_dataset
import random
import torchaudio.transforms as T
from transformers import AutoTokenizer
import os

my_original_dataset_name = "<huggingface-id-of-dataset-that-we-want-to-preprocess>"
name_to_push_dataset_to = "<huggingface-id-of-where-to-save-dataset>"

dsn = my_original_dataset_name

snapshot_download(
    repo_id=dsn,
    repo_type="dataset",
    revision="main",
    max_workers=64,
)


ds = load_dataset(dsn, split="train")
ds_sample_rate = ds[0]["audio"]["sampling_rate"]

model = SNAC.from_pretrained("hubertsiuzdak/snac_24khz")
model = model.to("mps")

def tokenise_audio(waveform):
  waveform = torch.from_numpy(waveform).unsqueeze(0)
  waveform = waveform.to(dtype=torch.float32)
  resample_transform = T.Resample(orig_freq=ds_sample_rate, new_freq=24000)
  waveform = resample_transform(waveform)

  waveform = waveform.unsqueeze(0).to("cuda")

  #generate the codes from snac
  with torch.inference_mode():
    codes = model.encode(waveform)

  all_codes = []
  for i in range(codes[0].shape[1]):
    all_codes.append(codes[0][0][i].item()+128266)
    all_codes.append(codes[1][0][2*i].item()+128266+4096)
    all_codes.append(codes[2][0][4*i].item()+128266+(2*4096))
    all_codes.append(codes[2][0][(4*i)+1].item()+128266+(3*4096))
    all_codes.append(codes[1][0][(2*i)+1].item()+128266+(4*4096))
    all_codes.append(codes[2][0][(4*i)+2].item()+128266+(5*4096))
    all_codes.append(codes[2][0][(4*i)+3].item()+128266+(6*4096))


  return all_codes

def add_codes(example):
    # Always initialize codes_list to None
    codes_list = None

    try:
        answer_audio = example.get("audio")
        # If there's a valid audio array, tokenise it
        if answer_audio and "array" in answer_audio:
            audio_array = answer_audio["array"]
            codes_list = tokenise_audio(audio_array)
    except Exception as e:
        print(f"Skipping row due to error: {e}")
        # Keep codes_list as None if we fail
    example["codes_list"] = codes_list

    return example

ds = ds.map(add_codes, remove_columns=["audio"])

#@title Load Tokenizer
tokeniser_length = 128256
start_of_text = 128000
end_of_text = 128009

start_of_speech = tokeniser_length + 1
end_of_speech = tokeniser_length + 2

start_of_human = tokeniser_length + 3
end_of_human = tokeniser_length + 4

start_of_ai = tokeniser_length + 5
end_of_ai =  tokeniser_length + 6
pad_token = tokeniser_length + 7

audio_tokens_start = tokeniser_length + 10

tokenizer_name = "canopylabs/orpheus-3b-0.1-pretrained"


tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
num_proc = os.cpu_count() - 2

ds = ds.filter(lambda x: x["codes_list"] is not None)
ds = ds.filter(lambda x: len(x["codes_list"]) > 0)

#@title Create Input Ids
def remove_duplicate_frames(example):
    vals = example["codes_list"]
    if len(vals) % 7 != 0:
        raise ValueError("Input list length must be divisible by 7")

    result = vals[:7]

    removed_frames = 0

    for i in range(7, len(vals), 7):
        current_first = vals[i]
        previous_first = result[-7]

        if current_first != previous_first:
            result.extend(vals[i:i+7])
        else:
            removed_frames += 1

    example["codes_list"] = result

    return example

ds = ds.map(remove_duplicate_frames, num_proc=num_proc)

tok_info = '''*** HERE you can modify the text prompt
i.e. if you wanted a multispeaker model like canopylabs/orpheus-3b-0.1-ft, you can pass:
f"{example["source"]}:  {example["text"]}", as is passed.
'''
print(tok_info)

def create_input_ids(example):
    text_ids = tokenizer.encode(f"{example['speaker_id']}: {example['text']}",  add_special_tokens=True)
    text_ids.append(end_of_text)
    example["text_tokens"] = text_ids
    input_ids = (
        [start_of_human]
        + example["text_tokens"]
        + [end_of_human]
        + [start_of_ai]
        + [start_of_speech]
        + example["codes_list"]
        + [end_of_speech]
        + [end_of_ai]
    )
    example["input_ids"] = input_ids
    example["labels"] = input_ids
    example["attention_mask"] = [1] * len(input_ids)

    return example

ds = ds.map(create_input_ids, num_proc=num_proc, remove_columns=["text", "codes_list"])

#@title Remove unnecessary columns
columns_to_keep = ["input_ids", "labels", "attention_mask"]
columns_to_remove = [col for col in ds.column_names if col not in columns_to_keep]

ds = ds.remove_columns(columns_to_remove)

ds.push_to_hub(name_to_push_dataset_to)
```

## Training
After preprocessing is done, fill out the blanks in finetune.yml and simply run `axolotl train finetune.yml`

## Inference
For inference, please refer to the original [orpheus github](https://github.com/canopyai/Orpheus-TTS/tree/main).


================================================
FILE: examples/orpheus/finetune.yml
================================================
base_model: canopylabs/orpheus-3b-0.1-pretrained

hub_model_id: <your-hub-model-id>

plugins:
  - axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_fused_linear_cross_entropy: true

datasets:
  - path: <your-hf-dataset-id>
    type:  # leave empty to load pre-tokenized
dataset_prepared_path: last_run_prepared
val_set_size: 0.01
output_dir: ./outputs/out

sequence_len: 8192
sample_packing: true


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 8
micro_batch_size: 4
num_epochs: 3
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5

bf16: auto
tf32: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 5
saves_per_epoch: 5
weight_decay: 0.05

special_tokens:
  pad_token: <custom_token_7>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/phi/README.md
================================================
# Phi

Due to some nuances with the phi code, please use deepspeed when training phi for full finetune.

```shell
accelerate launch -m axolotl.cli.train examples/phi/phi-ft.yml --deepspeed deepspeed_configs/zero1.json

# OR

python -m axolotl.cli.train examples/phi/phi-qlora.yml
```


================================================
FILE: examples/phi/lora-3.5.yaml
================================================
base_model: microsoft/Phi-3.5-mini-instruct
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: true
load_in_4bit: false

chat_template: phi_3
datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/lora-out

sequence_len: 4096
sample_packing: false


adapter: lora
lora_model_dir:
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 4
num_epochs: 2
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bfloat16: true
bf16: true
fp16:
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 4
weight_decay: 0.0

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/phi/phi-ft.yml
================================================
base_model: microsoft/phi-1_5
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

datasets:
  - path: garage-bAInd/Open-Platypus
    type: alpaca

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/phi-sft-out

sequence_len: 2048
sample_packing: true


adapter:
lora_model_dir:
lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_torch_fused
adam_beta2: 0.95
adam_epsilon: 0.00001
max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 0.000003

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: True
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1
resize_token_embeddings_to_32x: true
special_tokens:
  pad_token: "<|endoftext|>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/phi/phi-qlora.yml
================================================
base_model: microsoft/phi-1_5
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: garage-bAInd/Open-Platypus
    type: alpaca

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/phi-sft-out

sequence_len: 2048
sample_packing: true


adapter: qlora
lora_model_dir:
lora_r: 64
lora_alpha: 32
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_torch_fused
adam_beta2: 0.95
adam_epsilon: 0.00001
max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 0.000003

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: True
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1
resize_token_embeddings_to_32x: true
special_tokens:
  pad_token: "<|endoftext|>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/phi/phi2-ft.yml
================================================
base_model: microsoft/phi-2
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

datasets:
  - path: garage-bAInd/Open-Platypus
    type: alpaca

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/phi-sft-out

sequence_len: 2048
sample_packing: true


adapter:
lora_model_dir:
lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_torch_fused
adam_beta2: 0.95
adam_epsilon: 0.00001
max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 0.000003

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: True
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1
resize_token_embeddings_to_32x: true
special_tokens:
  pad_token: "<|endoftext|>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/phi/phi3-ft-fsdp.yml
================================================
base_model: microsoft/Phi-3-mini-4k-instruct
# optionally might have model_type or tokenizer_type
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca

dataset_prepared_path:
val_set_size: 0
output_dir: ./phi-sft-out

sequence_len: 4096
sample_packing: true

trust_remote_code: true

adapter:
lora_model_dir:
lora_r:
lora_alpha:
lora_dropout:
lora_target_linear:

wandb_project: phi3
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 12
num_epochs: 2
optimizer: adamw_torch_fused
adam_beta2: 0.95
adam_epsilon: 0.00001
max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 0.000003

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.1
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: Phi3DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
resize_token_embeddings_to_32x: true
special_tokens:
  pad_token: "<|endoftext|>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/phi/phi3-ft.yml
================================================
base_model: microsoft/Phi-3-mini-4k-instruct
# optionally might have model_type or tokenizer_type
trust_remote_code: true
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

chat_template: phi_3

datasets:
  - path: garage-bAInd/Open-Platypus
    type: alpaca:phi

dataset_prepared_path:
val_set_size: 0.01
output_dir: ./out

sequence_len: 4096
sample_packing: true


adapter: lora
lora_model_dir:
lora_r: 64
lora_alpha: 32
lora_dropout: 0.05
lora_target_linear: true

gradient_accumulation_steps: 1
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_torch_fused
adam_beta2: 0.95
adam_epsilon: 0.00001
max_grad_norm: 1.0
lr_scheduler: cosine
learning_rate: 5.0e-6

bf16: auto

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: True
early_stopping_patience: 3
logging_steps: 1
flash_attention: true

eval_steps: 1000
save_steps: 5000
eval_batch_size: 2
eval_sample_packing: false
eval_table_size: 2
eval_max_new_tokens: 32
eval_causal_lm_metrics: ["perplexity"]
do_causal_lm_eval: true

warmup_ratio: 0.2
debug: true
weight_decay: 0.1
resize_token_embeddings_to_32x: true

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/pixtral/lora-12b.yml
================================================
base_model: mistral-community/pixtral-12b
processor_type: AutoProcessor

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

chat_template: pixtral
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]
dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/out

adapter: lora
lora_model_dir:

sequence_len: 8192
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:
  pad_token: <pad>

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/plano/README.md
================================================
# Finetune Katanemo's Plano-Orchestrator with Axolotl

[Plano-Orchestrator](https://huggingface.co/collections/katanemo/plano-orchestrator) is a family of 4B and 30B-A3B routing and orchestration models designed for multi-agent systems. It analyzes user intent and conversation context to make precise routing decisions, excelling at multi-turn context understanding, multi-intent detection, and context-dependent routing.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Run the finetuning example:

    ```bash
    axolotl train examples/plano/plano-4b-qlora.yaml
    ```

This config uses about 5.1 GiB VRAM. Let us know how it goes. Happy finetuning! 🚀

### Orchestration Prompt

Plano-Orchestrator uses a specific orchestration prompt format for routing/agent decisions. Please check the [official model card](https://huggingface.co/katanemo/Plano-Orchestrator-4B) for proper prompt formatting and the `ORCHESTRATION_PROMPT` template.

### Tips

- To use the larger [Plano-Orchestrator-30B-A3B](https://huggingface.co/katanemo/Plano-Orchestrator-30B-A3B) MoE model, simply change `base_model: katanemo/Plano-Orchestrator-30B-A3B` in the config and enable multi-GPU training if needed.
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [Plano GitHub](https://github.com/katanemo/plano)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/plano/plano-4b-qlora.yaml
================================================
base_model: katanemo/Plano-Orchestrator-4B

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

chat_template: qwen3
datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qat_nvfp4/Gemma3-12B_baseline.yml
================================================
base_model: google/gemma-3-12b-it
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: gemma3
datasets:
  - path: tatsu-lab/alpaca
    type: alpaca

output_dir: ./outputs/out_gemma/

sequence_len: 8096
sample_packing: true
flash_attention: true

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 16

num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 4e-5

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Gemma3DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qat_nvfp4/Gemma3-12B_qat.yml
================================================
base_model: google/gemma-3-12b-it
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: gemma3
datasets:
  - path: tatsu-lab/alpaca
    type: alpaca

output_dir: ./outputs/qat_out_gemma/

sequence_len: 8096
sample_packing: true
flash_attention: true

qat:
  activation_dtype: nvfp4
  weight_dtype: nvfp4
  group_size: 16 # only group_size of 16 is supported with nvfp4

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 16

num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 4e-5

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Gemma3DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qat_nvfp4/Math-Gemma3-12B_baseline.yml
================================================
base_model: google/gemma-3-12b-it
# Math finetuning configuration for Gemma3-12B
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: gemma3
datasets:
  - path: AI-MO/NuminaMath-CoT
    type: chat_template

output_dir: ./outputs/out_math_gemma/

sequence_len: 4096
sample_packing: true
flash_attention: true

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 8

num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 3e-5

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Gemma3DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qat_nvfp4/Math-Gemma3-12B_qat.yml
================================================
base_model: google/gemma-3-12b-it
# Math finetuning configuration for Gemma3-12B
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: gemma3
datasets:
  - path: AI-MO/NuminaMath-CoT
    type: chat_template

output_dir: ./outputs/qat_out_math_gemma/

sequence_len: 4096
sample_packing: true
flash_attention: true

qat:
  activation_dtype: nvfp4
  weight_dtype: nvfp4
  group_size: 16 # only group_size of 16 is supported with nvfp4

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 8

num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 3e-5

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Gemma3DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qat_nvfp4/Math-Gemma3-27B_baseline.yml
================================================
base_model: google/gemma-3-27b-it
# Math finetuning configuration for Gemma3-27B
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: gemma3
datasets:
  - path: AI-MO/NuminaMath-CoT
    type: chat_template

output_dir: ./outputs/out_math_gemma27/

sequence_len: 4096
sample_packing: true
flash_attention: true

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 16

num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 5e-6
eta_min: 7e-7

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Gemma3DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qat_nvfp4/Math-Gemma3-27B_qat.yml
================================================
base_model: google/gemma-3-27b-it
# Math finetuning configuration for Gemma3-27B
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: gemma3
datasets:
  - path: AI-MO/NuminaMath-CoT
    type: chat_template

output_dir: ./outputs/qat_out_math_gemma27/

sequence_len: 4096
sample_packing: true
flash_attention: true

qat:
  activation_dtype: nvfp4
  weight_dtype: nvfp4
  group_size: 16 # only group_size of 16 is supported with nvfp4

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 16

num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 5e-6
eta_min: 7e-7

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Gemma3DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qat_nvfp4/Math-Qwen2.5-72B_baseline.yml
================================================
base_model: Qwen/Qwen2.5-72B
# Math finetuning configuration for Qwen2.5-72B (non-instruct)
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: qwen_25
datasets:
  - path: AI-MO/NuminaMath-CoT
    type: chat_template

output_dir: ./outputs/out_math_72b/

sequence_len: 4096
sample_packing: true
flash_attention: true

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 8
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 5e-6
eta_min: 7e-7

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen2DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qat_nvfp4/Math-Qwen2.5-72B_qat.yml
================================================
base_model: Qwen/Qwen2.5-72B
# Math finetuning configuration for Qwen2.5-72B (non-instruct)
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: qwen_25
datasets:
  - path: AI-MO/NuminaMath-CoT
    type: chat_template

output_dir: ./outputs/qat_out_math_72b/

sequence_len: 4096
sample_packing: true
flash_attention: true

qat:
  activation_dtype: nvfp4
  weight_dtype: nvfp4
  group_size: 16 # only group_size of 16 is supported with nvfp4

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 8
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 5e-6
eta_min: 7e-7

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen2DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qat_nvfp4/Qwen2.5-72B_baseline.yml
================================================
base_model: Qwen/Qwen2.5-72B
# Alpaca finetuning configuration for Qwen2.5-72B
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: qwen_25
datasets:
  - path: tatsu-lab/alpaca
    type: alpaca

output_dir: ./outputs/out_qwen72b/

sequence_len: 8096
sample_packing: true
flash_attention: true

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 16

num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen2DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qat_nvfp4/Qwen2.5-72B_qat.yml
================================================
base_model: Qwen/Qwen2.5-72B
# Alpaca finetuning configuration for Qwen2.5-72B
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true
seed: 42
chat_template: qwen_25
datasets:
  - path: tatsu-lab/alpaca
    type: alpaca

output_dir: ./outputs/qat_out_qwen72b/

sequence_len: 8096
sample_packing: true
flash_attention: true

qat:
  activation_dtype: nvfp4
  weight_dtype: nvfp4
  group_size: 16 # only group_size of 16 is supported with nvfp4

wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 16

num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

# evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp_version: 2

fsdp_config:
  offload_params: false
  cpu_ram_efficient_loading: true
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen2DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true

special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qwen2/adamw-pretrain-fsdp2.yaml
================================================
base_model: Qwen/Qwen2.5-0.5B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

# Use random initialization for fair comparison
reinit_weights: true

load_in_8bit: false
load_in_4bit: false
strict: false

# Pretraining dataset
pretraining_dataset:
  - path: allenai/c4
    name: en
    type: pretrain
    split: train

dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/compare-adamw-pretrain

sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true

wandb_project: dist_muon
wandb_entity:
wandb_watch:
wandb_name: adamw
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 4
num_epochs: 1
max_steps: 305

# AdamW optimizer settings (standard LR for AdamW)
optimizer: adamw_torch_fused
learning_rate: 0.0002
weight_decay: 0.01
lr_scheduler: cosine

train_on_inputs: true
group_by_length: false
bf16: auto
fp16: false
tf32: false

gradient_checkpointing: false
logging_steps: 1
flash_attention: true

warmup_steps: 10
evals_per_epoch: 0
saves_per_epoch: 1

# Reproducibility
seed: 42

fsdp_config:
  fsdp_version: 2
  fsdp_offload_params: false
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_cpu_ram_efficient_loading: false
  fsdp_reshard_after_forward: true

special_tokens:


================================================
FILE: examples/qwen2/dpo.yaml
================================================
base_model: Qwen/Qwen2.5-0.5B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

chat_template: qwen_25
rl: dpo
datasets:
  - path: fozziethebeat/alpaca_messages_2k_dpo_test
    type: chat_template.default
    field_messages: conversation
    field_chosen: chosen
    field_rejected: rejected
    message_property_mappings:
      role: role
      content: content
    roles:
      system:
        - system
      user:
        - user
      assistant:
        - assistant

dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/dpo-out

sequence_len: 2048
sample_packing: false


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qwen2/muon-pretrain-fsdp2.yaml
================================================
base_model: Qwen/Qwen2.5-0.5B
model_type: AutoModelForCausalLM
tokenizer_type: AutoTokenizer

# Use random initialization for fair comparison
reinit_weights: true

load_in_8bit: false
load_in_4bit: false
strict: false

# Pretraining dataset
pretraining_dataset:
  - path: allenai/c4
    name: en
    type: pretrain
    split: train

dataset_prepared_path:
val_set_size: 0.0
output_dir: ./outputs/compare-muon-pretrain

sequence_len: 2048
sample_packing: true
pad_to_sequence_len: true

wandb_project: dist_muon
wandb_entity:
wandb_watch:
wandb_name: muon
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 4
num_epochs: 1
max_steps: 305

# Muon optimizer settings
optimizer: muon
learning_rate: 0.02
weight_decay: 0.01
lr_scheduler: cosine

train_on_inputs: true
group_by_length: false
bf16: auto
fp16: false
tf32: false

gradient_checkpointing: false
logging_steps: 1
flash_attention: true

warmup_steps: 10
evals_per_epoch: 0
saves_per_epoch: 1

# Reproducibility
seed: 42

fsdp_config:
  fsdp_version: 2
  fsdp_offload_params: false
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_cpu_ram_efficient_loading: false
  fsdp_reshard_after_forward: true

special_tokens:


================================================
FILE: examples/qwen2/prm.yaml
================================================
base_model: Qwen/Qwen2.5-3B
# optionally might have model_type or tokenizer_type
model_type: AutoModelForTokenClassification
num_labels: 2
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

process_reward_model: true
chat_template:
datasets:
  - path: trl-lib/math_shepherd
    type: stepwise_supervised
    step_separator: "\n"
    max_completion_length:
    train_on_last_step_only: false

val_set_size: 0.2
output_dir: ./outputs/out
remove_unused_columns: false

sequence_len: 2048
sample_packing: false
eval_sample_packing: false


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:


gradient_accumulation_steps: 1
micro_batch_size: 8
eval_batch_size: 8
num_epochs: 1
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32:
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
eval_steps: 100
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qwen2/qlora-fsdp.yaml
================================================
base_model: Qwen/Qwen2-7B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

trust_remote_code: true

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/out

sequence_len: 2048
sample_packing: true
eval_sample_packing: true


adapter: qlora
lora_model_dir:
lora_r: 32
lora_alpha: 64
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: Qwen2DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qwen2/reward-model.yaml
================================================
base_model:  Qwen/Qwen2.5-0.5B
# optionally might have model_type or tokenizer_type
model_type: AutoModelForSequenceClassification
num_labels: 1
tokenizer_type: AutoTokenizer
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

reward_model: true
chat_template: qwen_25
datasets:
  - path: argilla/distilabel-intel-orca-dpo-pairs
    type: bradley_terry.chat_template
val_set_size: 0.0
output_dir: ./outputs/out
remove_unused_columns: false

sequence_len: 2048
sample_packing: false
eval_sample_packing: false


wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qwen2-vl/lora-7b.yaml
================================================
base_model: Qwen/Qwen2-VL-7B-Instruct
processor_type: AutoProcessor

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

chat_template: qwen2_vl
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

adapter: lora
lora_model_dir:

sequence_len: 8192
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
logging_steps: 1
flash_attention: true
eager_attention:

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qwen2_5-vl/lora-7b.yaml
================================================
base_model: Qwen/Qwen2.5-VL-7B-Instruct
processor_type: AutoProcessor

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

chat_template: qwen2_vl
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

adapter: lora
lora_model_dir:

sequence_len: 8192
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
logging_steps: 1
flash_attention: true
eager_attention:

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qwen3/32b-qlora.yaml
================================================
base_model: Qwen/Qwen3-32B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
strict: false

chat_template: qwen3
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
val_set_size: 0.0
output_dir: ./outputs/out
dataset_prepared_path: last_run_prepared

sequence_len: 2048
sample_packing: true
eval_sample_packing: true


load_in_4bit: true
adapter: qlora
lora_r: 16
lora_alpha: 32
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - down_proj
  - up_proj
lora_mlp_kernel: true
lora_qkv_kernel: true
lora_o_kernel: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: offload
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qwen3/8b-qat-fsdp2.yml
================================================
base_model: Qwen/Qwen3-8B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: false
strict: false

plugins:
  - axolotl.integrations.liger.LigerPlugin

liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca

output_dir: ./outputs/qat_out/

sequence_len: 2048
sample_packing: true
flex_attention: true


flex_attn_compile_kwargs:
  dynamic: false
  mode: max-autotune-no-cudagraphs

qat:
  activation_dtype: int8
  weight_dtype: int4
  group_size: 256
  fake_quant_after_n_steps: 1000

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 2
max_steps: 2000
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 2e-5

bf16: true
tf32: true

resume_from_checkpoint:
logging_steps: 1

evals_per_epoch: 1
saves_per_epoch: 1

warmup_ratio: 0.1
weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap

fsdp_config:
  fsdp_version: 2
  fsdp_offload_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
  fsdp_reshard_after_forward: true
  fsdp_activation_checkpointing: true

special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qwen3/README.md
================================================
# Finetune Qwen3 with Axolotl

[Qwen3](https://huggingface.co/collections/Qwen/qwen3) are a family of open source models trained by Alibaba.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Run the finetuning example:

    ```bash
    axolotl train examples/qwen3/32b-qlora.yaml
    ```

Let us know how it goes. Happy finetuning! 🚀

### Chat template masking a few tokens off

If you notice that the `chat_template` masking for assistant prompts are off by a few tokens, please ensure that you are adding the below to the yaml.

```yaml
chat_template: qwen3
```

### TIPS

- For inference, please check the official model card as it depends on your reasoning mode.
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [Qwen3 Blog](https://qwenlm.github.io/blog/qwen3/)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/qwen3/qlora-fsdp.yaml
================================================
base_model: Qwen/Qwen3-8B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

load_in_8bit: false
load_in_4bit: true
strict: false

datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/out

sequence_len: 2048
sample_packing: true
eval_sample_packing: true


adapter: qlora
lora_model_dir:
lora_r: 32
lora_alpha: 64
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_fused
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
fsdp:
  - full_shard
  - auto_wrap
fsdp_config:
  fsdp_limit_all_gathers: true
  fsdp_sync_module_states: true
  fsdp_offload_params: true
  fsdp_use_orig_params: false
  fsdp_cpu_ram_efficient_loading: true
  fsdp_auto_wrap_policy: TRANSFORMER_BASED_WRAP
  fsdp_transformer_layer_cls_to_wrap: Qwen3DecoderLayer
  fsdp_state_dict_type: FULL_STATE_DICT
  fsdp_sharding_strategy: FULL_SHARD
special_tokens:

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qwen3/reward-model.yaml
================================================
base_model: Skywork/Skywork-Reward-V2-Qwen3-8B
model_type: AutoModelForSequenceClassification
num_labels: 1

reward_model: true
center_rewards_coefficient: 0.01  # Incentivize mean-zero rewards for improved stability
chat_template: qwen3
datasets:
  - path: argilla/distilabel-intel-orca-dpo-pairs
    type: bradley_terry.chat_template

val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 8192
sample_packing: false
eval_sample_packing: false
pad_to_sequence_len: true

deepspeed: deepspeed_configs/zero1.json

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
eval_batch_size: 1
num_epochs: 3
optimizer: adamw_bnb_8bit
lr_scheduler: linear
learning_rate: 0.00002

bf16: true
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
warmup_ratio: 0.1
logging_steps: 1
weight_decay: 0.01


================================================
FILE: examples/qwen3-next/README.md
================================================
# Finetune Qwen3-Next with Axolotl

[Qwen3-Next](https://huggingface.co/collections/Qwen/qwen3-next-68c25fd6838e585db8eeea9d) represents the next-generation foundation models optimized for extreme context length and large-scale parameter efficiency. The series introduces architectural innovations including Hybrid Attention (Gated DeltaNet + Gated Attention), High-Sparsity MoE with 1:50 activation ratio, and Multi-Token Prediction for enhanced performance and inference acceleration.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Install FLA for improved performance
```bash
pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
```

4. Run the finetuning example:

```bash
axolotl train examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml
```

This config uses about ~47 GiB (no target experts) and ~71GiB (target experts) VRAM.

Let us know how it goes. Happy finetuning! 🚀

### TIPS

- For inference, you can experiment with `temperature: 0.7`, `top_p: 0.8`, `top_k: 20`, and `min_p: 0`.
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config. See [Multi-GPU](#optimization-guides) section below.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

## Optimization Guides

- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)

## Related Resources

- [Qwen3-Next Blog](https://qwenlm.github.io/blog/qwen3_next/)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/qwen3-next/qwen3-next-80b-a3b-qlora.yaml
================================================
base_model: Qwen/Qwen3-Next-80B-A3B-Instruct

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

quantize_moe_experts: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 16
lora_alpha: 8
lora_dropout: 0
lora_target_modules:
  - linear_attn.in_proj_ba
  - linear_attn.in_proj_qkvz
  - linear_attn.out_proj
  - shared_expert.up_proj
  - shared_expert.down_proj
  - shared_expert.gate_proj
  - shared_expert_gate
  - q_proj
  - v_proj
  - k_proj
  - o_proj

# lora_target_parameters:
#   - mlp.experts.gate_up_proj
#   - mlp.experts.down_proj

lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/qwen3.5/122b-a10b-moe-qlora-fsdp.yaml
================================================
base_model: Qwen/Qwen3.5-122B-A10B

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
strict: false

chat_template: qwen3_5
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
val_set_size: 0.0
output_dir: ./outputs/out
dataset_prepared_path: last_run_prepared

sequence_len: 2048
sample_packing: true

load_in_4bit: true
quantize_moe_experts: true
adapter: qlora
lora_r: 16
lora_alpha: 32
lora_dropout: 0
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
# Regex matching to target shared experts too
# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'

# Target experts
# lora_target_parameters:
#   - mlp.experts.gate_up_proj
#   - mlp.experts.down_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

fsdp_config:
  fsdp_version: 2
  offload_params: true
  cpu_ram_efficient_loading: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen3_5MoeDecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true


================================================
FILE: examples/qwen3.5/122b-a10b-moe-qlora.yaml
================================================
base_model: Qwen/Qwen3.5-122B-A10B

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
strict: false

chat_template: qwen3_5
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
val_set_size: 0.0
output_dir: ./outputs/out
dataset_prepared_path: last_run_prepared

sequence_len: 2048
sample_packing: true

load_in_4bit: true
quantize_moe_experts: true
adapter: qlora
lora_r: 16
lora_alpha: 32
lora_dropout: 0
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj

# Regex matching to target shared experts too
# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'

# Target experts
# lora_target_parameters:
#   - mlp.experts.gate_up_proj
#   - mlp.experts.down_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/qwen3.5/27b-fft.yaml
================================================
base_model: Qwen/Qwen3.5-27B
# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# Full fine-tune (FFT) of the text-only path of Qwen3.5-27B.

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
strict: false

chat_template: qwen3_5
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
val_set_size: 0.0
output_dir: ./outputs/out
dataset_prepared_path: last_run_prepared

sequence_len: 2048
sample_packing: true

# Freeze vision encoder
unfrozen_parameters:
  - model\.language_model\..*
  - lm_head\..*

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/qwen3.5/27b-qlora-fsdp.yaml
================================================
base_model: Qwen/Qwen3.5-27B

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
strict: false

chat_template: qwen3_5
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
val_set_size: 0.0
output_dir: ./outputs/out
dataset_prepared_path: last_run_prepared

sequence_len: 2048
sample_packing: true

load_in_4bit: true
adapter: qlora
lora_r: 16
lora_alpha: 32
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - down_proj
  - up_proj
  # Uncomment below to also target the linear attention projections.
  # These use separate in_proj_qkv / in_proj_z / out_proj (Qwen3.5-specific).
  # - linear_attn.in_proj_qkv
  # - linear_attn.in_proj_z
  # - linear_attn.out_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

fsdp_config:
  fsdp_version: 2
  offload_params: false
  cpu_ram_efficient_loading: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen3_5DecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true


================================================
FILE: examples/qwen3.5/27b-qlora.yaml
================================================
base_model: Qwen/Qwen3.5-27B

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
strict: false

chat_template: qwen3_5
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
val_set_size: 0.0
output_dir: ./outputs/out
dataset_prepared_path: last_run_prepared

sequence_len: 2048
sample_packing: true

load_in_4bit: true
adapter: qlora
lora_r: 16
lora_alpha: 32
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - down_proj
  - up_proj
  # Uncomment below to also target the linear attention projections.
  # These use separate in_proj_qkv / in_proj_z / out_proj (Qwen3.5-specific).
  # - linear_attn.in_proj_qkv
  # - linear_attn.in_proj_z
  # - linear_attn.out_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/qwen3.5/35b-a3b-moe-qlora-fsdp.yaml
================================================
base_model: Qwen/Qwen3.5-35B-A3B

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
strict: false

chat_template: qwen3_5
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
val_set_size: 0.0
output_dir: ./outputs/out
dataset_prepared_path: last_run_prepared

sequence_len: 2048
sample_packing: true

load_in_4bit: true
quantize_moe_experts: true
adapter: qlora
lora_r: 16
lora_alpha: 32
lora_dropout: 0
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj

# Regex matching to target shared experts too
# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'

# Target experts
# lora_target_parameters:
#   - mlp.experts.gate_up_proj
#   - mlp.experts.down_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:

fsdp_config:
  fsdp_version: 2
  offload_params: true
  cpu_ram_efficient_loading: false
  auto_wrap_policy: TRANSFORMER_BASED_WRAP
  transformer_layer_cls_to_wrap: Qwen3_5MoeDecoderLayer
  state_dict_type: FULL_STATE_DICT
  sharding_strategy: FULL_SHARD
  reshard_after_forward: true
  activation_checkpointing: true


================================================
FILE: examples/qwen3.5/35b-a3b-moe-qlora.yaml
================================================
base_model: Qwen/Qwen3.5-35B-A3B

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
strict: false

chat_template: qwen3_5
datasets:
  - path: mlabonne/FineTome-100k
    type: chat_template
    split: train[:20%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value
val_set_size: 0.0
output_dir: ./outputs/out
dataset_prepared_path: last_run_prepared

sequence_len: 2048
sample_packing: true

load_in_4bit: true
quantize_moe_experts: true
adapter: qlora
lora_r: 16
lora_alpha: 32
lora_dropout: 0
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj

# Regex matching to target shared experts too
# lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'

# Target experts
# lora_target_parameters:
#   - mlp.experts.gate_up_proj
#   - mlp.experts.down_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 2
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_torch_4bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

lora_mlp_kernel: false
lora_qkv_kernel: false
lora_o_kernel: false

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 4
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/qwen3.5/9b-fft-vision.yaml
================================================
base_model: Qwen/Qwen3.5-9B
processor_type: AutoProcessor

# Required for multimodal training
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

chat_template: qwen3_5
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

sequence_len: 4096
pad_to_sequence_len: false

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: examples/qwen3.5/9b-lora-vision.yaml
================================================
base_model: Qwen/Qwen3.5-9B
processor_type: AutoProcessor

# These 3 lines are required for vision/multimodal training
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

chat_template: qwen3_5
datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]

dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

adapter: lora
lora_model_dir:

sequence_len: 8192
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
# Targets the language model attention and MLP layers.
lora_target_modules:
  - q_proj
  - k_proj
  - v_proj
  - o_proj
  - down_proj
  - up_proj
  # Uncomment to also target the linear attention (GatedDeltaNet) projections:
  # - linear_attn.in_proj_qkv
  # - linear_attn.in_proj_z
  # - linear_attn.out_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0


================================================
FILE: examples/qwen3.5/README.md
================================================
# Finetune Qwen3.5 with Axolotl

[Qwen3.5](https://huggingface.co/collections/Qwen/qwen35) is a hybrid architecture model series combining Gated DeltaNet linear attention with standard Transformer attention. All Qwen3.5 models are early-fusion vision-language models: dense variants use `Qwen3_5ForConditionalGeneration` and MoE variants use `Qwen3_5MoeForConditionalGeneration`.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Install FLA for sample packing support with the Gated DeltaNet linear attention layers:
  ```bash
  pip3 uninstall -y causal-conv1d && pip3 install flash-linear-attention==0.4.1
  ```
  > FLA is required when `sample_packing: true`. Without it, training raises a `RuntimeError` on packed sequences. Vision configs use `sample_packing: false` so FLA is optional there.

4. Pick any config from the table below and run:

    ```bash
    axolotl train examples/qwen3.5/<config>.yaml
    ```

Available configs:

| Config | Model | Type | Peak VRAM |
|---|---|---|---|
| `9b-lora-vision.yaml` | Qwen3.5-9B | Vision+text LoRA, single GPU | — |
| `9b-fft-vision.yaml` | Qwen3.5-9B | Vision+text FFT, single GPU | ~61 GiB |
| `27b-qlora.yaml` | Qwen3.5-27B | Dense, text-only QLoRA | ~47 GiB |
| `27b-fft.yaml` | Qwen3.5-27B | Dense, text-only FFT (vision frozen) | ~53 GiB |
| `27b-qlora-fsdp.yaml` | Qwen3.5-27B | Dense, text-only QLoRA + FSDP2 | — |
| `35b-a3b-moe-qlora.yaml` | Qwen3.5-35B-A3B | MoE, text-only QLoRA | — |
| `35b-a3b-moe-qlora-fsdp.yaml` | Qwen3.5-35B-A3B | MoE, text-only QLoRA + FSDP2 | — |
| `122b-a10b-moe-qlora.yaml` | Qwen3.5-122B-A10B | MoE, text-only QLoRA | — |
| `122b-a10b-moe-qlora-fsdp.yaml` | Qwen3.5-122B-A10B | MoE, text-only QLoRA + FSDP2 | — |

### Gated DeltaNet Linear Attention

Qwen3.5 interleaves standard attention with Gated DeltaNet linear attention layers. To apply LoRA to them, add to `lora_target_modules`:

```yaml
lora_target_modules:
  # ... standard projections ...
  - linear_attn.in_proj_qkv
  - linear_attn.in_proj_z
  - linear_attn.out_proj
```

### Routed Experts (MoE)

To apply LoRA to routed expert parameters, add `lora_target_parameters`:

```yaml
lora_target_parameters:
  - mlp.experts.gate_up_proj
  - mlp.experts.down_proj
#  - mlp.gate.weight  # router
```

### Shared Experts (MoE)

Routed experts and shared experts both have `gate_up_proj`/`down_proj`, so a plain module name in `lora_target_modules` would match both. Use a regex to target only attention and shared expert projections, while `lora_target_parameters` above handles routed experts separately:

```yaml
lora_target_modules: 'model\.(language_model\.)?layers\.[\d]+\.(mlp|self_attn)\.(shared_expert\.)?(up|down|gate|gate_up|q|k|v|o)_proj'
```

### TIPS

- For inference hyp, please see the respective model card details.
- You can run a full finetuning of smaller configs by removing `adapter: qlora` and `load_in_4bit: true`. See [Multi-GPU](#optimization-guides) below.
- Read more on loading your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
- For **multimodal** finetuning, set `processor_type: AutoProcessor`, `skip_prepare_dataset: true`, and `remove_unused_columns: false` as shown in `9b-lora-vision.yaml`.

## Optimization Guides

- [Optimizations Guide](https://docs.axolotl.ai/docs/optimizations.html)

## Related Resources

- [Qwen3.5 Blog](https://qwenlm.github.io/blog/qwen3.5/)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/seed-oss/README.md
================================================
# Finetune ByteDance's Seed-OSS with Axolotl

[Seed-OSS](https://huggingface.co/collections/ByteDance-Seed/seed-oss-68a609f4201e788db05b5dcd) are a series of 36B parameter open source models trained by ByteDance's Seed Team.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

## Getting started

1.  Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

    Here is an example of how to install from pip:
    ```bash
    # Ensure you have a compatible version of Pytorch installed
    pip3 install packaging setuptools wheel ninja
    pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'

    # Install Cut Cross Entropy
    python scripts/cutcrossentropy_install.py | sh
    ```

2. Run the finetuning example:

```bash
axolotl train examples/seed-oss/seed-oss-36b-qlora.yaml
```

This config uses about 27.7 GiB VRAM.

Let us know how it goes. Happy finetuning! 🚀

### TIPS

- For inference, the official Seed Team recommends `top_p=0.95` and `temperature=1.1`.
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [ByteDance Seed Website](https://seed.bytedance.com/)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/seed-oss/seed-oss-36b-qlora.yaml
================================================
base_model: ByteDance-Seed/Seed-OSS-36B-Instruct

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/slurm/README.md
================================================
# SLURM Multi-Node Training

This directory contains an example SLURM script for running Axolotl training jobs across multiple nodes in a SLURM cluster.

## Prerequisites

- Access to a SLURM cluster with GPU nodes
- Axolotl installed on all nodes (see [installation docs](https://docs.axolotl.ai/docs/installation.html))

## Usage

### Standard SLURM Clusters

1. Copy [`axolotl.slurm`](./axolotl.slurm) to your working directory.
2. Place your Axolotl config file (`train.yaml`) in the same directory.
3. Set the appropriate environment variables for the job:
    ```bash
    export HF_TOKEN="your-huggingface-token"

    # metric tracking
    # export WANDB_API_KEY="your-wandb-api-key"
    # ...
    ```
4. Submit the job:
   ```bash
   sbatch --export=ALL,NUM_NODES=2,NUM_TRAINERS=8,PRIMARY_ADDR=<master-node>,PRIMARY_PORT=29400 axolotl.slurm
   ```

   Where:
   - `NUM_NODES`: Number of nodes to use
   - `NUM_TRAINERS`: GPUs per node (typically 8)
   - `PRIMARY_ADDR`: Hostname/IP of the master node
   - `PRIMARY_PORT`: Port for distributed training (default: 29400)

5. (Optional) Run other slurm commands:
    ```bash
    # check job info
    scontrol show job axolotl-cli

    # check job queue
    squeue

    # check cluster status
    sinfo
    ```

### RunPod Instant Clusters

Axolotl works with RunPod Instant Clusters. This feature provides managed SLURM clusters with zero configuration.

1. **Deploy a SLURM Cluster**:
   - Go to [RunPod Instant Clusters](https://console.runpod.io/cluster)
   - Click "Create a Cluster"
   - Choose your GPU type, node count, and region
   - Choose an [Axolotl cloud docker image](https://docs.axolotl.ai/docs/docker.html#cloud)
   - Deploy the cluster

2. **Connect to the Controller Node**: Find the controller node in the RunPod console and connect via SSH

3. **Follow the instructions in [Standard SLURM Clusters](#standard-slurm-clusters)**

## Additional Resources

- [Axolotl Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [SLURM Documentation](https://slurm.schedmd.com/documentation.html)
- [RunPod SLURM Clusters Guide](https://docs.runpod.io/instant-clusters/slurm-clusters)


================================================
FILE: examples/slurm/axolotl.slurm
================================================
#!/bin/bash
# Prior to running this script, export your HF_TOKEN and WANDB_API_KEY to your environment; i.e.
# export HF_TOKEN="..."
# export WANDB_API_KEY="..."
#

# ---------- SBATCH commands ---------- #
#SBATCH --job-name=axolotl-slurm-multinode
#SBATCH --ntasks-per-node=1
#SBATCH --nodes=$NUM_NODES
#SBATCH --gpus-per-task=8
#SBATCH --cpus-per-task=128

export TORCH_DIST_INIT_BARRIER=0

srun axolotl preprocess train.yaml

srun axolotl train train.yaml --launcher torchrun -- \
    --nproc_per_node=$NUM_TRAINERS --nnodes=$NUM_NODES \
    --rdzv_id axolotl-cli --rdzv_backend c10d --rdzv_endpoint "${PRIMARY_ADDR}:${PRIMARY_PORT}" --rdzv-conf="join_timeout=1800"


================================================
FILE: examples/smolvlm2/README.md
================================================
# Finetune SmolVLM2 with Axolotl

[SmolVLM2](https://huggingface.co/collections/HuggingFaceTB/smolvlm2-smallest-video-lm-ever-67ab6b5e84bf8aaa60cb17c7) are a family of lightweight, open-source multimodal models from HuggingFace designed to analyze and understand video, image, and text content.

These models are built for efficiency, making them well-suited for on-device applications where computational resources are limited. Models are available in multiple sizes, including 2.2B, 500M, and 256M.

This guide shows how to fine-tune SmolVLM2 models with Axolotl.

## Getting Started

1.  Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

    Here is an example of how to install from pip:
    ```bash
    # Ensure you have a compatible version of Pytorch installed
    pip3 install packaging setuptools wheel ninja
    pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
    ```

2. Install an extra dependency:

    ```bash
    pip3 install num2words==0.5.14
    ```

3.  Run the finetuning example:

    ```bash
    # LoRA SFT (1x48GB @ 6.8GiB)
    axolotl train examples/smolvlm2/smolvlm2-2B-lora.yaml
    ```

## TIPS

- **Dataset Format**: For video finetuning, your dataset must be compatible with the multi-content Messages format. For more details, see our documentation on [Multimodal Formats](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).
- **Dataset Loading**: Read more on how to prepare and load your own datasets in our [documentation](https://docs.axolotl.ai/docs/dataset_loading.html).

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [SmolVLM2 Blog](https://huggingface.co/blog/smolvlm2)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/smolvlm2/smolvlm2-2B-lora.yaml
================================================
base_model: HuggingFaceTB/SmolVLM2-2.2B-Instruct
trust_remote_code: true
processor_type: AutoProcessor

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

datasets:
  - path: HuggingFaceH4/llava-instruct-mix-vsft
    type: chat_template
    split: train[:1%]
dataset_prepared_path: last_run_prepared
val_set_size: 0.0
output_dir: ./outputs/out

adapter: lora
lora_model_dir:

sequence_len: 8192
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'model.text_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 1
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
logging_steps: 1
flash_attention: true
eager_attention:

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/streaming/README.md
================================================
# Streaming Dataset Examples

This directory contains example configurations for using Axolotl's streaming dataset
functionality, which enables memory-efficient training with large datasets.

## Examples

Run the following examples with e.g. `axolotl train examples/streaming/sft.yaml`; no
`axolotl preprocess` required!

### Pretraining (`pretrain.yaml`)

Demonstrates streaming configuration for pretraining tasks using the fineweb-edu dataset
with SmolLM2-135M.

- Uses `pretraining_dataset` configuration for automatic streaming
- Multipack attention control to prevent cross-attention between packed sequences
- Buffer size configuration for memory management

### SFT (`sft.yaml`)

Shows how to use streaming for supervised fine-tuning with the Alpaca dataset.

- Explicit `streaming: true` flag for SFT datasets
- Memory-efficient training on instruction datasets
- Evaluation datasets are currently not streamed

## Key Configuration Options

### `streaming`
- Enables streaming mode for standard datasets
- Automatically enabled for `pretraining_dataset`

### `streaming_multipack_buffer_size`
- Controls buffer size for sample packing (default: 10,000)
- Larger values improve packing efficiency but use more memory
- Adjust based on available memory

### `shuffle_merged_datasets`
- Enables shuffling of streaming datasets
- Requires additional memory for shuffle buffer

### `sample_packing`
- Packs multiple samples into single sequences
- Minimize per-step padding tokens

## Performance Tips

- Download small / frequently-used datasets locally for better performance
- Larger buffer sizes improve packing efficiency


================================================
FILE: examples/streaming/pretrain.yaml
================================================
base_model: HuggingFaceTB/SmolLM2-135M

# Streaming pretraining configuration
pretraining_dataset:
  - path: HuggingFaceFW/fineweb-edu
    name: sample-10BT
    type: pretrain
    text_column: text
    split: train

# Streaming-specific settings
streaming_multipack_buffer_size: 10000
shuffle_merged_datasets: true

# Training configuration
max_steps: 1000
output_dir: ./outputs/smollm2-135m-pretrain-streaming

# Sequence and packing settings
sequence_len: 1024
sample_packing: true
pretrain_multipack_attn: true  # Prevent cross-attention between packed sequences
flash_attention: true

# Batch size settings
gradient_accumulation_steps: 8
micro_batch_size: 1

# Optimizer and scheduler
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 5e-4
warmup_ratio: 0.1
weight_decay: 0.01

# Precision and performance
bf16: auto
tf32: true

# Logging and checkpointing
logging_steps: 10
save_strategy: steps
save_steps: 250
save_total_limit: 3

# Weights & Biases (optional)
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

# Special tokens
special_tokens:
  pad_token: "<|endoftext|>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/streaming/sft.yaml
================================================
base_model: HuggingFaceTB/SmolLM2-135M

# Dataset configuration
datasets:
  - path: tatsu-lab/alpaca
    type: alpaca
    split: train

# Streaming-specific settings
streaming: true
streaming_multipack_buffer_size: 10000
shuffle_merged_datasets: true

# Training configuration
max_steps: 1000
output_dir: ./outputs/smollm2-135m-sft-streaming

# Sequence and packing settings
sequence_len: 1024
sample_packing: true
flash_attention: true

# Batch size settings
gradient_accumulation_steps: 4
micro_batch_size: 1

# Optimizer and scheduler
optimizer: adamw_torch
lr_scheduler: cosine
learning_rate: 2e-4
warmup_ratio: 0.1
weight_decay: 0.0

# Precision and performance
bf16: auto
tf32: true

# Logging and checkpointing
logging_steps: 10
save_strategy: steps
save_steps: 100
save_total_limit: 3

# Weights & Biases (optional)
wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

# Special tokens
special_tokens:
  pad_token: "<|endoftext|>"

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/swanlab/README.md
================================================
# SwanLab Integration Examples

This directory contains example configurations demonstrating SwanLab integration with Axolotl.

## Examples Overview

### 1. DPO with Completion Logging
**File**: `dpo-swanlab-completions.yml`

Demonstrates DPO (Direct Preference Optimization) training with RLHF completion table logging.

**Features**:
- Basic SwanLab experiment tracking
- Completion table logging (prompts, chosen/rejected responses, rewards)
- Memory-bounded buffer for long training runs
- Cloud sync configuration

**Best for**: RLHF practitioners who want to analyze model outputs qualitatively

**Quick start**:
```bash
export SWANLAB_API_KEY=your-api-key
accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-completions.yml
```

---

### 2. LoRA with Performance Profiling
**File**: `lora-swanlab-profiling.yml`

Demonstrates standard LoRA fine-tuning with performance profiling enabled.

**Features**:
- SwanLab experiment tracking
- Automatic profiling of trainer methods
- Profiling metrics visualization
- Performance optimization guidance

**Best for**: Engineers optimizing training performance and comparing different configurations

**Quick start**:
```bash
export SWANLAB_API_KEY=your-api-key
accelerate launch -m axolotl.cli.train examples/swanlab/lora-swanlab-profiling.yml
```

---

### 3. Full-Featured DPO Production Setup
**File**: `dpo-swanlab-full-featured.yml`

Comprehensive production-ready configuration with ALL SwanLab features enabled.

**Features**:
- Experiment tracking with team workspace
- RLHF completion logging
- Performance profiling
- Lark (Feishu) team notifications
- Private deployment support
- Production checklist and troubleshooting

**Best for**: Production RLHF training with team collaboration

**Quick start**:
```bash
export SWANLAB_API_KEY=your-api-key
export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/...
export SWANLAB_LARK_SECRET=your-webhook-secret
accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-full-featured.yml
```

---

### 4. Custom Trainer Profiling (Python)
**File**: `custom_trainer_profiling.py`

Python code examples showing how to add SwanLab profiling to custom trainers.

**Features**:
- `@swanlab_profile` decorator examples
- Context manager profiling for fine-grained timing
- `ProfilingConfig` for advanced filtering and throttling
- Multiple profiling patterns and best practices

**Best for**: Advanced users creating custom trainers

**Usage**:
```python
from custom_trainer_profiling import CustomTrainerWithProfiling
# See file for detailed examples and patterns
```

---

## Feature Matrix

| Example | Tracking | Completion Logging | Profiling | Lark Notifications | Team Workspace |
|---------|----------|-------------------|-----------|-------------------|----------------|
| dpo-swanlab-completions.yml | ✅ | ✅ | ✅ (auto) | ➖ (commented) | ➖ (commented) |
| lora-swanlab-profiling.yml | ✅ | ➖ (disabled) | ✅ (auto) | ➖ (commented) | ➖ (commented) |
| dpo-swanlab-full-featured.yml | ✅ | ✅ | ✅ (auto) | ✅ | ✅ |
| custom_trainer_profiling.py | N/A | N/A | ✅ (manual) | N/A | N/A |

---

## Configuration Quick Reference

### Basic SwanLab Setup
```yaml
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

use_swanlab: true
swanlab_project: my-project
swanlab_experiment_name: my-experiment
swanlab_mode: cloud  # cloud, local, offline, disabled
```

### RLHF Completion Logging
```yaml
swanlab_log_completions: true
swanlab_completion_log_interval: 100  # Log every 100 steps
swanlab_completion_max_buffer: 128    # Memory-bounded buffer
```

### Lark Team Notifications
```yaml
swanlab_lark_webhook_url: https://open.feishu.cn/...
swanlab_lark_secret: your-webhook-secret  # Required for production
```

### Team Workspace
```yaml
swanlab_workspace: my-research-team
```

### Private Deployment
```yaml
swanlab_web_host: https://swanlab.yourcompany.com
swanlab_api_host: https://api.swanlab.yourcompany.com
```

---

## Authentication

### Recommended: Environment Variable
```bash
export SWANLAB_API_KEY=your-api-key
export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/...
export SWANLAB_LARK_SECRET=your-webhook-secret
```

### Alternative: Config File (less secure)
```yaml
swanlab_api_key: your-api-key
swanlab_lark_webhook_url: https://open.feishu.cn/...
swanlab_lark_secret: your-webhook-secret
```

---

## Common Use Cases

### Use Case 1: Migrate from WandB to SwanLab
Start with `lora-swanlab-profiling.yml`, add your model/dataset config, disable WandB:
```yaml
use_swanlab: true
use_wandb: false
```

### Use Case 2: Analyze DPO Model Outputs
Use `dpo-swanlab-completions.yml`, adjust completion logging interval based on your training length:
```yaml
swanlab_completion_log_interval: 50   # More frequent for short training
swanlab_completion_log_interval: 200  # Less frequent for long training
```

### Use Case 3: Optimize Training Performance
Use `lora-swanlab-profiling.yml`, run multiple experiments with different optimizations:
- Baseline: `flash_attention: false, gradient_checkpointing: false`
- Flash Attention: `flash_attention: true`
- Gradient Checkpointing: `gradient_checkpointing: true`
- Both: `flash_attention: true, gradient_checkpointing: true`

Compare profiling metrics in SwanLab dashboard.

### Use Case 4: Production RLHF with Team Collaboration
Use `dpo-swanlab-full-featured.yml`, set up team workspace and Lark notifications:
```yaml
swanlab_workspace: ml-team
swanlab_lark_webhook_url: ...
swanlab_lark_secret: ...
```

---

## Viewing Your Experiments

### Cloud Mode
Visit [https://swanlab.cn](https://swanlab.cn) and navigate to your project.

**Dashboard sections**:
- **Metrics**: Training loss, learning rate, profiling metrics
- **Tables**: RLHF completions (for DPO/KTO/ORPO/GRPO)
- **Config**: Hyperparameters and configuration
- **System**: Resource usage (GPU, memory, CPU)
- **Files**: Logged artifacts

### Local Mode
```bash
swanlab watch ./swanlog
# Open browser to http://localhost:5092
```

---

## Troubleshooting

### SwanLab not initializing
```bash
# Check API key
echo $SWANLAB_API_KEY

# Verify SwanLab is installed
pip show swanlab

# Check config
grep -A 5 "use_swanlab" your-config.yml
```

### Completions not appearing
- Verify you're using an RLHF trainer (DPO/KTO/ORPO/GRPO)
- Check `swanlab_log_completions: true`
- Wait for `swanlab_completion_log_interval` steps
- Look for "Registered SwanLab RLHF completion logging" in logs

### Lark notifications not working
- Test webhook manually: `curl -X POST "$SWANLAB_LARK_WEBHOOK_URL" ...`
- Verify `SWANLAB_LARK_SECRET` is set correctly
- Check bot is added to Lark group chat
- Look for "Registered Lark notification callback" in logs

### Profiling metrics not appearing
- Verify `use_swanlab: true`
- Check SwanLab is initialized (look for init log message)
- Profiling metrics are under "profiling/" namespace
- Profiling auto-enabled when SwanLab is enabled

---

## Performance Notes

### Overhead Comparison

| Feature | Overhead per Step | Memory Usage |
|---------|------------------|--------------|
| Basic tracking | < 0.1% | ~10 MB |
| Completion logging | < 0.5% | ~64 KB (buffer=128) |
| Profiling | < 0.1% | ~1 KB |
| **Total** | **< 0.7%** | **~10 MB** |

### Best Practices
1. Use ONE logging tool in production (disable WandB/MLflow when using SwanLab)
2. Adjust completion log interval based on training length (100-200 steps)
3. Keep completion buffer size reasonable (128-512)
4. Profile critical path methods first (training_step, compute_loss)
5. Use ProfilingConfig to throttle high-frequency operations

---

## Further Reading

- **Full Documentation**: [src/axolotl/integrations/swanlab/README.md](../../src/axolotl/integrations/swanlab/README.md)
- **SwanLab Docs**: [https://docs.swanlab.cn](https://docs.swanlab.cn)
- **Axolotl Docs**: [https://axolotl-ai-cloud.github.io/axolotl/](https://axolotl-ai-cloud.github.io/axolotl/)
- **DPO Paper**: [Direct Preference Optimization](https://arxiv.org/abs/2305.18290)

---

## Contributing

Found an issue or have an improvement? Please submit a PR or open an issue:
- [Axolotl Issues](https://github.com/axolotl-ai-cloud/axolotl/issues)
- [SwanLab Issues](https://github.com/SwanHubX/SwanLab/issues)


================================================
FILE: examples/swanlab/custom_trainer_profiling.py
================================================
"""Example: Custom Trainer with SwanLab Profiling

This example demonstrates how to add SwanLab profiling to your custom trainer.

Features:
- @swanlab_profile decorator for automatic profiling
- swanlab_profiling_context for fine-grained profiling
- ProfilingConfig for advanced filtering and throttling

Usage:
    1. Create your custom trainer extending AxolotlTrainer
    2. Add @swanlab_profile decorators to methods you want to profile
    3. Use swanlab_profiling_context for fine-grained profiling within methods
    4. Enable SwanLab in your config (use_swanlab: true)

See also:
    - examples/swanlab/lora-swanlab-profiling.yml for config
    - src/axolotl/integrations/swanlab/profiling.py for implementation
"""

from axolotl.core.trainers.base import AxolotlTrainer
from axolotl.integrations.swanlab.profiling import (
    ProfilingConfig,
    swanlab_profile,
    swanlab_profiling_context,
    swanlab_profiling_context_advanced,
)


class CustomTrainerWithProfiling(AxolotlTrainer):
    """Custom trainer with SwanLab profiling enabled.

    This trainer demonstrates three profiling patterns:
    1. Decorator-based profiling (@swanlab_profile)
    2. Context manager profiling (swanlab_profiling_context)
    3. Advanced profiling with filtering (ProfilingConfig)
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Create custom profiling config for high-frequency operations
        self.fast_op_config = ProfilingConfig(
            enabled=True,
            min_duration_ms=0.5,  # Only log if duration > 0.5ms
            log_interval=50,  # Log every 50th call
        )

    # ========================================================================
    # Pattern 1: Decorator-based Profiling
    # ========================================================================
    # Best for: Methods you always want to profile
    # Overhead: ~2-5 microseconds per call (negligible)

    @swanlab_profile
    def training_step(self, model, inputs):
        """Main training step - always profile.

        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.training_step
        """
        return super().training_step(model, inputs)

    @swanlab_profile
    def compute_loss(self, model, inputs, return_outputs=False):
        """Loss computation - always profile.

        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.compute_loss
        """
        return super().compute_loss(model, inputs, return_outputs)

    @swanlab_profile
    def prediction_step(self, model, inputs, prediction_loss_only, ignore_keys=None):
        """Prediction step - always profile.

        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.prediction_step
        """
        return super().prediction_step(model, inputs, prediction_loss_only, ignore_keys)

    # ========================================================================
    # Pattern 2: Fine-grained Context Manager Profiling
    # ========================================================================
    # Best for: Profiling specific code blocks within a method
    # Use case: When you want to profile forward vs backward separately

    def complex_training_step(self, model, inputs):
        """Training step with fine-grained profiling.

        Profiling metrics:
        - profiling/Time taken: CustomTrainerWithProfiling.forward_pass
        - profiling/Time taken: CustomTrainerWithProfiling.backward_pass
        - profiling/Time taken: CustomTrainerWithProfiling.optimizer_step
        """
        # Profile just the forward pass
        with swanlab_profiling_context(self, "forward_pass"):
            outputs = model(**inputs)
            loss = outputs.loss

        # Profile just the backward pass
        with swanlab_profiling_context(self, "backward_pass"):
            loss.backward()

        # Profile optimizer step
        with swanlab_profiling_context(self, "optimizer_step"):
            self.optimizer.step()
            self.optimizer.zero_grad()

        return outputs

    # ========================================================================
    # Pattern 3: Advanced Profiling with Filtering
    # ========================================================================
    # Best for: High-frequency operations where you want to throttle logging
    # Use case: Methods called 100+ times per step

    def _prepare_inputs(self, inputs):
        """Prepare inputs - throttled profiling.

        This method is called frequently (once per batch), so we throttle
        profiling to reduce overhead:
        - Only log if duration > 0.5ms (skip very fast operations)
        - Only log every 50th call (reduce logging frequency)

        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.prepare_inputs
        """
        with swanlab_profiling_context_advanced(
            self, "prepare_inputs", config=self.fast_op_config
        ):
            return super()._prepare_inputs(inputs)

    def _prepare_input_for_model(self, input_ids):
        """Another high-frequency operation - throttled profiling.

        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.prepare_input_for_model
        """
        with swanlab_profiling_context_advanced(
            self, "prepare_input_for_model", config=self.fast_op_config
        ):
            # Your custom input preparation logic
            return input_ids

    # ========================================================================
    # Pattern 4: Exception-safe Profiling
    # ========================================================================
    # Profiling is exception-safe: duration is logged even if method raises

    @swanlab_profile
    def potentially_failing_method(self):
        """This method may raise an exception.

        SwanLab profiling will still log the duration before re-raising.
        Profiling metric: profiling/Time taken: CustomTrainerWithProfiling.potentially_failing_method
        """
        # Do some work
        result = self._do_risky_computation()

        # If this raises, profiling duration is still logged
        if result < 0:
            raise ValueError("Invalid result")

        return result

    def _do_risky_computation(self):
        """Placeholder for risky computation."""
        return 42


# ============================================================================
# Advanced Example: Custom ProfilingConfig Per Method
# ============================================================================


class AdvancedProfilingTrainer(AxolotlTrainer):
    """Trainer with method-specific profiling configurations."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Different profiling configs for different method types
        self.critical_path_config = ProfilingConfig(
            enabled=True,
            min_duration_ms=0.0,  # Log everything on critical path
            log_interval=1,  # Log every call
        )

        self.fast_path_config = ProfilingConfig(
            enabled=True,
            min_duration_ms=1.0,  # Only log if > 1ms
            log_interval=100,  # Log every 100th call
        )

        self.debug_config = ProfilingConfig(
            enabled=True,
            min_duration_ms=0.0,  # Log everything
            log_interval=1,  # Log every call
        )

    def training_step(self, model, inputs):
        """Critical path - log everything."""
        with swanlab_profiling_context_advanced(
            self, "training_step", config=self.critical_path_config
        ):
            return super().training_step(model, inputs)

    def _prepare_inputs(self, inputs):
        """Fast path - throttle logging."""
        with swanlab_profiling_context_advanced(
            self, "prepare_inputs", config=self.fast_path_config
        ):
            return super()._prepare_inputs(inputs)

    def _debug_method(self, data):
        """Debug-only method - verbose logging."""
        with swanlab_profiling_context_advanced(
            self, "debug_method", config=self.debug_config
        ):
            # Your debug logic
            pass


# ============================================================================
# How to Use This Custom Trainer
# ============================================================================

"""
To use this custom trainer:

1. Save this file to your project (e.g., my_custom_trainer.py)

2. Create a config file that uses your custom trainer:

    # config.yml
    base_model: NousResearch/Llama-3.2-1B

    # ... other config ...

    plugins:
      - axolotl.integrations.swanlab.SwanLabPlugin

    use_swanlab: true
    swanlab_project: my-profiling-experiment

    # Optional: Specify custom trainer
    # (Or modify axolotl to use your custom trainer class)

3. Run training:

    export SWANLAB_API_KEY=your-api-key
    accelerate launch -m axolotl.cli.train config.yml

4. View profiling metrics in SwanLab dashboard:
   - profiling/Time taken: CustomTrainerWithProfiling.training_step
   - profiling/Time taken: CustomTrainerWithProfiling.forward_pass
   - profiling/Time taken: CustomTrainerWithProfiling.backward_pass
   - etc.

5. Compare profiling metrics across runs:
   - Run baseline without optimizations
   - Run with flash_attention enabled
   - Run with gradient_checkpointing enabled
   - Compare profiling metrics to see performance impact
"""

# ============================================================================
# Tips for Effective Profiling
# ============================================================================

"""
1. Profile the critical path first:
   - training_step, compute_loss, prediction_step
   - These methods are called most frequently and have biggest impact

2. Use throttling for high-frequency operations:
   - Methods called 100+ times per step
   - Use log_interval=50 or log_interval=100
   - Reduces profiling overhead and dashboard clutter

3. Filter noise with min_duration_ms:
   - Set min_duration_ms=1.0 to skip very fast operations
   - Focus on operations that actually take time

4. Compare across runs:
   - Run same config multiple times to check consistency
   - Compare different optimization strategies
   - Track profiling trends over time

5. Monitor distributed training:
   - Check for per-rank timing differences
   - Look for stragglers (slower ranks)
   - Identify synchronization bottlenecks

6. Disable profiling in production:
   - from axolotl.integrations.swanlab.profiling import DEFAULT_PROFILING_CONFIG
   - DEFAULT_PROFILING_CONFIG.enabled = False

7. Exception handling:
   - Profiling is exception-safe
   - Duration logged even if method raises
   - Useful for debugging methods that fail intermittently
"""


================================================
FILE: examples/swanlab/dpo-swanlab-completions.yml
================================================
# SwanLab DPO Training Example with Completion Logging
#
# This example demonstrates DPO (Direct Preference Optimization) training
# with SwanLab integration for experiment tracking and completion table logging.
#
# Features enabled:
# - SwanLab experiment tracking
# - RLHF completion table logging (prompts, chosen/rejected responses, rewards)
# - Lark (Feishu) team notifications (optional)
#
# To run:
#   export SWANLAB_API_KEY=your-api-key
#   accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-completions.yml

# Model Configuration
base_model: meta-llama/Meta-Llama-3-8B-Instruct
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer

special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot_id|>

# Quantization
load_in_8bit: true
load_in_4bit: false

# LoRA Configuration
adapter: lora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true

# DPO Configuration
chat_template: llama3
rl: dpo

datasets:
  - path: fozziethebeat/alpaca_messages_2k_dpo_test
    type: chat_template.default
    field_messages: conversation
    field_chosen: chosen
    field_rejected: rejected
    message_property_mappings:
      role: role
      content: content
    roles:
      system:
        - system
      user:
        - user
      assistant:
        - assistant

# Dataset and Output
dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/dpo-swanlab-out

# Training Configuration
sequence_len: 4096
sample_packing: false
micro_batch_size: 2
gradient_accumulation_steps: 4
num_epochs: 4

# Optimization
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
warmup_ratio: 0.1
weight_decay: 0.0

# Precision
bf16: auto
tf32: false

# Performance
gradient_checkpointing: true
flash_attention: true

# Checkpointing and Logging
logging_steps: 1
evals_per_epoch: 4
saves_per_epoch: 1

# ============================================================================
# SwanLab Integration
# ============================================================================

plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

# Basic SwanLab Configuration
use_swanlab: true
swanlab_project: dpo-training
swanlab_experiment_name: llama-3-dpo-completions-demo
swanlab_description: "DPO training with completion table logging"
swanlab_mode: cloud  # Options: cloud, local, offline, disabled

# SwanLab Authentication
# Recommended: Set via environment variable
#   export SWANLAB_API_KEY=your-api-key
# Or set in config (less secure):
# swanlab_api_key: your-api-key

# Optional: Team workspace
# swanlab_workspace: my-research-team

# ============================================================================
# RLHF Completion Table Logging
# ============================================================================
#
# Automatically logs model completions to SwanLab for qualitative analysis:
# - Prompts from your DPO dataset
# - Chosen responses (preferred)
# - Rejected responses (non-preferred)
# - Reward differences
#
# View the table in SwanLab dashboard under "rlhf_completions"

swanlab_log_completions: true
swanlab_completion_log_interval: 100  # Log every 100 training steps
swanlab_completion_max_buffer: 128    # Keep last 128 completions in memory

# Memory Usage Notes:
# - Buffer size 128: ~64 KB (default, recommended)
# - Buffer size 512: ~256 KB (for more historical completions)
# - Buffer size 1024: ~512 KB (maximum for very long training runs)

# Performance Notes:
# - Completion logging overhead: < 0.5% per training step
# - Only logs every N steps to minimize impact
# - Memory-bounded buffer prevents memory leaks

# ============================================================================
# Optional: Lark (Feishu) Team Notifications
# ============================================================================
#
# Get real-time training notifications in your team chat
# Uncomment to enable:

# swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx
# swanlab_lark_secret: your-webhook-secret  # Recommended for production

# Notifications sent for:
# - Training start
# - Training completion
# - Training errors
# - Metric milestones (if configured)

# ============================================================================
# Optional: Private SwanLab Deployment
# ============================================================================
#
# For enterprise users with private SwanLab deployment:

# swanlab_web_host: https://swanlab.yourcompany.com
# swanlab_api_host: https://api.swanlab.yourcompany.com

# ============================================================================
# Disable WandB if you're migrating from it
# ============================================================================

# wandb_project:
# wandb_entity:
# use_wandb: false


================================================
FILE: examples/swanlab/dpo-swanlab-full-featured.yml
================================================
# SwanLab Full-Featured DPO Training Example
#
# This example demonstrates ALL SwanLab integration features:
# - Experiment tracking with cloud sync
# - RLHF completion table logging
# - Performance profiling
# - Lark (Feishu) team notifications
# - Team workspace collaboration
#
# Use this as a reference for production RLHF training setups.
#
# To run:
#   export SWANLAB_API_KEY=your-api-key
#   export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/...
#   export SWANLAB_LARK_SECRET=your-webhook-secret
#   accelerate launch -m axolotl.cli.train examples/swanlab/dpo-swanlab-full-featured.yml

# ============================================================================
# Model Configuration
# ============================================================================

base_model: meta-llama/Meta-Llama-3-8B-Instruct
model_type: LlamaForCausalLM
tokenizer_type: AutoTokenizer

special_tokens:
  pad_token: <|finetune_right_pad_id|>
  eos_token: <|eot_id|>

# Quantization for efficient training
load_in_8bit: true
load_in_4bit: false

# ============================================================================
# LoRA Configuration
# ============================================================================

adapter: lora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true  # Target all linear layers

# ============================================================================
# DPO (Direct Preference Optimization) Configuration
# ============================================================================

chat_template: llama3
rl: dpo  # Enable DPO trainer

datasets:
  - path: fozziethebeat/alpaca_messages_2k_dpo_test
    type: chat_template.default
    field_messages: conversation
    field_chosen: chosen
    field_rejected: rejected
    message_property_mappings:
      role: role
      content: content
    roles:
      system:
        - system
      user:
        - user
      assistant:
        - assistant

# ============================================================================
# Dataset and Output Configuration
# ============================================================================

dataset_prepared_path:
val_set_size: 0.05
output_dir: ./outputs/dpo-swanlab-full-featured-out

# ============================================================================
# Training Configuration
# ============================================================================

sequence_len: 4096
sample_packing: false

micro_batch_size: 2
gradient_accumulation_steps: 4
num_epochs: 4

# ============================================================================
# Optimization
# ============================================================================

optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002
warmup_ratio: 0.1
weight_decay: 0.0

# ============================================================================
# Precision and Performance
# ============================================================================

bf16: auto
tf32: false

gradient_checkpointing: true
flash_attention: true

# ============================================================================
# Checkpointing and Logging
# ============================================================================

logging_steps: 1
evals_per_epoch: 4
saves_per_epoch: 1

# ============================================================================
# SwanLab Integration - Full Configuration
# ============================================================================

plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

# ------------------------------------------------------------------------------
# Basic SwanLab Configuration
# ------------------------------------------------------------------------------

use_swanlab: true
swanlab_project: dpo-production
swanlab_experiment_name: llama-3-dpo-full-featured-v1
swanlab_description: |
  Production DPO training with all SwanLab features enabled:
  - Completion table logging for qualitative analysis
  - Performance profiling for optimization
  - Lark notifications for team collaboration

swanlab_mode: cloud  # Options: cloud, local, offline, disabled

# ------------------------------------------------------------------------------
# Team Collaboration
# ------------------------------------------------------------------------------

# Workspace for team collaboration (shared experiments)
swanlab_workspace: ml-research-team

# Authentication (recommended: use environment variable)
#   export SWANLAB_API_KEY=your-api-key
# Or set in config (less secure):
# swanlab_api_key: your-api-key

# ------------------------------------------------------------------------------
# RLHF Completion Table Logging
# ------------------------------------------------------------------------------
# Automatically logs model completions for qualitative analysis:
# - Prompts from your DPO dataset
# - Chosen responses (preferred)
# - Rejected responses (non-preferred)
# - Reward differences
#
# View in SwanLab dashboard under "rlhf_completions" table

swanlab_log_completions: true
swanlab_completion_log_interval: 100  # Log every 100 steps
swanlab_completion_max_buffer: 256    # Larger buffer for long training runs

# Buffer size recommendations:
# - 128: Default, ~64 KB memory (recommended for most cases)
# - 256: ~128 KB memory (this config, good for longer training)
# - 512: ~256 KB memory (maximum for very long runs)

# ------------------------------------------------------------------------------
# Lark (Feishu) Team Notifications
# ------------------------------------------------------------------------------
# Get real-time training notifications in your team chat
#
# Notifications sent for:
# - Training start
# - Training completion
# - Training errors
# - Metric milestones (if configured)

# Recommended: Set via environment variables
#   export SWANLAB_LARK_WEBHOOK_URL=https://open.feishu.cn/...
#   export SWANLAB_LARK_SECRET=your-webhook-secret

# Or set in config (less secure):
# swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx
# swanlab_lark_secret: your-webhook-secret  # REQUIRED for production

# Security note: ALWAYS use swanlab_lark_secret in production to prevent
# unauthorized parties from sending fake notifications to your team chat.

# ------------------------------------------------------------------------------
# Performance Profiling
# ------------------------------------------------------------------------------
# Profiling is automatically enabled when SwanLab is enabled.
# Metrics logged to SwanLab under "profiling/" namespace:
#   profiling/Time taken: AxolotlTrainer.training_step
#   profiling/Time taken: AxolotlTrainer.compute_loss
#   profiling/Time taken: AxolotlTrainer.prediction_step
#
# Use these metrics to:
# - Identify bottlenecks in training loop
# - Compare performance across different configurations
# - Monitor performance regressions over time
# - Debug unexpected slowdowns

# For custom profiling in your own trainer, see:
#   examples/swanlab/custom_trainer_profiling.py

# ------------------------------------------------------------------------------
# Optional: Private SwanLab Deployment
# ------------------------------------------------------------------------------
# For enterprise users with private SwanLab deployment:

# swanlab_web_host: https://swanlab.yourcompany.com
# swanlab_api_host: https://api.swanlab.yourcompany.com

# ------------------------------------------------------------------------------
# Optional: Model Checkpointing to SwanLab
# ------------------------------------------------------------------------------
# Log model checkpoints to SwanLab (coming soon)

swanlab_log_model: false

# ============================================================================
# Disable Other Logging Tools (Recommended)
# ============================================================================
# Using multiple logging tools simultaneously can impact performance:
# - Expected overhead: ~1-2% per logger
# - Potential config/callback conflicts
#
# For production training, use ONLY SwanLab:

# wandb_project:
# use_wandb: false
#
# use_mlflow: false
#
# use_comet: false

# ============================================================================
# Expected Training Behavior
# ============================================================================

# With this configuration, you should see:
#
# 1. SwanLab Initialization (rank 0 only):
#    INFO: SwanLab initialized for project: dpo-production
#    INFO: SwanLab experiment: llama-3-dpo-full-featured-v1
#    INFO: SwanLab mode: cloud
#    INFO: SwanLab workspace: ml-research-team
#
# 2. Completion Logging (rank 0 only):
#    INFO: Registered SwanLab RLHF completion logging callback for DPOTrainer
#          (log_interval=100, max_buffer=256)
#
# 3. Lark Notifications (rank 0 only):
#    INFO: Registered Lark notification callback with HMAC authentication
#
# 4. Distributed Training Detection (if multi-GPU):
#    INFO: Distributed training detected (world_size=N)
#    INFO: Only rank 0 will initialize SwanLab
#    INFO: Other ranks will skip SwanLab to avoid conflicts
#
# 5. Training Start Notification (Lark):
#    Your team chat receives: "Training started: llama-3-dpo-full-featured-v1"
#
# 6. Periodic Completion Logging:
#    Every 100 steps, completion table is updated in SwanLab dashboard
#
# 7. Training Complete Notification (Lark):
#    Your team chat receives: "Training completed: llama-3-dpo-full-featured-v1"
#    With link to SwanLab dashboard and final metrics
#
# 8. SwanLab Dashboard Shows:
#    - Training metrics (loss, learning rate, etc.)
#    - Completion table (rlhf_completions)
#    - Profiling metrics (profiling/Time taken: ...)
#    - Hyperparameters and configuration
#    - System resource usage

# ============================================================================
# Production Checklist
# ============================================================================

# Before deploying to production, verify:
# ✅ SwanLab API key is set via environment variable (not in config)
# ✅ Lark webhook secret is set (required for HMAC authentication)
# ✅ Workspace is set to your team's workspace
# ✅ Experiment name is descriptive and unique
# ✅ Only SwanLab is enabled (other loggers disabled)
# ✅ Completion logging buffer size is appropriate for your training duration
# ✅ Private deployment hosts are set (if using enterprise SwanLab)
# ✅ Test run completes successfully and shows up in SwanLab dashboard
# ✅ Lark notifications are received in team chat
# ✅ Profiling metrics are logged correctly

# ============================================================================
# Troubleshooting
# ============================================================================

# If SwanLab initialization fails:
# 1. Check SWANLAB_API_KEY environment variable is set
# 2. Verify swanlab_project is set in config
# 3. Check swanlab_mode is valid (cloud/local/offline/disabled)
# 4. Verify internet connectivity (for cloud mode)

# If Lark notifications not received:
# 1. Check SWANLAB_LARK_WEBHOOK_URL is set correctly
# 2. Verify SWANLAB_LARK_SECRET matches your Lark bot settings
# 3. Test webhook manually: curl -X POST "$SWANLAB_LARK_WEBHOOK_URL" ...
# 4. Check training logs for "Registered Lark notification callback"
# 5. Verify bot is added to the target Lark group chat

# If completions not appearing in SwanLab:
# 1. Verify you're using an RLHF trainer (DPO/KTO/ORPO/GRPO)
# 2. Check swanlab_log_completions is true
# 3. Wait for log_interval steps (default: 100)
# 4. Check training logs for "Registered SwanLab RLHF completion logging"

# If profiling metrics not appearing:
# 1. Verify use_swanlab is true
# 2. Check SwanLab is initialized (check logs)
# 3. Look under "profiling/" namespace in dashboard
# 4. Profiling may be disabled if DEFAULT_PROFILING_CONFIG.enabled = False

# For more help:
# - SwanLab docs: https://docs.swanlab.cn
# - Axolotl SwanLab integration: src/axolotl/integrations/swanlab/README.md
# - GitHub issues: https://github.com/axolotl-ai-cloud/axolotl/issues


================================================
FILE: examples/swanlab/lora-swanlab-profiling.yml
================================================
# SwanLab LoRA Training Example with Performance Profiling
#
# This example demonstrates standard LoRA fine-tuning with SwanLab integration
# for performance profiling and optimization.
#
# Features enabled:
# - SwanLab experiment tracking
# - Performance profiling (training step, forward/backward pass timing)
# - Real-time metrics visualization
#
# To run:
#   export SWANLAB_API_KEY=your-api-key
#   accelerate launch -m axolotl.cli.train examples/swanlab/lora-swanlab-profiling.yml

# Model Configuration
base_model: NousResearch/Llama-3.2-1B

# Dataset Configuration
datasets:
  - path: teknium/GPT4-LLM-Cleaned
    type: alpaca

val_set_size: 0.1
output_dir: ./outputs/lora-swanlab-profiling-out

# LoRA Configuration
adapter: lora
lora_r: 16
lora_alpha: 32
lora_dropout: 0.05
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

# Training Configuration
sequence_len: 2048
sample_packing: true
eval_sample_packing: true

micro_batch_size: 2
gradient_accumulation_steps: 2
num_epochs: 1

# Optimization
optimizer: adamw_8bit
lr_scheduler: cosine
learning_rate: 0.0002
warmup_ratio: 0.1
weight_decay: 0.0

# Precision
bf16: auto
tf32: false

# Performance
gradient_checkpointing: true
flash_attention: true

# Checkpointing and Logging
logging_steps: 1
evals_per_epoch: 4
saves_per_epoch: 1

# Loss Monitoring
loss_watchdog_threshold: 5.0
loss_watchdog_patience: 3

special_tokens:
  pad_token: "<|end_of_text|>"

# ============================================================================
# SwanLab Integration
# ============================================================================

plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

# Basic SwanLab Configuration
use_swanlab: true
swanlab_project: lora-profiling
swanlab_experiment_name: llama-3.2-1b-profiling-demo
swanlab_description: "LoRA fine-tuning with performance profiling"
swanlab_mode: cloud  # Options: cloud, local, offline, disabled

# SwanLab Authentication
# Recommended: Set via environment variable
#   export SWANLAB_API_KEY=your-api-key
# Or set in config (less secure):
# swanlab_api_key: your-api-key

# Optional: Team workspace
# swanlab_workspace: my-ml-team

# ============================================================================
# Performance Profiling
# ============================================================================
#
# SwanLab automatically profiles trainer methods when enabled.
# Profiling metrics appear in SwanLab dashboard under "profiling/" namespace.
#
# Built-in profiling:
# - Minimal overhead (< 0.1% per step)
# - High-precision timing (microsecond accuracy)
# - Exception-safe (logs duration even if method fails)
#
# View profiling metrics in SwanLab dashboard:
#   profiling/Time taken: AxolotlTrainer.training_step
#   profiling/Time taken: AxolotlTrainer.compute_loss
#   profiling/Time taken: AxolotlTrainer.prediction_step
#
# For custom profiling in your own trainer, see:
#   examples/swanlab/custom_trainer_profiling.py

# Completion logging is disabled for non-RLHF trainers
swanlab_log_completions: false  # Only works with DPO/KTO/ORPO/GRPO

# ============================================================================
# Optional: Compare with Multiple Runs
# ============================================================================
#
# To compare profiling metrics across different configurations:
#
# 1. Run baseline without flash attention:
#    swanlab_experiment_name: llama-3.2-1b-no-flash-attn
#    flash_attention: false
#
# 2. Run with gradient checkpointing:
#    swanlab_experiment_name: llama-3.2-1b-grad-checkpoint
#    gradient_checkpointing: true
#
# 3. Run with both:
#    swanlab_experiment_name: llama-3.2-1b-optimized
#    flash_attention: true
#    gradient_checkpointing: true
#
# Then compare profiling metrics in SwanLab dashboard to see performance impact

# ============================================================================
# Optional: Lark (Feishu) Team Notifications
# ============================================================================
#
# Get notified when profiling experiments complete:

# swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx
# swanlab_lark_secret: your-webhook-secret

# ============================================================================
# Profiling Best Practices
# ============================================================================
#
# 1. Run multiple epochs to see profiling trends over time
# 2. Ignore first ~10 steps (warmup period, slower)
# 3. Look for outliers (steps that take significantly longer)
# 4. Compare profiling metrics before/after optimization changes
# 5. Monitor per-rank profiling in distributed training
#
# Common bottlenecks to profile:
# - training_step: Overall step time (should be consistent)
# - compute_loss: Loss computation (scales with sequence length)
# - prediction_step: Evaluation time (can be slow for large val sets)
#
# If you see inconsistent timing:
# - Check for data loading bottlenecks
# - Monitor GPU utilization (may be CPU-bound)
# - Check for gradient accumulation effects
# - Verify CUDA kernel synchronization

# ============================================================================
# Disable WandB if you're migrating from it
# ============================================================================

# wandb_project:
# use_wandb: false


================================================
FILE: examples/trinity/README.md
================================================
# Finetune ArceeAI's Trinity with Axolotl

[Trinity](https://huggingface.co/collections/arcee-ai/trinity) is a family of open weight MoE models trained by Arcee.ai.

This guide shows how to fine-tune it with Axolotl with multi-turn conversations and proper masking.

## Getting started

1. Install Axolotl following the main from the [installation guide](https://docs.axolotl.ai/docs/installation.html#sec-edge-build).

2. Install [Cut Cross Entropy](https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy) to reduce training VRAM usage.

3. Run the finetuning example:

    ```bash
    axolotl train examples/trinity/trinity-nano-preview-qlora.yaml
    ```

This config uses about 24.9 GiB VRAM (w/o CCE).

Let us know how it goes. Happy finetuning! 🚀

### TIPS

- For inference, the official Arcee.ai team recommends `top_p: 0.75`, `temperature: 0.15`, `top_k: 50`, and `min_p: 0.06`.
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).

## Optimization Guides

Please check the [Optimizations doc](https://docs.axolotl.ai/docs/optimizations.html).

## Related Resources

- [Trinity Blog](https://www.arcee.ai/blog/the-trinity-manifesto)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)


================================================
FILE: examples/trinity/trinity-nano-preview-qlora.yaml
================================================
base_model: arcee-ai/Trinity-Nano-Preview
revision_of_model: 2ee94b0

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# CCE - N/A as of now
# plugins:
#   - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

datasets:
  - path: fozziethebeat/alpaca_messages_2k_test
    type: chat_template

dataset_prepared_path: last_run_prepared
val_set_size: 0.1
output_dir: ./outputs/lora-out

adapter: qlora
lora_model_dir:

sequence_len: 2048
sample_packing: true

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_linear: true
lora_target_modules:
  - gate_proj
  - down_proj
  - up_proj
  - q_proj
  - v_proj
  - k_proj
  - o_proj

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: false

gradient_checkpointing: true
resume_from_checkpoint:
logging_steps: 1
# flash_attention: true  # Not supported
sdp_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1

# save_first_step: true  # uncomment this to validate checkpoint saving works with your config


================================================
FILE: examples/voxtral/README.md
================================================
# Finetune Voxtral with Axolotl

Voxtral is a [3B](https://huggingface.co/mistralai/Voxtral-Mini-3B-2507)/[24B](https://huggingface.co/mistralai/Voxtral-Small-24B-2507) parameter opensource model from MistralAI found on HuggingFace. This guide shows how to fine-tune it with Axolotl.

Thanks to the team at MistralAI for giving us early access to prepare for this release.

## Getting started

1. Install Axolotl following the [installation guide](https://docs.axolotl.ai/docs/installation.html).

    Here is an example of how to install from pip:

```bash
# Ensure you have Pytorch installed (Pytorch 2.6.0 min)
pip3 install packaging==26.0 setuptools==75.8.0 wheel ninja
pip3 install --no-build-isolation 'axolotl[flash-attn]>=0.12.0'
```

2. Please install the below.

```bash
# audio
pip3 install librosa==0.11.0
pip3 install 'mistral_common[audio]==1.8.3'

# Install CCE https://docs.axolotl.ai/docs/custom_integrations.html#cut-cross-entropy
python scripts/cutcrossentropy_install.py | sh
```

3. Download sample dataset files

```bash
# for text + audio only
wget https://huggingface.co/datasets/Nanobit/text-audio-2k-test/resolve/main/En-us-African_elephant.oga
```

4. Run the finetuning example:

```bash
# text only
axolotl train examples/voxtral/voxtral-mini-qlora.yml

# text + audio
axolotl train examples/voxtral/voxtral-mini-audio-qlora.yml
```

These configs use about 4.8 GB VRAM.

Let us know how it goes. Happy finetuning! 🚀

### TIPS

- For inference, the official MistralAI team recommends `temperature: 0.2` and `top_p: 0.95` for audio understanding and `temperature: 0.0` for transcription.
- You can run a full finetuning by removing the `adapter: qlora` and `load_in_4bit: true` from the config.
- Read more on how to load your own dataset at [docs](https://docs.axolotl.ai/docs/dataset_loading.html).
- The text dataset format follows the OpenAI Messages format as seen [here](https://docs.axolotl.ai/docs/dataset-formats/conversation.html#chat_template).
- The multimodal dataset format follows the OpenAI multi-content Messages format as seen [here](https://docs.axolotl.ai/docs/multimodal.html#dataset-format).


## Optimization Guides

- [Multi-GPU Training](https://docs.axolotl.ai/docs/multi-gpu.html)
- [Multi-Node Training](https://docs.axolotl.ai/docs/multi-node.html)
- [LoRA Optimizations](https://docs.axolotl.ai/docs/lora_optims.html)

## Limitations

We only support the `mistral-common` tokenizer for Supervised Fine-tuning at the moment and for `type: chat_template` only.

In addition, we do not support overriding tokens yet.

## Related Resources

- [MistralAI Magistral Blog](https://mistral.ai/news/magistral/)
- [Axolotl Docs](https://docs.axolotl.ai)
- [Axolotl Website](https://axolotl.ai)
- [Axolotl GitHub](https://github.com/axolotl-ai-cloud/axolotl)
- [Axolotl Discord](https://discord.gg/7m9sfhzaf3)

## Future Work

- Add parity to Preference Tuning, RL, etc.
- Add parity to other tokenizer configs like overriding tokens.


================================================
FILE: examples/voxtral/voxtral-mini-audio-qlora.yml
================================================
base_model: mistralai/Voxtral-Mini-3B-2507
processor_type: VoxtralProcessor

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

# for use with fft to only train on language model layers
# unfrozen_parameters:
  # - language_model.model.*
  # - lm_head
  # - embed_tokens

load_in_4bit: true

# these 3 lines are needed for now to handle vision chat templates w images
skip_prepare_dataset: true
remove_unused_columns: false
sample_packing: false

# gemma3 doesn't seem to play nice with ddp
ddp_find_unused_parameters: true

eot_tokens:
  - <end_of_turn>

# sample dataset below requires downloading audio/image in advance
# wget https://huggingface.co/datasets/Nanobit/text-audio-2k-test/resolve/main/En-us-African_elephant.oga
datasets:
  - path: NanoBit/text-audio-2k-test
    type: chat_template
dataset_prepared_path:
val_set_size: 0.01
output_dir: ./outputs/out

adapter: qlora
lora_model_dir:

sequence_len: 2048
pad_to_sequence_len: false

lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 4
micro_batch_size: 2
num_epochs: 1
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: true
fp16:
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch: 1
saves_per_epoch: 1
weight_decay: 0.0


================================================
FILE: examples/voxtral/voxtral-mini-qlora.yml
================================================
base_model: mistralai/Voxtral-Mini-3B-2507

# Automatically upload checkpoint and final model to HF
# hub_model_id: username/custom_model_name

# Enable to use mistral-common tokenizer
tokenizer_use_mistral_common: true

plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

load_in_8bit: false
load_in_4bit: true

# for use with fft to only train on language model layers
# unfrozen_parameters:
  # - language_model.model.*
  # - lm_head
  # - embed_tokens

eot_tokens:
  - <end_of_turn>
datasets:
  - path: cgato/SlimOrcaDedupCleaned
    type: chat_template
    split: train[:1%]
    field_messages: conversations
    message_property_mappings:
      role: from
      content: value

val_set_size: 0.0
output_dir: ./outputs/out

adapter: qlora
lora_r: 32
lora_alpha: 16
lora_dropout: 0.05
lora_target_modules: 'language_model.model.layers.[\d]+.(mlp|self_attn).(up|down|gate|q|k|v|o)_proj'

sequence_len: 2048
sample_packing: true
eval_sample_packing: true
pad_to_sequence_len: true

wandb_project:
wandb_entity:
wandb_watch:
wandb_name:
wandb_log_model:

gradient_accumulation_steps: 1
micro_batch_size: 1
num_epochs: 4
optimizer: adamw_bnb_8bit
lr_scheduler: cosine
learning_rate: 0.0002

bf16: auto
tf32: true

gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
resume_from_checkpoint:
logging_steps: 1
flash_attention: true

warmup_ratio: 0.1
evals_per_epoch:
saves_per_epoch: 1
weight_decay: 0.0
special_tokens:


================================================
FILE: index.qmd
================================================
---
# toc-location: right-body
# toc-title: Table Of Contents
# toc-expand: 2
---

```{python}
#|output: asis
#|echo: false

# This cell steals the README as the home page for now, but excludes the table of contents (quarto adds its own)
import re
pattern = re.compile(
    r"<table>\s*<tr>\s*<td>\s*## Table of Contents.*?</td>\s*</tr>\s*</table>",
    re.DOTALL | re.IGNORECASE
)

with open('README.md', 'r') as f:
    txt = f.read()

cleaned = pattern.sub("", txt)
print(cleaned)
```


================================================
FILE: pyproject.toml
================================================
[build-system]
requires = ["setuptools>=64", "wheel", "setuptools_scm>=8", "packaging==26.0"]
build-backend = "setuptools.build_meta"

[project]
name = "axolotl"
dynamic = ["version", "dependencies", "optional-dependencies"]
description = "LLM Trainer"
readme = "README.md"
requires-python = ">=3.10"
# license = "Apache-2.0"

[project.scripts]
axolotl = "axolotl.cli.main:main"

[project.urls]
Homepage = "https://axolotl.ai/"
Documentation = "https://docs.axolotl.ai/"
Repository = "https://github.com/axolotl-ai-cloud/axolotl.git"

[tool.setuptools_scm]

[tool.setuptools]
py-modules = ["setuptools_axolotl_dynamic_dependencies"]
include-package-data = true

[tool.setuptools.dynamic]
version = { file = "VERSION" }

[tool.setuptools.cmdclass]
build_py = "setuptools_axolotl_dynamic_dependencies.BuildPyCommand"

[tool.ruff]
line-length = 88
target-version = "py310"

[tool.ruff.lint]
select = ["E", "F", "W", "C90", "B", "I"]
ignore = [
    "E203",  # Whitespace before ':'
    "E501",  # Line too long
    "C901",  # Too complex
    "B019",  # Use of functools.cache on methods
    "E722",  # Bare except
    "F821",  # Undefined name (for dynamic exec)
]

[tool.ruff.lint.isort]
known-third-party = ["wandb", "comet_ml"]
known-local-folder = ["src", "tests"]
# Black-compatible isort settings
force-single-line = false
combine-as-imports = true
split-on-trailing-comma = true

[tool.ruff.format]
# Use black's formatting style exactly
quote-style = "double"
indent-style = "space"
skip-magic-trailing-comma = false
line-ending = "auto"
docstring-code-format = false

[tool.uv.extra-build-dependencies]
axolotl = ["huggingface_hub"]


================================================
FILE: requirements-dev.txt
================================================
black
mypy
pre-commit
types-requests
quartodoc
jupyter
blobfile
tiktoken


================================================
FILE: requirements-tests.txt
================================================
codecov
codecov-cli
pytest
pytest-cov
pytest-retry
pytest-sugar
pytest-xdist
tbparse


================================================
FILE: requirements.txt
================================================
--extra-index-url https://huggingface.github.io/autogptq-index/whl/cu118/

# START section of dependencies that don't install on Darwin/MacOS
bitsandbytes==0.49.1
triton>=3.4.0
mamba-ssm==1.2.0.post1
xformers>=0.0.23.post1
liger-kernel==0.7.0
# END section

packaging==26.0
huggingface_hub>=1.1.7
peft>=0.18.1
tokenizers>=0.22.1
transformers==5.3.0
accelerate==1.13.0
datasets==4.5.0
deepspeed>=0.18.6,<0.19.0
trl==0.29.0
hf_xet==1.3.2
kernels==0.12.2

fla-core==0.4.1
flash-linear-attention==0.4.1

trackio>=0.16.1
typing-extensions>=4.15.0

optimum==1.16.2
hf_transfer
sentencepiece
gradio>=6.2.0,<7.0

modal==1.3.0.post1
pydantic>=2.10.6
addict
fire
PyYAML>=6.0
requests
wandb
einops
colorama
numba>=0.61.2
numpy>=2.2.6

# qlora things
evaluate==0.4.1
scipy
nvidia-ml-py==12.560.30
art
tensorboard
python-dotenv==1.0.1

# remote filesystems
s3fs>=2024.5.0
gcsfs>=2025.3.0
adlfs>=2024.5.0
ocifs==1.3.2

zstandard==0.22.0
fastcore

# lm eval harness
lm_eval==0.4.7
langdetect==1.0.9
immutabledict==4.2.0
antlr4-python3-runtime==4.13.2

torchao==0.16.0
openenv-core==0.1.0
schedulefree==1.4.1

axolotl-contribs-lgpl==0.0.7
axolotl-contribs-mit==0.0.6
# telemetry
posthog==6.7.11

mistral-common==1.10.0


================================================
FILE: scripts/chat_datasets.py
================================================
"""
helper script to parse chat datasets into a usable yaml
"""

import click
import yaml
from datasets import load_dataset


@click.command()
@click.argument("dataset", type=str)
@click.option("--split", type=str, default="train")
def parse_dataset(dataset=None, split="train"):
    ds_cfg = {}
    ds_cfg["path"] = dataset
    ds_cfg["split"] = split
    ds_cfg["type"] = "chat_template"
    ds_cfg["chat_template"] = "<<<Replace based on your model>>>"

    dataset = load_dataset(dataset, split=split)
    features = dataset.features
    feature_keys = features.keys()
    field_messages = None
    for key in ["conversation", "conversations", "messages"]:
        if key in feature_keys:
            field_messages = key
            break
    if not field_messages:
        raise ValueError(
            f"No conversation field found in dataset: {', '.join(feature_keys)}"
        )
    ds_cfg["field_messages"] = field_messages

    message_fields = features[field_messages][0].keys()

    message_property_mappings = {"role": None, "content": None}
    for key in ["from", "role"]:
        if key in message_fields:
            message_property_mappings["role"] = key
            break
    if not message_property_mappings["role"]:
        raise ValueError(
            f"No role field found in messages: {', '.join(message_fields)}"
        )

    for key in ["content", "text", "value"]:
        if key in message_fields:
            message_property_mappings["content"] = key
            break
    if not message_property_mappings["content"]:
        raise ValueError(
            f"No content field found in messages: {', '.join(message_fields)}"
        )
    ds_cfg["message_property_mappings"] = message_property_mappings

    print(yaml.dump({"datasets": [ds_cfg]}))


if __name__ == "__main__":
    parse_dataset()


================================================
FILE: scripts/cloud-entrypoint-term.sh
================================================
#!/bin/bash

# Export specific ENV variables to /etc/rp_environment
echo "Exporting environment variables..."
printenv | grep -E '^RUNPOD_|^PATH=|^_=' | sed 's/^\(.*\)=\(.*\)$/export \1="\2"/' >> /etc/rp_environment
conda init
# this needs to come after conda init
echo 'source /etc/rp_environment' >> ~/.bashrc

add_keys_to_authorized() {
    local key_value=$1

    # Create the ~/.ssh directory and set permissions
    mkdir -p ~/.ssh
    chmod 700 ~/.ssh

    # Create the authorized_keys file if it doesn't exist
    touch ~/.ssh/authorized_keys

    # Initialize an empty key variable
    local key=""

    # Read the key variable word by word
    for word in $key_value; do
        # Check if the word looks like the start of a key
        if [[ $word == ssh-* ]]; then
            # If there's a key being built, add it to the authorized_keys file
            if [[ -n $key ]]; then
                echo $key >> ~/.ssh/authorized_keys
            fi
            # Start a new key
            key=$word
        else
            # Append the word to the current key
            key="$key $word"
        fi
    done

    # Add the last key to the authorized_keys file
    if [[ -n $key ]]; then
        echo $key >> ~/.ssh/authorized_keys
    fi

    # Set the correct permissions
    chmod 600 ~/.ssh/authorized_keys
    chmod 700 -R ~/.ssh
}

if [[ $PUBLIC_KEY ]]; then
    # runpod
    add_keys_to_authorized "$PUBLIC_KEY"
    # Start the SSH service in the background
    service ssh start
elif [[ $SSH_KEY ]]; then
    # latitude.sh
    add_keys_to_authorized "$SSH_KEY"
    # Start the SSH service in the background
    service ssh start
else
    echo "No PUBLIC_KEY or SSH_KEY environment variable provided, not starting openSSH daemon"
fi

# Check if JUPYTER_PASSWORD is set and not empty
if [ -n "$JUPYTER_PASSWORD" ]; then
    # Set JUPYTER_TOKEN to the value of JUPYTER_PASSWORD
    export JUPYTER_TOKEN="$JUPYTER_PASSWORD"
fi

if [ "$JUPYTER_DISABLE" != "1" ]; then
    # Run Jupyter Lab in the background
    jupyter lab --port=8888 --ip=* --allow-root --ServerApp.allow_origin=* &
fi

if [ ! -d "/workspace/data/axolotl-artifacts" ]; then
    mkdir -p /workspace/data/axolotl-artifacts
fi
if [ ! -L "/workspace/axolotl/outputs" ]; then
    ln -sf /workspace/data/axolotl-artifacts /workspace/axolotl/outputs
fi

# Execute the passed arguments (CMD)
exec "$@"


================================================
FILE: scripts/cloud-entrypoint.sh
================================================
#!/bin/bash

# Export specific ENV variables to /etc/rp_environment
echo "Exporting environment variables..."
printenv | grep -E '^HF_|^BNB_|^CUDA_|^NCCL_|^NV|^RUNPOD_|^PATH=|^_=' | sed 's/^\([^=]*\)=\(.*\)$/export \1="\2"/' | grep -v 'printenv' >> /etc/rp_environment
echo 'source /etc/rp_environment' >> ~/.bashrc

add_keys_to_authorized() {
    local key_value=$1

    # Create the ~/.ssh directory and set permissions
    mkdir -p ~/.ssh
    chmod 700 ~/.ssh

    # Create the authorized_keys file if it doesn't exist
    touch ~/.ssh/authorized_keys

    # Initialize an empty key variable
    local key=""

    # Read the key variable word by word
    for word in $key_value; do
        # Check if the word looks like the start of a key
        if [[ $word == ssh-* ]]; then
            # If there's a key being built, add it to the authorized_keys file
            if [[ -n $key ]]; then
                echo $key >> ~/.ssh/authorized_keys
            fi
            # Start a new key
            key=$word
        else
            # Append the word to the current key
            key="$key $word"
        fi
    done

    # Add the last key to the authorized_keys file
    if [[ -n $key ]]; then
        echo $key >> ~/.ssh/authorized_keys
    fi

    # Set the correct permissions
    chmod 600 ~/.ssh/authorized_keys
    chmod 700 -R ~/.ssh
}

# Set SSH port
if [ ! -z "$SSH_PORT" ]; then
    sed -i "s/#Port 22/Port $SSH_PORT/" /etc/ssh/sshd_config
fi

if [[ $PUBLIC_KEY ]]; then
    # runpod, prime intellect
    add_keys_to_authorized "$PUBLIC_KEY"
    # Start the SSH service in the background
    service ssh start
elif [[ $SSH_KEY ]]; then
    # latitude.sh
    add_keys_to_authorized "$SSH_KEY"
    # Start the SSH service in the background
    service ssh start
else
    echo "No PUBLIC_KEY or SSH_KEY environment variable provided, not starting openSSH daemon"
fi

# Check if JUPYTER_PASSWORD is set and not empty
if [ -n "$JUPYTER_PASSWORD" ]; then
    # Set JUPYTER_TOKEN to the value of JUPYTER_PASSWORD
    export JUPYTER_TOKEN="$JUPYTER_PASSWORD"
fi

if [ "$JUPYTER_DISABLE" != "1" ]; then
    # Run Jupyter Lab in the background
    jupyter lab --port=8888 --ip=* --allow-root --ServerApp.allow_origin=* &
fi

if [ ! -d "/workspace/data/axolotl-artifacts" ]; then
    mkdir -p /workspace/data/axolotl-artifacts
fi
if [ ! -L "/workspace/axolotl/outputs" ]; then
    ln -sf /workspace/data/axolotl-artifacts /workspace/axolotl/outputs
fi

# start the runpod slurm init
SLURM_INIT="${SLURM_INIT:-/slurm-init.sh}"

if [[ -f "$SLURM_INIT" ]]; then
  echo "[entrypoint] running $SLURM_INIT..."
  bash "$SLURM_INIT"
fi

# Execute the passed arguments (CMD)
exec "$@"


================================================
FILE: scripts/cutcrossentropy_install.py
================================================
"""Script to output the correct installation command for cut-cross-entropy."""

import importlib.util
import sys

try:
    import torch
except ImportError as exc:
    raise ImportError("Install torch via `pip install torch`") from exc
from packaging.version import Version as V

USE_UV = "--uv" in sys.argv[1:]

v = V(torch.__version__)

# no cut-cross-entropy support for torch < 2.4.0
if v < V("2.4.0"):
    print("")
    sys.exit(0)

cce_spec = importlib.util.find_spec("cut_cross_entropy")

UNINSTALL_PREFIX = ""
if cce_spec:
    if not importlib.util.find_spec("cut_cross_entropy.transformers"):
        UNINSTALL_PREFIX = "pip uninstall -y cut-cross-entropy && "

UV_PREFIX = "uv " if USE_UV else ""

print(
    UNINSTALL_PREFIX
    + f'{UV_PREFIX}pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@63b15e6"'
)


================================================
FILE: scripts/motd
================================================

     #@@ #@@      @@# @@#
    @@  @@          @@  @@           =@@#                               @@                 #@    =@@#.
    @@    #@@@@@@@@@    @@           #@#@=                              @@                 #@     .=@@
      #@@@@@@@@@@@@@@@@@            =@# @#     ##=     ##    =####=+    @@      =#####+  =#@@###.   @@
    @@@@@@@@@@/  +@@/  +@@          #@  =@=     #@=   @@   =@#+  +#@#   @@    =@#+  +#@#   #@.      @@
    @@@@@@@@@@  ##@@  ##@@         =@#   @#      =@# @#    @@      @@   @@    @@      #@   #@       @@
     @@@@@@@@@@@@@@@@@@@@          #@=+++#@=      =@@#     @@      @@   @@    @@      #@   #@       @@
                                  =@#=====@@     =@# @#    @@      @@   @@    @@      #@   #@       @@
    @@@@@@@@@@@@@@@@  @@@@        #@      #@=   #@=  +@@   #@#    =@#   @@.   =@#    =@#   #@.      @@
                                 =@#       @#  #@=     #@   =#@@@@#=    +#@@=  +#@@@@#=    .##@@+   @@
    @@@@  @@@@@@@@@@@@@@@@

Welcome to the axolotl cloud image! If the you've mounted a disk to /workspace and the axolotl directory is empty, run the following commands:

Need help with your post-training workloads? Reach out us at contact@axolotl.ai for assistance.

```
cd /workspace
rm -rf /workspace/axolotl
git clone https://github.com/axolotl-ai-cloud/axolotl.git
cd axolotl
pip install --no-build-isolation --no-deps -e .
```


================================================
FILE: scripts/unsloth_install.py
================================================
# noqa
import sys

try:
    import torch
except ImportError as error:
    raise ImportError("Install torch via `pip install torch`") from error
from packaging.version import Version as V

use_uv = "--uv" in sys.argv[1:]

v = V(torch.__version__)
cuda = str(torch.version.cuda)
try:
    is_ampere = torch.cuda.get_device_capability()[0] >= 8
except RuntimeError:
    is_ampere = False
if cuda != "12.1" and cuda != "11.8" and cuda != "12.4":
    raise RuntimeError(f"CUDA = {cuda} not supported!")
if v <= V("2.1.0"):
    raise RuntimeError(f"Torch = {v} too old!")
elif v <= V("2.1.1"):
    x = "cu{}{}-torch211"
elif v <= V("2.1.2"):
    x = "cu{}{}-torch212"
elif v < V("2.3.0"):
    x = "cu{}{}-torch220"
elif v < V("2.4.0"):
    x = "cu{}{}-torch230"
elif v < V("2.5.0"):
    x = "cu{}{}-torch240"
elif v < V("2.6.0"):
    x = "cu{}{}-torch250"
else:
    raise RuntimeError(f"Torch = {v} too new!")
x = x.format(cuda.replace(".", ""), "-ampere" if is_ampere else "")
uv_prefix = "uv " if use_uv else ""
print(
    f'{uv_prefix}pip install unsloth-zoo==2024.12.1 && {uv_prefix}pip install --no-deps "unsloth[{x}]==2024.12.4"'
)


================================================
FILE: setup.py
================================================
"""setup.py for axolotl"""

import os
import platform
import re
from importlib.metadata import PackageNotFoundError, version
from pathlib import Path

from setuptools import find_packages, setup


def parse_requirements(extras_require_map):
    _install_requires = []
    _dependency_links = []
    with open("./requirements.txt", encoding="utf-8") as requirements_file:
        lines = [r.strip() for r in requirements_file.readlines()]
        for line in lines:
            is_extras = "deepspeed" in line or "mamba-ssm" in line
            if line.startswith("--extra-index-url"):
                # Handle custom index URLs
                _, url = line.split()
                _dependency_links.append(url)
            elif not is_extras and line and line[0] != "#":
                # Handle standard packages
                _install_requires.append(line)
    try:
        xformers_version = [req for req in _install_requires if "xformers" in req][0]
        install_xformers = platform.machine() != "aarch64"
        if platform.machine() == "aarch64":
            # skip on ARM64
            skip_packages = [
                "torchao",
                "fla-core",
                "flash-linear-attention",
            ]
            _install_requires = [
                req
                for req in _install_requires
                if re.split(r"[>=<]", req)[0].strip() not in skip_packages
            ]
        if "Darwin" in platform.system():
            # skip packages not compatible with OSX
            skip_packages = [
                "bitsandbytes",
                "triton",
                "mamba-ssm",
                "xformers",
                "liger-kernel",
            ]
            _install_requires = [
                req
                for req in _install_requires
                if re.split(r"[>=<]", req)[0].strip() not in skip_packages
            ]
            print(
                _install_requires, [req in skip_packages for req in _install_requires]
            )
        else:
            # detect the version of torch already installed
            # and set it so dependencies don't clobber the torch version
            try:
                torch_version = version("torch")
            except PackageNotFoundError:
                torch_version = "2.8.0"  # default to torch 2.8.0
            _install_requires.append(f"torch=={torch_version}")

            version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version)
            if version_match:
                major, minor, patch = version_match.groups()
                major, minor = int(major), int(minor)
                patch = (
                    int(patch) if patch is not None else 0
                )  # Default patch to 0 if not present
            else:
                raise ValueError("Invalid version format")

            torch_parts = torch_version.split("+")
            if len(torch_parts) == 2:
                torch_cuda_version = torch_parts[1]
                _dependency_links.append(
                    f"https://download.pytorch.org/whl/{torch_cuda_version}"
                )

            if (major, minor) >= (2, 9):
                extras_require_map.pop("fbgemm-gpu")
                extras_require_map["fbgemm-gpu"] = [
                    "fbgemm-gpu==1.4.0",
                    "fbgemm-gpu-genai==1.4.2",
                ]
                extras_require_map["vllm"] = ["vllm==0.11.1"]
                if not install_xformers:
                    _install_requires.pop(_install_requires.index(xformers_version))
                extras_require_map["vllm"] = ["vllm==0.13.0"]
                if patch == 0:
                    extras_require_map["vllm"] = ["vllm==0.13.0"]
                else:
                    extras_require_map["vllm"] = ["vllm==0.14.0"]
            elif (major, minor) >= (2, 8):
                extras_require_map.pop("fbgemm-gpu")
                extras_require_map["fbgemm-gpu"] = ["fbgemm-gpu-genai==1.3.0"]
                extras_require_map["vllm"] = ["vllm==0.11.0"]
                if not install_xformers:
                    _install_requires.pop(_install_requires.index(xformers_version))
            elif (major, minor) >= (2, 7):
                _install_requires.pop(_install_requires.index(xformers_version))
                if patch == 0:
                    if install_xformers:
                        _install_requires.append("xformers==0.0.30")
                    # vllm 0.9.x is incompatible with latest transformers
                    extras_require_map.pop("vllm")
                else:
                    if install_xformers:
                        _install_requires.append("xformers==0.0.31")
                    extras_require_map["vllm"] = ["vllm==0.10.1"]
            elif (major, minor) >= (2, 6):
                _install_requires.pop(_install_requires.index(xformers_version))
                if install_xformers:
                    _install_requires.append("xformers==0.0.29.post3")
                # since we only support 2.6.0+cu126
                _dependency_links.append("https://download.pytorch.org/whl/cu126")
                extras_require_map.pop("vllm")
            elif (major, minor) >= (2, 5):
                _install_requires.pop(_install_requires.index(xformers_version))
                if install_xformers:
                    if patch == 0:
                        _install_requires.append("xformers==0.0.28.post2")
                    else:
                        _install_requires.append("xformers>=0.0.28.post3")
                extras_require_map.pop("vllm")
            elif (major, minor) >= (2, 4):
                extras_require_map.pop("vllm")
                if install_xformers:
                    if patch == 0:
                        _install_requires.pop(_install_requires.index(xformers_version))
                        _install_requires.append("xformers>=0.0.27")
                    else:
                        _install_requires.pop(_install_requires.index(xformers_version))
                        _install_requires.append("xformers==0.0.28.post1")
            else:
                raise ValueError("axolotl requires torch>=2.4")

    except PackageNotFoundError:
        pass
    return _install_requires, _dependency_links, extras_require_map


def get_package_version():
    with open(
        Path(os.path.dirname(os.path.abspath(__file__))) / "VERSION",
        "r",
        encoding="utf-8",
    ) as fin:
        version_ = fin.read().strip()
    return version_


extras_require = {
    "flash-attn": ["flash-attn==2.8.3"],
    "ring-flash-attn": [
        "flash-attn==2.8.3",
        "ring-flash-attn>=0.1.7",
    ],
    "deepspeed": [
        "deepspeed==0.18.2",
        "deepspeed-kernels",
    ],
    "mamba-ssm": [
        "mamba-ssm==1.2.0.post1",
        "causal_conv1d",
    ],
    "auto-gptq": [
        "auto-gptq==0.5.1",
    ],
    "mlflow": [
        "mlflow",
    ],
    "galore": [
        "galore_torch",
    ],
    "apollo": [
        "apollo-torch",
    ],
    "optimizers": [
        "galore_torch",
        "apollo-torch",
        "lomo-optim==0.1.1",
        "torch-optimi==0.2.1",
        "came_pytorch==0.1.3",
    ],
    "ray": [
        "ray[train]>=2.52.1",
    ],
    "vllm": [
        "vllm==0.10.0",
    ],
    "llmcompressor": [
        "llmcompressor==0.5.1",
    ],
    "fbgemm-gpu": ["fbgemm-gpu-genai==1.3.0"],
    "opentelemetry": [
        "opentelemetry-api",
        "opentelemetry-sdk",
        "opentelemetry-exporter-prometheus",
        "prometheus-client",
    ],
}
install_requires, dependency_links, extras_require_build = parse_requirements(
    extras_require
)

setup(
    version=get_package_version(),
    package_dir={"": "src"},
    packages=find_packages("src"),
    install_requires=install_requires,
    dependency_links=dependency_links,
    entry_points={
        "console_scripts": [
            "axolotl=axolotl.cli.main:main",
        ],
    },
    extras_require=extras_require_build,
)


================================================
FILE: src/axolotl/__init__.py
================================================
"""Axolotl - Train and fine-tune large language models"""

import pkgutil
from importlib.metadata import PackageNotFoundError, version

__path__ = pkgutil.extend_path(__path__, __name__)  # Make this a namespace package

try:
    __version__ = version("axolotl")
except PackageNotFoundError:
    __version__ = "unknown"


================================================
FILE: src/axolotl/cli/__init__.py
================================================
"""Axolotl CLI module initialization."""

import os

from axolotl.logging_config import configure_logging

os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
os.environ.setdefault("HF_XET_HIGH_PERFORMANCE", "1")
os.environ.setdefault("TRL_EXPERIMENTAL_SILENCE", "1")

configure_logging()


================================================
FILE: src/axolotl/cli/args.py
================================================
"""Module for axolotl CLI command arguments."""

from dataclasses import dataclass, field
from typing import Optional


@dataclass
class PreprocessCliArgs:
    """Dataclass with CLI arguments for `axolotl preprocess` command."""

    debug: bool = field(default=False)
    debug_text_only: bool = field(default=False)
    debug_num_examples: int = field(default=1)
    prompter: Optional[str] = field(default=None)
    download: Optional[bool] = field(default=True)
    iterable: Optional[bool] = field(
        default=False,
        metadata={
            "help": (
                "Deprecated in v0.13.0, will be removed in v0.14.0. For streaming "
                "datasets, use 'axolotl train' and set 'streaming: true' in your YAML "
                "config, or pass --streaming instead in the CLI."
            )
        },
    )


@dataclass
class TrainerCliArgs:
    """Dataclass with CLI arguments for `axolotl train` command."""

    debug: bool = field(default=False)
    debug_text_only: bool = field(default=False)
    debug_num_examples: int = field(default=0)
    prompter: Optional[str] = field(default=None)
    shard: bool = field(default=False)


@dataclass
class VllmServeCliArgs:
    """Dataclass with CLI arguments for `axolotl vllm-serve` command."""

    tensor_parallel_size: Optional[int] = field(
        default=None,
        metadata={"help": "Number of tensor parallel workers to use."},
    )
    data_parallel_size: Optional[int] = field(
        default=None,
        metadata={
            "help": "Number of data parallel workers to use for vLLM serving. This controls how many model replicas are used for parallel inference."
        },
    )
    host: Optional[str] = field(
        default=None,  # nosec B104
        metadata={"help": "Host address to run the server on."},
    )
    port: Optional[int] = field(
        default=None,
        metadata={"help": "Port to run the server on."},
    )
    gpu_memory_utilization: Optional[float] = field(
        default=None,
        metadata={
            "help": "Ratio (between 0 and 1) of GPU memory to reserve for the model weights, activations, and KV "
            "cache on the device dedicated to generation powered by vLLM. Higher values will increase the KV cache "
            "size and thus improve the model's throughput. However, if the value is too high, it may cause "
            "out-of-memory (OOM) errors during initialization."
        },
    )
    dtype: Optional[str] = field(
        default=None,
        metadata={
            "help": "Data type to use for vLLM generation. If set to 'auto', the data type will be automatically "
            "determined based on the model configuration. Find the supported values in the vLLM documentation."
        },
    )
    max_model_len: Optional[int] = field(
        default=None,
        metadata={
            "help": "If set, the `max_model_len` to use for vLLM. This can be useful when running with reduced "
            "`vllm_gpu_memory_utilization`, leading to a reduced KV cache size. If not set, vLLM will use the model "
            "context size, which might be much larger than the KV cache, leading to inefficiencies."
        },
    )
    enable_prefix_caching: Optional[bool] = field(
        default=None,
        metadata={
            "help": "Whether to enable prefix caching in vLLM. If set to `True`, ensure that the model and the "
            "hardware support this feature."
        },
    )
    serve_module: Optional[str] = field(
        default=None,
        metadata={
            "help": "Module to serve. If not set, the default module will be used."
        },
    )

    enable_reasoning: Optional[bool] = field(
        default=None,
    )

    reasoning_parser: Optional[str] = field(
        default=None,
    )


@dataclass
class QuantizeCliArgs:
    """Dataclass with CLI arguments for `axolotl quantize` command."""

    base_model: Optional[str] = field(default=None)
    weight_dtype: Optional[str] = field(default=None)
    activation_dtype: Optional[str] = field(default=None)
    quantize_embedding: Optional[bool] = field(default=None)
    group_size: Optional[int] = field(default=None)
    output_dir: Optional[str] = field(default=None)
    hub_model_id: Optional[str] = field(default=None)


@dataclass
class EvaluateCliArgs:
    """Dataclass with CLI arguments for `axolotl evaluate` command."""

    debug: bool = field(default=False)
    debug_text_only: bool = field(default=False)
    debug_num_examples: int = field(default=0)


@dataclass
class InferenceCliArgs:
    """Dataclass with CLI arguments for `axolotl inference` command."""

    prompter: Optional[str] = field(default=None)


================================================
FILE: src/axolotl/cli/art.py
================================================
"""Axolotl ASCII logo utils."""

from axolotl.utils.distributed import is_main_process

AXOLOTL_LOGO = """
     #@@ #@@      @@# @@#
    @@  @@          @@  @@           =@@#                               @@                 #@    =@@#.
    @@    #@@@@@@@@@    @@           #@#@=                              @@                 #@     .=@@
      #@@@@@@@@@@@@@@@@@            =@# @#     ##=     ##    =####=+    @@      =#####+  =#@@###.   @@
    @@@@@@@@@@/  +@@/  +@@          #@  =@=     #@=   @@   =@#+  +#@#   @@    =@#+  +#@#   #@.      @@
    @@@@@@@@@@  ##@@  ##@@         =@#   @#      =@# @#    @@      @@   @@    @@      #@   #@       @@
     @@@@@@@@@@@@@@@@@@@@          #@=+++#@=      =@@#     @@      @@   @@    @@      #@   #@       @@
                                  =@#=====@@     =@# @#    @@      @@   @@    @@      #@   #@       @@
    @@@@@@@@@@@@@@@@  @@@@        #@      #@=   #@=  +@@   #@#    =@#   @@.   =@#    =@#   #@.      @@
                                 =@#       @#  #@=     #@   =#@@@@#=    +#@@=  +#@@@@#=    .##@@+   @@
    @@@@  @@@@@@@@@@@@@@@@
"""

HAS_PRINTED_LOGO = False


def print_axolotl_text_art():
    """Prints axolotl ASCII art."""

    global HAS_PRINTED_LOGO
    if HAS_PRINTED_LOGO:
        return
    if is_main_process():
        HAS_PRINTED_LOGO = True
        print(AXOLOTL_LOGO)


================================================
FILE: src/axolotl/cli/checks.py
================================================
"""Various checks for Axolotl CLI."""

import os
from pathlib import Path

from accelerate.commands.config import config_args
from huggingface_hub import HfApi
from huggingface_hub.utils import LocalTokenNotFoundError
from requests import HTTPError

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def check_accelerate_default_config() -> None:
    """Logs at warning level if no accelerate config file is found."""
    if Path(config_args.default_yaml_config_file).exists():
        LOG.warning(
            f"accelerate config file found at {config_args.default_yaml_config_file}. This can lead to unexpected errors"
        )


def check_user_token() -> bool:
    """Checks for HF user info. Check is skipped if HF_HUB_OFFLINE=1.

    Returns:
        Boolean indicating successful check (i.e., HF_HUB_OFFLINE=1 or HF user info is retrieved).

    Raises:
        LocalTokenNotFoundError: If HF user info can't be retrieved.
    """
    # Skip check if HF_HUB_OFFLINE is set to True
    if os.getenv("HF_HUB_OFFLINE") == "1":
        LOG.info(
            "Skipping HuggingFace token verification because HF_HUB_OFFLINE is set to True. Only local files will be used."
        )
        return True

    # Verify if token is valid
    api = HfApi()
    try:
        user_info = api.whoami()
        return bool(user_info)
    except LocalTokenNotFoundError:
        LOG.warning(
            "Error verifying HuggingFace token. Remember to log in using `hf auth login` and get your access token from https://huggingface.co/settings/tokens if you want to use gated models or datasets."
        )
        return False
    except HTTPError:
        LOG.warning(
            "Error accessing HuggingFace. This may be due to a network issue or rate limiting."
        )
        return False


================================================
FILE: src/axolotl/cli/cloud/__init__.py
================================================
"""
launch axolotl in supported cloud platforms
"""

from pathlib import Path
from typing import Literal

import yaml

from axolotl.cli.cloud.base import Cloud
from axolotl.cli.cloud.baseten import BasetenCloud
from axolotl.cli.cloud.modal_ import ModalCloud
from axolotl.utils.dict import DictDefault


def load_cloud_cfg(cloud_config: Path | str) -> DictDefault:
    """Load and validate cloud configuration."""
    # Load cloud configuration.
    with open(cloud_config, encoding="utf-8") as file:
        cloud_cfg: DictDefault = DictDefault(yaml.safe_load(file))
    return cloud_cfg


def do_cli_preprocess(
    cloud_config: Path | str,
    config: Path | str,
) -> None:
    cloud_cfg = load_cloud_cfg(cloud_config)
    cloud = ModalCloud(cloud_cfg)
    with open(config, "r", encoding="utf-8") as file:
        config_yaml = file.read()
    cloud.preprocess(config_yaml)


def do_cli_train(
    cloud_config: Path | str,
    config: Path | str,
    launcher: Literal["accelerate", "torchrun", "python"] = "accelerate",
    launcher_args: list[str] | None = None,
    cwd=None,
    **kwargs,
) -> None:
    cloud_cfg: DictDefault = load_cloud_cfg(cloud_config)
    provider = cloud_cfg.provider or "modal"
    cloud: Cloud | None
    if provider == "modal":
        cloud = ModalCloud(cloud_cfg)
    elif provider == "baseten":
        cloud = BasetenCloud(cloud_cfg.to_dict())
    else:
        raise ValueError(f"Unsupported cloud provider: {provider}")
    with open(config, "r", encoding="utf-8") as file:
        config_yaml = file.read()
    local_dirs = {}
    if cwd and not Path(cwd).joinpath("src", "axolotl").exists():
        local_dirs = {"/workspace/mounts": cwd}
    cloud.train(
        config_yaml,
        launcher=launcher,
        launcher_args=launcher_args,
        local_dirs=local_dirs,
        **kwargs,
    )


def do_cli_lm_eval(
    cloud_config: Path | str,
    config: Path | str,
) -> None:
    cloud_cfg = load_cloud_cfg(cloud_config)
    cloud = ModalCloud(cloud_cfg)
    with open(config, "r", encoding="utf-8") as file:
        config_yaml = file.read()
    cloud.lm_eval(config_yaml)


================================================
FILE: src/axolotl/cli/cloud/base.py
================================================
"""
base class for cloud platforms from cli
"""

from abc import ABC, abstractmethod
from typing import Literal


class Cloud(ABC):
    """
    Abstract base class for cloud platforms.
    """

    @abstractmethod
    def preprocess(self, config_yaml: str, *args, **kwargs) -> None:
        pass

    @abstractmethod
    def train(
        self,
        config_yaml: str,
        launcher: Literal["accelerate", "torchrun", "python"] = "accelerate",
        launcher_args: list[str] | None = None,
        local_dirs: dict[str, str] | None = None,
        **kwargs,
    ):
        pass


================================================
FILE: src/axolotl/cli/cloud/baseten/__init__.py
================================================
"""Baseten Cloud CLI"""

import shutil
import subprocess  # nosec B404
import tempfile
from os.path import dirname
from typing import Literal

import yaml

from axolotl.cli.cloud.base import Cloud


class BasetenCloud(Cloud):
    """Baseten Cloud Axolotl CLI"""

    def __init__(self, config: dict):
        self.config = config

    def preprocess(self, config_yaml: str, *args, **kwargs) -> None:
        raise NotImplementedError(
            "Separate preprocess function for Baseten is not "
            "implemented and will happen during hte train step."
        )

    def train(
        self,
        config_yaml: str,
        launcher: Literal["accelerate", "torchrun", "python"] = "accelerate",
        launcher_args: list[str] | None = None,
        local_dirs: dict[str, str] | None = None,  # pylint: disable=unused-argument
        **kwargs,
    ):
        with tempfile.TemporaryDirectory() as tmp_dir:
            config = self.config.copy()
            config["launcher"] = launcher
            config["launcher_args"] = launcher_args
            with open(tmp_dir + "/cloud.yaml", "w", encoding="utf-8") as cloud_fout:
                yaml.dump(config, cloud_fout)
            with open(tmp_dir + "/train.yaml", "w", encoding="utf-8") as config_fout:
                config_fout.write(config_yaml)
            shutil.copyfile(dirname(__file__) + "/template/run.sh", tmp_dir + "/run.sh")
            shutil.copyfile(
                dirname(__file__) + "/template/train_sft.py", tmp_dir + "/train_sft.py"
            )
            subprocess.run(  # nosec B603 B607
                ["truss", "train", "push", "train_sft.py"], cwd=tmp_dir, check=False
            )


================================================
FILE: src/axolotl/cli/cloud/baseten/template/run.sh
================================================
#!/bin/bash
set -eux

export NCCL_SOCKET_IFNAME="^docker0,lo"
export NCCL_IB_DISABLE=0
export NCCL_TIMEOUT=1800000

axolotl preprocess train.yaml
axolotl train train.yaml --launcher ${AXOLOTL_LAUNCHER} ${AXOLOTL_LAUNCHER_ARGS}


================================================
FILE: src/axolotl/cli/cloud/baseten/template/train_sft.py
================================================
"""
Baseten Training Script for Axolotl
"""

# pylint: skip-file
import yaml
from truss.base import truss_config

# Import necessary classes from the Baseten Training SDK
from truss_train import definitions

cloud_config = yaml.safe_load(open("cloud.yaml", "r"))
gpu = cloud_config.get("gpu", "h100")
gpu_count = int(cloud_config.get("gpu_count", 1))
node_count = int(cloud_config.get("node_count", 1))
project_name = cloud_config.get("project_name", "axolotl-project") or "axolotl-project"
secrets = cloud_config.get("secrets", [])
launcher = cloud_config.get("launcher", "accelerate")
launcher_args = cloud_config.get("launcher_args", [])
script_name = "run.sh"

launcher_args_str = ""
if launcher_args:
    launcher_args_str = "-- " + " ".join(launcher_args)

# 1. Define a base image for your training job
BASE_IMAGE = "axolotlai/axolotl:main-py3.11-cu128-2.9.1"

# 2. Define the Runtime Environment for the Training Job
# This includes start commands and environment variables.a
# Secrets from the baseten workspace like API keys are referenced using
# `SecretReference`.

env_vars = {
    "AXOLOTL_LAUNCHER": launcher,
    "AXOLOTL_LAUNCHER_ARGS": launcher_args_str,
}
for secret_name in secrets:
    env_vars[secret_name] = definitions.SecretReference(name=secret_name)

training_runtime = definitions.Runtime(
    start_commands=[  # Example: list of commands to run your training script
        f"/bin/sh -c 'chmod +x ./{script_name} && ./{script_name}'"
    ],
    environment_variables=env_vars,
)

# 3. Define the Compute Resources for the Training Job
training_compute = definitions.Compute(
    node_count=node_count,
    accelerator=truss_config.AcceleratorSpec(
        accelerator=truss_config.Accelerator.H100,
        count=gpu_count,
    ),
)

# 4. Define the Training Job
# This brings together the image, compute, and runtime configurations.
my_training_job = definitions.TrainingJob(
    image=definitions.Image(base_image=BASE_IMAGE),
    compute=training_compute,
    runtime=training_runtime,
)


# This config will be pushed using the Truss CLI.
# The association of the job to the project happens at the time of push.
first_project_with_job = definitions.TrainingProject(
    name=project_name, job=my_training_job
)


================================================
FILE: src/axolotl/cli/cloud/modal_.py
================================================
"""
Modal Cloud support from CLI
"""

import copy
import json
import os
import subprocess  # nosec B404
from pathlib import Path
from random import randint
from typing import Literal

import modal

from axolotl.cli.cloud.base import Cloud


def run_cmd(cmd: str, run_folder: str, volumes=None):
    """Run a command inside a folder, with Modal Volume reloading before and commit on success."""
    # Ensure volumes contain latest files.
    if volumes:
        for _, vol in volumes.items():
            vol.reload()

    # modal workaround so it doesn't use the automounted axolotl
    new_env = copy.deepcopy(os.environ)

    if "PYTHONPATH" in new_env:
        paths = ["/workspace/mounts"]
        for sub_python_path_str in new_env["PYTHONPATH"].split(":"):
            sub_python_path = Path(sub_python_path_str)
            if not sub_python_path.joinpath("src", "axolotl").exists():
                # we don't want to use the automounted axolotl or unexpected behavior happens
                paths.append(str(sub_python_path))
        if paths:
            new_env["PYTHONPATH"] = ":".join(paths)
        else:
            del new_env["PYTHONPATH"]

    # Propagate errors from subprocess.
    if exit_code := subprocess.call(  # nosec B603
        cmd.split(), cwd=run_folder, env=new_env
    ):
        exit(exit_code)

    # Commit writes to volume.
    if volumes:
        for _, vol in volumes.items():
            vol.commit()


class ModalCloud(Cloud):
    """
    Modal Cloud implementation.
    """

    def __init__(self, config, app=None):
        self.config = config
        if not app:
            app = modal.App()
        self.app = app

        self.volumes = {}
        if config.volumes:
            for volume_config in config.volumes:
                _, mount, vol = self.create_volume(volume_config)
                self.volumes[mount] = (vol, volume_config)

    def get_env(self):
        res = {
            "HF_DATASETS_CACHE": "/workspace/data/huggingface-cache/datasets",
            "HF_HUB_CACHE": "/workspace/data/huggingface-cache/hub",
        }

        for key in self.config.get("env", []):
            if isinstance(key, str):
                if val := os.environ.get(key, ""):
                    res[key] = val
            elif isinstance(key, dict):
                (key_, val) = list(key.items())[0]
                res[key_] = val
        return res

    def get_image(self):
        docker_tag = "main-py3.11-cu128-2.9.1"
        if self.config.docker_tag:
            docker_tag = self.config.docker_tag
        docker_image = f"axolotlai/axolotl:{docker_tag}"

        # grab the sha256 hash from docker hub for this image+tag
        # this ensures that we always get the latest image for this tag, even if it's already cached
        try:
            manifest = subprocess.check_output(  # nosec
                ["docker", "manifest", "inspect", docker_image],
            ).decode("utf-8")
            sha256_hash = json.loads(manifest)["manifests"][0]["digest"]
        except subprocess.CalledProcessError:
            sha256_hash = None

        # create the image
        if sha256_hash:
            image = modal.Image.from_registry(f"axolotlai/axolotl@{sha256_hash}")
        else:
            image = modal.Image.from_registry(docker_image)

        dockerfile_commands = []
        if self.config.dockerfile_commands:
            dockerfile_commands.extend(self.config.dockerfile_commands)

        # branch
        if self.config.branch:
            dockerfile_commands.extend(
                [
                    # Random id for cache busting of branch commits
                    f"RUN echo '{str(randint(0, 1000000))}'",  # nosec B311
                    f"RUN cd /workspace/axolotl && git fetch && git checkout {self.config.branch} && git pull",
                ]
            )

        if dockerfile_commands:
            image = image.dockerfile_commands(dockerfile_commands)

        if env := self.get_env():
            image = image.env(env)

        return image

    def get_secrets(self):
        res = []
        if self.config.secrets:
            for key in self.config.get("secrets", []):
                if isinstance(key, str):
                    if val := os.environ.get(key, ""):
                        res.append(modal.Secret.from_dict({key: val}))
                elif isinstance(key, dict):
                    (key_, val) = list(key.items())[0]
                    res.append(modal.Secret.from_dict({key_: val}))
        return res

    def create_volume(self, volume_config):
        name = volume_config.name
        mount = volume_config.mount
        return name, mount, modal.Volume.from_name(name, create_if_missing=True)

    def get_ephemeral_disk_size(self):
        return 1000 * 525  # 1 TiB

    def get_preprocess_timeout(self):
        if self.config.timeout_preprocess:
            return int(self.config.timeout_preprocess)
        return 60 * 60 * 3  # 3 hours

    def get_preprocess_memory(self):
        memory = 128  # default to 128GiB
        if self.config.memory:
            memory = int(self.config.memory)
        if self.config.memory_preprocess:
            memory = int(self.config.memory_preprocess)
        return 1024 * memory

    def get_preprocess_env(self):
        return self.app.function(
            image=self.get_image(),
            volumes={k: v[0] for k, v in self.volumes.items()},
            cpu=8.0,
            ephemeral_disk=self.get_ephemeral_disk_size(),
            memory=self.get_preprocess_memory(),
            timeout=self.get_preprocess_timeout(),
            secrets=self.get_secrets(),
        )

    def preprocess(self, config_yaml: str, *args, **kwargs):
        modal_fn = self.get_preprocess_env()(_preprocess)
        with modal.enable_output():
            with self.app.run(detach=True):
                modal_fn.remote(
                    config_yaml,
                    *args,
                    volumes={k: v[0] for k, v in self.volumes.items()},
                    **kwargs,
                )

    def get_train_timeout(self):
        if self.config.timeout:
            return int(self.config.timeout)
        return 60 * 60 * 24  # 24 hours

    def get_train_gpu(self):
        count = self.config.gpu_count or 1
        family = self.config.gpu.lower() or "l40s"

        if family == "l40s":
            return modal.gpu.L40S(count=count)
        if family in ["a100", "a100-40gb"]:
            return modal.gpu.A100(count=count, size="40GB")
        if family == "a100-80gb":
            return modal.gpu.A100(count=count, size="80GB")
        if family in ["a10", "a10g"]:
            return modal.gpu.A10G(count=count)
        if family == "h100":
            return f"H100:{count}"
        if family == "t4":
            return modal.gpu.T4(count=count)
        if family == "l4":
            return modal.gpu.L4(count=count)
        raise ValueError(f"Unsupported GPU family: {family}")

    def get_train_memory(self):
        memory = 128  # default to 128GiB
        if self.config.memory:
            memory = int(self.config.memory)
        return 1024 * memory

    def get_train_env(self, local_dirs=None):
        image = self.get_image()
        for mount, local_dir in (local_dirs or {}).items():
            image = image.add_local_dir(local_dir, mount)
        return self.app.function(
            image=image,
            volumes={k: v[0] for k, v in self.volumes.items()},
            cpu=16.0,
            gpu=self.get_train_gpu(),
            memory=self.get_train_memory(),
            timeout=self.get_train_timeout(),
            secrets=self.get_secrets(),
        )

    def train(
        self,
        config_yaml: str,
        launcher: Literal["accelerate", "torchrun", "python"] = "accelerate",
        launcher_args: list[str] | None = None,
        local_dirs: dict[str, str] | None = None,
        **kwargs,
    ):
        modal_fn = self.get_train_env(local_dirs)(_train)
        with modal.enable_output():
            with self.app.run(detach=True):
                modal_fn.remote(
                    config_yaml,
                    launcher=launcher,
                    launcher_args=launcher_args,
                    volumes={k: v[0] for k, v in self.volumes.items()},
                    **kwargs,
                )

    def lm_eval(self, config_yaml: str):
        modal_fn = self.get_train_env()(_lm_eval)
        with modal.enable_output():
            with self.app.run(detach=True):
                if self.config.get("spawn", False):
                    modal_fn_exec = modal_fn.spawn
                else:
                    modal_fn_exec = modal_fn.remote
                modal_fn_exec(
                    config_yaml,
                    volumes={k: v[0] for k, v in self.volumes.items()},
                )


def _preprocess(config_yaml: str, volumes=None):
    Path("/workspace/mounts").mkdir(parents=True, exist_ok=True)
    with open("/workspace/mounts/config.yaml", "w", encoding="utf-8") as f_out:
        f_out.write(config_yaml)
    run_folder = "/workspace/mounts"
    run_cmd(
        "axolotl preprocess /workspace/mounts/config.yaml --dataset-processes=8",
        run_folder,
        volumes,
    )


def _train(
    config_yaml: str,
    launcher: Literal["accelerate", "torchrun", "python"] = "accelerate",
    launcher_args: list[str] | None = None,
    volumes=None,
    **kwargs,
):
    Path("/workspace/mounts").mkdir(parents=True, exist_ok=True)
    with open("/workspace/mounts/config.yaml", "w", encoding="utf-8") as f_out:
        f_out.write(config_yaml)
    run_folder = "/workspace/mounts"

    launcher_args = launcher_args or []

    # Build the base command
    if launcher == "accelerate":
        launcher_arg = "--launcher accelerate"
    elif launcher == "torchrun":
        launcher_arg = "--launcher torchrun"
    else:
        launcher_arg = "--launcher python"

    # Build launcher args string
    launcher_args_str = ""
    if launcher_args:
        launcher_args_str = "-- " + " ".join(launcher_args)

    run_cmd(
        f"axolotl train {launcher_arg} /workspace/mounts/config.yaml {launcher_args_str}".strip(),
        run_folder,
        volumes,
    )


def _lm_eval(config_yaml: str, volumes=None):
    Path("/workspace/mounts").mkdir(parents=True, exist_ok=True)
    with open("/workspace/mounts/config.yaml", "w", encoding="utf-8") as f_out:
        f_out.write(config_yaml)
    run_folder = "/workspace/mounts"
    run_cmd(
        "axolotl lm-eval /workspace/mounts/config.yaml",
        run_folder,
        volumes,
    )


================================================
FILE: src/axolotl/cli/config.py
================================================
"""Configuration loading and processing."""

import json
import os
import tempfile
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import Any, Optional, Union
from urllib.parse import urlparse

import requests
import torch
import yaml
from transformers.utils import is_torch_bf16_gpu_available, is_torch_tf32_available

from axolotl.integrations.base import PluginManager
from axolotl.telemetry.errors import send_errors
from axolotl.telemetry.manager import TelemetryManager
from axolotl.utils.comet_ import setup_comet_env_vars
from axolotl.utils.config import (
    normalize_cfg_datasets,
    normalize_config,
    validate_config,
)
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger
from axolotl.utils.mlflow_ import setup_mlflow_env_vars
from axolotl.utils.tee import prepare_debug_log
from axolotl.utils.trackio_ import setup_trackio_env_vars
from axolotl.utils.trainer import prepare_optim_env
from axolotl.utils.wandb_ import setup_wandb_env_vars

LOG = get_logger(__name__)


def _coerce_value(value: Any, existing: Optional[Any] = None) -> Any:
    """Coerce a string CLI value to its most likely Python type.

    If an existing value is present in the config, its type is used to guide
    casting.  Otherwise, YAML-style inference is applied: booleans, ints,
    floats, and None literals are recognised automatically.

    Args:
        value: The raw value (typically a string from the CLI).
        existing: An optional existing config value whose type guides coercion.

    Returns:
        The value cast to the inferred or expected type.
    """
    if not isinstance(value, str):
        return value

    # If the config already has a typed value, cast to match
    if existing is not None:
        if isinstance(existing, bool):
            return value.lower() in ("true", "1", "yes")
        if isinstance(existing, int):
            try:
                return int(value)
            except (ValueError, TypeError):
                return value
        if isinstance(existing, float):
            try:
                return float(value)
            except (ValueError, TypeError):
                return value
        # For other types (str, list, dict, etc.), return as-is
        return value

    # No existing value -- use YAML-style inference
    lower = value.lower()
    if lower in ("true", "yes"):
        return True
    if lower in ("false", "no"):
        return False
    if lower in ("null", "none", "~"):
        return None

    # Try int then float
    try:
        return int(value)
    except ValueError:
        pass
    try:
        return float(value)
    except ValueError:
        pass

    return value


API_KEY_FIELDS = {"comet_api_key"}

TELEMETRY_MANAGER = TelemetryManager.get_instance()


def check_remote_config(config: Union[str, Path]) -> Union[str, Path]:
    """
    First, determines if the passed config is a valid HTTPS URL. Then, attempts to query
    for it and parse its content, first as JSON, then as YAML (YAML is preferred).
    Finally, the parsed content is written to a local file and its path is returned.

    Args:
        config: HTTPS URL to a YAML or JSON file.

    Returns:
        Either the original `config` if it's not a valid HTTPS URL, or the path to the
        downloaded remote config.

    Raises:
        ValueError: If the remote configuration is neither valid JSON or YAML.
        RuntimeError: If some request-related exception occurs from the file download.
        Exception: Catch-all for any other exception.
    """
    # Check if the config is a valid HTTPS URL to a .yml or .yaml file
    if not (isinstance(config, str) and config.startswith("https://")):
        return config  # Return the original value if it's not a valid URL

    filename = os.path.basename(urlparse(config).path)
    temp_dir = tempfile.mkdtemp()

    try:
        response = requests.get(config, timeout=30)
        response.raise_for_status()  # Check for HTTP errors

        content = response.content
        try:
            # Try parsing as JSON first to catch cases where JSON content is mistakenly
            # considered YAML.
            json.loads(content)

            # Log a warning but do not raise an error; JSON is technically valid YAML.
            # This can happen when you forget to point to a raw GitHub link.
            LOG.warning(
                f"Warning: The content of the file at {config} is JSON, which is technically valid YAML but might not be intended."
            )
        except json.JSONDecodeError:
            # If it's not valid JSON, verify it's valid YAML
            try:
                yaml.safe_load(content)
            except yaml.YAMLError as err:
                raise ValueError(
                    f"Failed to parse the content at {config} as YAML: {err}"
                ) from err

        # Write the content to a file if it's valid YAML (or JSON treated as YAML)
        output_path = Path(temp_dir) / filename
        with open(output_path, "wb") as file:
            file.write(content)
        LOG.info(
            f"Using the following config obtained from {config}: \n\n{content.decode('utf-8')}\n"
        )
        return output_path

    except requests.RequestException as err:
        # This catches all requests-related exceptions including HTTPError
        raise RuntimeError(f"Failed to download {config}: {err}") from err
    except Exception as err:
        # Catch-all for any other exceptions
        raise err


def choose_config(path: Path) -> str:
    """
    Helper method for choosing a `axolotl` config YAML file (considering only files
    ending with `.yml` or `.yaml`). If more than one config file exists in the passed
    `path`, the user is prompted to choose one.

    Args:
        path: Directory in which config file(s) are stored.

    Returns:
        Path to either (1) the sole YAML file, or (2) if more than one YAML files exist,
        the user-selected YAML file.

    Raises:
        ValueError: If no YAML files are found in the given `path`.
    """
    yaml_files = list(path.glob("*.yml")) + list(path.glob("*.yaml"))

    if not yaml_files:
        raise ValueError(
            "No YAML config files found in the specified directory. Are you using a .yml extension?"
        )

    if len(yaml_files) == 1:
        LOG.info(f"Using default YAML file '{yaml_files[0]}'")
        return str(yaml_files[0])

    LOG.info("Choose a YAML file:")
    for idx, file in enumerate(yaml_files):
        LOG.info(f"{idx + 1}. {file}")

    chosen_file = None
    while chosen_file is None:
        try:
            choice = int(input("Enter the number of your choice: "))
            if 1 <= choice <= len(yaml_files):
                chosen_file = str(yaml_files[choice - 1])
            else:
                LOG.info("Invalid choice. Please choose a number from the list.")
        except ValueError:
            LOG.info("Invalid input. Please enter a number.")

    return chosen_file


def prepare_plugins(cfg: DictDefault):
    """
    Registers the plugins for the given configuration.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
    """
    if cfg.get("plugins"):
        plugin_manager = PluginManager.get_instance()
        for plugin_name in cfg["plugins"]:
            plugin_manager.register(plugin_name)
        for plugin in plugin_manager.plugins.values():
            plugin.register(cfg)


def plugin_set_cfg(cfg: DictDefault):
    if cfg.get("plugins"):
        plugin_manager = PluginManager.get_instance()
        plugin_manager.cfg = cfg


@send_errors
def load_cfg(
    config: str | Path | DictDefault = Path("examples/"), **kwargs
) -> DictDefault:
    """
    Loads the `axolotl` configuration stored at `config`, validates it, and performs
    various setup.

    Args:
        config: Path (local or remote) to `axolotl` config YAML file.
        kwargs: Additional keyword arguments to override config file values.

    Returns:
        `DictDefault` mapping configuration keys to values.
    """
    if isinstance(config, (str, Path)):
        config = check_remote_config(config)
        if Path(config).is_dir():
            config = choose_config(Path(config))

        # Load the config from the yaml file
        with open(config, encoding="utf-8") as file:
            cfg: DictDefault = DictDefault(yaml.safe_load(file))

        cfg.axolotl_config_path = config
    else:
        cfg = config
        with NamedTemporaryFile(
            mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
        ) as temp_file:
            temp_file.write(yaml.dump(config.to_dict()))
            temp_file.close()
        cfg.axolotl_config_path = temp_file.name

    TELEMETRY_MANAGER.send_event(event_type="config-loaded", properties=cfg)

    # If there are any options passed in the cli, if it is something that seems valid
    # from the yaml, then overwrite the value
    cfg_keys = cfg.keys()

    # Separate nested (dot-notation) kwargs from flat kwargs
    nested_kwargs: dict[str, dict[str, Any]] = {}
    flat_kwargs: dict[str, Any] = {}
    for key, value in kwargs.items():
        if "__" in key:
            parent, child = key.split("__", 1)
            nested_kwargs.setdefault(parent, {})[child] = value
        else:
            flat_kwargs[key] = value

    # Apply flat kwargs
    for key, value in flat_kwargs.items():
        # If not strict, allow writing to cfg even if it's not in the yml already
        if key in cfg_keys or not cfg.strict:
            cfg[key] = _coerce_value(value, cfg.get(key))

    # Apply nested kwargs (e.g., trl__beta -> cfg.trl.beta)
    for parent, children in nested_kwargs.items():
        if parent not in cfg_keys and cfg.strict:
            continue
        if cfg[parent] is None:
            cfg[parent] = {}
        if not isinstance(cfg[parent], dict):
            LOG.warning(
                "Overwriting non-dict value for '%s' with nested CLI overrides", parent
            )
            cfg[parent] = {}
        for child_key, child_value in children.items():
            existing_child = cfg[parent].get(child_key)
            cfg[parent][child_key] = _coerce_value(child_value, existing_child)

    try:
        device_props = torch.cuda.get_device_properties("cuda")
        gpu_version = "sm_" + str(device_props.major) + str(device_props.minor)
    except (RuntimeError, AssertionError):
        gpu_version = None

    prepare_plugins(cfg)

    cfg = validate_config(
        cfg,
        capabilities={
            "bf16": is_torch_bf16_gpu_available(),
            "fp8": compute_supports_fp8(),
            "tf32": is_torch_tf32_available(),
            "n_gpu": int(os.environ.get("WORLD_SIZE", 1)),
            "compute_capability": gpu_version,
        },
        env_capabilities={
            "torch_version": str(torch.__version__).split("+", maxsplit=1)[0]
        },
    )

    # NOTE(djsaunde): We start outputting to output_dir/debug.log at this point since we
    # have to wait for cfg.output to be resolved. We could call this earlier if we write
    # to a temporary file, and then move it later.
    prepare_debug_log(cfg)
    prepare_optim_env(cfg)
    normalize_config(cfg)
    normalize_cfg_datasets(cfg)
    setup_wandb_env_vars(cfg)
    setup_mlflow_env_vars(cfg)
    setup_comet_env_vars(cfg)
    setup_trackio_env_vars(cfg)
    plugin_set_cfg(cfg)

    TELEMETRY_MANAGER.send_event(event_type="config-processed", properties=cfg)
    cfg_to_log = {
        k: "[REDACTED]" if k in API_KEY_FIELDS else v
        for k, v in cfg.items()
        if v is not None
    }
    LOG.info(
        "config:\n%s",
        json.dumps(cfg_to_log, indent=2, default=str, sort_keys=True),
    )

    return cfg


def compute_supports_fp8() -> bool:
    try:
        compute_capability = torch.cuda.get_device_capability()
        return compute_capability >= (9, 0)
    except RuntimeError:
        return False


================================================
FILE: src/axolotl/cli/delinearize_llama4.py
================================================
"""
CLI tool to delinearize quantized/Linearized Llama-4 models.
"""

import os
from pathlib import Path
from typing import Generator, Union

import fire
import torch
from accelerate import init_empty_weights
from transformers import AutoProcessor


def iter_convert_patched_to_hf(model_state_dict, num_experts) -> Generator:
    keys = list(model_state_dict.keys())
    for key in keys:
        if ".feed_forward.experts." not in key:
            yield key, model_state_dict[key]
        if ".feed_forward.experts.gate_projs" in key:
            # gate gets fused with up so skip the yield on this and we'll fuse it when asking for the up
            continue
        if ".feed_forward.experts.up_projs" in key:
            if ".feed_forward.experts.up_projs.0." in key:
                # handle the re-shape and fusing of gate and up, and conversion from linear to parameter
                prefix = key.split(".up_projs.0.")[0]
                key = f"{prefix}.gate_up_proj"
                # grab all the up_projs and gate_projs across all experts
                gate_stacked = torch.stack(
                    [
                        model_state_dict[
                            f"{prefix}.gate_projs.{expert_idx}.weight"
                        ].transpose(0, 1)
                        for expert_idx in range(num_experts)
                    ]
                )
                up_stacked = torch.stack(
                    [
                        model_state_dict[
                            f"{prefix}.up_projs.{expert_idx}.weight"
                        ].transpose(0, 1)
                        for expert_idx in range(num_experts)
                    ]
                )
                gate_up_proj = torch.cat((gate_stacked, up_stacked), dim=-1)
                del gate_stacked, up_stacked
                yield key, gate_up_proj
            else:
                del model_state_dict[key]
                continue
        if ".feed_forward.experts.down_projs" in key:
            if ".feed_forward.experts.down_projs.0." in key:
                # handle the re-shape and fusing of gate and up, and conversion from linear to parameter
                prefix = key.split(".down_projs.0.")[0]
                key = f"{prefix}.down_proj"
                # grab all the down_projs across all experts
                down_stacked = torch.stack(
                    [
                        model_state_dict[
                            f"{prefix}.down_projs.{expert_idx}.weight"
                        ].transpose(0, 1)
                        for expert_idx in range(num_experts)
                    ]
                )
                yield key, down_stacked
            else:
                del model_state_dict[key]
                continue


def do_cli(model: Union[Path, str], output: Union[Path, str]) -> None:
    """
    Convert a patched HF format Llama4 model (with separated projections)
    back to the original HF format (with fused projections).

    Args:
        model: Path to the patched HF model
        output: Path to save the converted model
    """
    print(f"Loading model from {model}")
    from axolotl.monkeypatch.models.llama4.modeling import (
        patch_llama4_linearized_modeling,
    )

    unpatch_llama4 = patch_llama4_linearized_modeling()
    from transformers import Llama4ForConditionalGeneration

    model_ = Llama4ForConditionalGeneration.from_pretrained(model, dtype=torch.bfloat16)
    processor = AutoProcessor.from_pretrained(model)
    processor.save_pretrained(output)

    device = model_.device.type
    if device == "cuda":
        print(
            f"peak memory allocated: {torch.cuda.max_memory_allocated() / 1024**2} MB"
        )
        print(f"peak memory reserved: {torch.cuda.max_memory_reserved() / 1024**2} MB")
    model_config = model_.config
    config = model_.config.get_text_config()

    # Get key dimensions from the config
    hidden_size = config.hidden_size
    intermediate_size = config.intermediate_size
    num_experts = config.num_local_experts

    print(
        f"Model dimensions: hidden_size={hidden_size}, intermediate_size={intermediate_size}, num_experts={num_experts}"
    )

    # Create output directory if it doesn't exist
    os.makedirs(output, exist_ok=True)

    # Get state dict
    state_dict = model_.state_dict()
    del model_

    # Create a new state dict for the converted model
    converted_state_dict = {}

    # First, copy all keys that don't need modification
    for key, value in iter_convert_patched_to_hf(state_dict, num_experts):
        converted_state_dict[key] = value

    del state_dict
    if device == "cuda":
        torch.cuda.empty_cache()
        print("State dict converted.")
        print(
            f"peak memory allocated: {torch.cuda.max_memory_allocated() / 1024**2} MB"
        )
        print(f"peak memory reserved: {torch.cuda.max_memory_reserved() / 1024**2} MB")
    # Ideally re-load the model import to load the converted state dict
    # Save the converted model
    with init_empty_weights():
        unpatch_llama4()
        model_ = Llama4ForConditionalGeneration(model_config)

    if device == "cuda":
        print("State dict loaded into model.")
        print(
            f"peak memory allocated: {torch.cuda.max_memory_allocated() / 1024**2} MB"
        )
        print(f"peak memory reserved: {torch.cuda.max_memory_reserved() / 1024**2} MB")
    model_.load_state_dict(converted_state_dict, strict=False, assign=True)
    print(f"Saving converted model to {output}...")
    model_.save_pretrained(output)

    print(f"Model successfully converted and saved to {output}")


if __name__ == "__main__":
    fire.Fire(do_cli)


================================================
FILE: src/axolotl/cli/evaluate.py
================================================
"""CLI to run evaluation on a model."""

import os
from pathlib import Path
from typing import Union

import fire
from transformers.hf_argparser import HfArgumentParser

from axolotl.cli.args import TrainerCliArgs
from axolotl.cli.checks import check_accelerate_default_config, check_user_token
from axolotl.cli.config import load_cfg
from axolotl.common.datasets import load_datasets, load_preference_datasets
from axolotl.evaluate import evaluate
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def do_evaluate(cfg: DictDefault, cli_args: TrainerCliArgs) -> None:
    """
    Evaluates a `transformers` model by first loading the dataset(s) specified in the
    `axolotl` config, and then calling `axolotl.evaluate.evaluate`, which computes
    evaluation metrics on the given dataset(s) and writes them to disk.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: CLI arguments.
    """

    check_accelerate_default_config()
    if int(os.getenv("LOCAL_RANK", "0")) == 0:
        check_user_token()

    if cfg.rl:
        dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)
    else:
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

    evaluate(cfg=cfg, dataset_meta=dataset_meta)


def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None:
    """
    Parses `axolotl` config, CLI args, and calls `do_evaluate`.

    Args:
        config: Path to `axolotl` config YAML file.
        kwargs: Additional keyword arguments to override config file values.
    """

    parsed_cfg = load_cfg(config, **kwargs)
    parser = HfArgumentParser(TrainerCliArgs)
    parsed_cli_args, _ = parser.parse_args_into_dataclasses(
        return_remaining_strings=True
    )
    do_evaluate(parsed_cfg, parsed_cli_args)


if __name__ == "__main__":
    fire.Fire(do_cli)


================================================
FILE: src/axolotl/cli/inference.py
================================================
"""CLI to run inference on a trained model."""

import importlib
import sys
from pathlib import Path
from threading import Thread
from typing import Union

import fire
import torch
import transformers
from transformers import GenerationConfig, TextIteratorStreamer, TextStreamer

from axolotl.cli.args import InferenceCliArgs
from axolotl.cli.config import load_cfg
from axolotl.cli.utils import load_model_and_tokenizer
from axolotl.cli.utils.diffusion import (
    diffusion_inference,
    launch_diffusion_gradio_ui,
)
from axolotl.integrations.base import PluginManager
from axolotl.telemetry.errors import send_errors
from axolotl.utils.chat_templates import (
    get_chat_template_from_config,
)
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def get_multi_line_input() -> str:
    """
    Gets multi-line input from terminal.

    Returns:
        Possibly multi-line, possibly empty stdin input as a string.
    """
    print("Give me an instruction (Ctrl + D to submit): ")
    print("=" * 80)

    instruction = ""
    for line in sys.stdin:
        instruction += line

    return instruction


@send_errors
def do_inference(
    *,
    cfg: DictDefault,
    cli_args: InferenceCliArgs,
):
    """
    Runs inference on the command line in a loop. User input is accepted, a chat
    template is (optionally) applied, and the model specified in the `axolotl` config is
    used to generate completions according to a default generation config.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: Inference-specific CLI arguments.
    """
    model, tokenizer, _ = load_model_and_tokenizer(cfg=cfg, inference=True)
    prompter = cli_args.prompter

    prompter_module = None
    chat_template_str = None
    if prompter:
        prompter_module = getattr(
            importlib.import_module("axolotl.prompters"), prompter
        )
    elif cfg.chat_template:
        chat_template_str = get_chat_template_from_config(
            cfg, ds_cfg=None, tokenizer=tokenizer
        )
    elif cfg.datasets and cfg.datasets[0].type == "chat_template":
        chat_template_str = get_chat_template_from_config(
            cfg=cfg, ds_cfg=cfg.datasets[0], tokenizer=tokenizer
        )

    model = model.to(cfg.device, dtype=cfg.torch_dtype)

    # Detect diffusion mode
    plugin_manager = PluginManager.get_instance()
    is_diffusion = any(
        plugin.__class__.__name__ == "DiffusionPlugin"
        for plugin in plugin_manager.plugins.values()
    )

    if is_diffusion:
        print("=" * 80)
        print("Commands:")
        print(":complete N -> completion mode with N tokens (default 64)")
        print(":mask R     -> random masking with ratio R (0.0–1.0)")

    while True:
        print("=" * 80)
        instruction = get_multi_line_input()
        if not instruction:
            return

        if prompter_module:
            prompt: str = next(
                prompter_module().build_prompt(instruction=instruction.strip("\n"))
            )
        else:
            prompt = instruction.strip()

        if chat_template_str:
            batch = tokenizer.apply_chat_template(
                [
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                return_tensors="pt",
                add_special_tokens=True,
                add_generation_prompt=True,
                chat_template=chat_template_str,
                tokenize=True,
                return_dict=True,
            )
        else:
            batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

        print("=" * 80)
        model.eval()
        with torch.no_grad():
            if is_diffusion:
                diffusion_inference(
                    model=model,
                    tokenizer=tokenizer,
                    cfg=cfg,
                    prompt=prompt,
                    chat_template_str=chat_template_str,
                )
                continue

            generation_config = GenerationConfig(
                repetition_penalty=1.1,
                max_new_tokens=1024,
                temperature=0.9,
                top_p=0.95,
                top_k=40,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=True,
                use_cache=True,
                return_dict_in_generate=True,
                output_attentions=False,
                output_hidden_states=False,
                output_scores=False,
            )
            streamer = TextStreamer(tokenizer)
            generated = model.generate(
                inputs=batch["input_ids"].to(cfg.device),
                generation_config=generation_config,
                streamer=streamer,
            )
        print("=" * 80)
        print(tokenizer.decode(generated["sequences"].cpu().tolist()[0]))


@send_errors
def do_inference_gradio(
    *,
    cfg: DictDefault,
    cli_args: InferenceCliArgs,
):
    """
    Runs inference in a Gradio interface. User input is accepted, a chat template is
    (optionally) applied, and the model specified in the `axolotl` config is used to
    generate completions according to a default generation config.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: Inference-specific CLI arguments.
    """
    import gradio as gr

    model, tokenizer, _ = load_model_and_tokenizer(cfg=cfg, inference=True)
    prompter = cli_args.prompter

    prompter_module = None
    chat_template_str = None
    if prompter:
        prompter_module = getattr(
            importlib.import_module("axolotl.prompters"), prompter
        )
    elif cfg.chat_template:
        chat_template_str = get_chat_template_from_config(
            cfg, ds_cfg=None, tokenizer=tokenizer
        )
    elif cfg.datasets and cfg.datasets[0].type == "chat_template":
        chat_template_str = get_chat_template_from_config(
            cfg=cfg, ds_cfg=cfg.datasets[0], tokenizer=tokenizer
        )

    model = model.to(cfg.device, dtype=cfg.torch_dtype)

    # Detect diffusion mode
    plugin_manager = PluginManager.get_instance()
    is_diffusion = any(
        plugin.__class__.__name__ == "DiffusionPlugin"
        for plugin in plugin_manager.plugins.values()
    )

    if is_diffusion:
        launch_diffusion_gradio_ui(
            model=model,
            tokenizer=tokenizer,
            cfg=cfg,
            prompter_module=prompter_module,
            chat_template_str=chat_template_str,
        )
        return

    def generate(instruction):
        if not instruction:
            return
        if prompter_module:
            prompt: str = next(
                prompter_module().build_prompt(instruction=instruction.strip("\n"))
            )
        else:
            prompt = instruction.strip()

        if chat_template_str:
            batch = tokenizer.apply_chat_template(
                [
                    {
                        "role": "user",
                        "content": prompt,
                    }
                ],
                return_tensors="pt",
                add_special_tokens=True,
                add_generation_prompt=True,
                chat_template=chat_template_str,
                tokenize=True,
                return_dict=True,
            )
        else:
            batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

        model.eval()
        with torch.no_grad():
            generation_config = GenerationConfig(
                repetition_penalty=1.1,
                max_new_tokens=cfg.get("gradio_max_new_tokens", 1024),
                temperature=cfg.get("gradio_temperature", 0.9),
                top_p=0.95,
                top_k=40,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=True,
                use_cache=True,
                return_dict_in_generate=True,
                output_attentions=False,
                output_hidden_states=False,
                output_scores=False,
            )
            streamer = TextIteratorStreamer(tokenizer)
            generation_kwargs = {
                "inputs": batch["input_ids"].to(cfg.device),
                "attention_mask": batch["attention_mask"].to(cfg.device),
                "generation_config": generation_config,
                "streamer": streamer,
            }

            thread = Thread(target=model.generate, kwargs=generation_kwargs)
            thread.start()

            all_text = ""

            for new_text in streamer:
                all_text += new_text
                yield all_text

    demo = gr.Interface(
        fn=generate,
        inputs="textbox",
        outputs="text",
        title=cfg.get("gradio_title", "Axolotl Gradio Interface"),
    )

    demo.launch(
        footer_links=["gradio", "settings"],
        share=cfg.get("gradio_share", True),
        server_name=cfg.get("gradio_server_name", "127.0.0.1"),
        server_port=cfg.get("gradio_server_port", None),
    )


def do_cli(
    config: Union[Path, str] = Path("examples/"), gradio: bool = False, **kwargs
) -> None:
    """
    Parses axolotl config, CLI args, and calls `do_inference` or `do_inference_gradio`.

    Args:
        config: Path to `axolotl` config YAML file.
        kwargs: Additional keyword arguments to override config file values.
    """

    parsed_cfg = load_cfg(config, inference=True, rl=None, **kwargs)
    parsed_cfg.sample_packing = False
    parser = transformers.HfArgumentParser(InferenceCliArgs)
    parsed_cli_args, _ = parser.parse_args_into_dataclasses(
        return_remaining_strings=True
    )

    if gradio:
        do_inference_gradio(cfg=parsed_cfg, cli_args=parsed_cli_args)
    else:
        do_inference(cfg=parsed_cfg, cli_args=parsed_cli_args)


if __name__ == "__main__":
    fire.Fire(do_cli)


================================================
FILE: src/axolotl/cli/main.py
================================================
"""Click CLI definitions for various axolotl commands."""

import os
import subprocess  # nosec B404
from typing import Literal, Optional

import click
from dotenv import load_dotenv

import axolotl
from axolotl.cli.args import (
    EvaluateCliArgs,
    PreprocessCliArgs,
    QuantizeCliArgs,
    TrainerCliArgs,
    VllmServeCliArgs,
)
from axolotl.cli.art import print_axolotl_text_art
from axolotl.cli.utils import (
    add_options_from_config,
    add_options_from_dataclass,
    build_command,
    fetch_from_github,
    filter_none_kwargs,
    generate_config_files,
    launch_training,
)
from axolotl.integrations.lm_eval.cli import lm_eval
from axolotl.utils import set_misc_env, set_pytorch_cuda_alloc_conf
from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.config import AxolotlInputConfig

LOG = get_logger(__name__)

LAUNCHER_COMMAND_MAPPING = {
    "accelerate": ["accelerate", "launch"],
    "torchrun": ["torchrun"],
}


@click.group()
@click.version_option(version=axolotl.__version__, prog_name="axolotl")
def cli():
    """Axolotl CLI - Train and fine-tune large language models"""
    print_axolotl_text_art()
    load_dotenv()
    set_pytorch_cuda_alloc_conf()
    set_misc_env()


@cli.command()
@click.argument("config", type=click.Path(exists=True, path_type=str))
@click.option("--cloud", default=None, type=click.Path(exists=True, path_type=str))
@add_options_from_dataclass(PreprocessCliArgs)
@add_options_from_config(AxolotlInputConfig)
@filter_none_kwargs
def preprocess(config: str, cloud: Optional[str] = None, **kwargs):
    """
    Preprocess datasets before training.

    Args:
        config: Path to `axolotl` config YAML file.
        cloud: Path to a cloud accelerator configuration file.
        kwargs: Additional keyword arguments which correspond to CLI args or `axolotl`
            config options.
    """

    if cloud:
        from axolotl.cli.cloud import do_cli_preprocess

        do_cli_preprocess(cloud_config=cloud, config=config)
    else:
        from axolotl.cli.preprocess import do_cli

        do_cli(config=config, **kwargs)


@cli.command(
    context_settings={"ignore_unknown_options": True, "allow_extra_args": True}
)
@click.argument("config", type=click.Path(exists=True, path_type=str))
@click.option(
    "--launcher",
    type=click.Choice(["accelerate", "torchrun", "python"]),
    default="accelerate",
    help="Launcher to use for multi-GPU training",
)
@click.option("--cloud", default=None, type=click.Path(exists=True, path_type=str))
@click.option(
    "--sweep",
    type=click.Path(exists=True, path_type=str),
    help="YAML config for sweeping hyperparameters",
)
@add_options_from_dataclass(TrainerCliArgs)
@add_options_from_config(AxolotlInputConfig)
@filter_none_kwargs
@click.pass_context
def train(
    ctx: click.Context,
    config: str,
    launcher: Literal["accelerate", "torchrun", "python"] = "accelerate",
    cloud: str | None = None,
    sweep: str | None = None,
    **kwargs,
):
    """
    Train or fine-tune a model.

    Args:
        ctx: Click context for extra args.
        config: Path to `axolotl` config YAML file.
        launcher: Launcher to use for multi-GPU training ("accelerate", "torchrun", or "python").
        cloud: Path to a cloud accelerator configuration file
        sweep: Path to YAML config for sweeping hyperparameters.
        kwargs: Additional keyword arguments which correspond to CLI args or `axolotl`
            config options.
    """
    # Extract launcher args from extra args (after --)
    launcher_args = ctx.args if ctx.args else []

    # Handle Ray launcher override
    _launcher = None if kwargs.get("use_ray") else launcher

    # Process each configuration
    for cfg_file, is_group in generate_config_files(config, sweep):
        try:
            use_exec = is_group is not True
            launch_training(cfg_file, _launcher, cloud, kwargs, launcher_args, use_exec)
        except subprocess.CalledProcessError as exc:
            LOG.error(f"Failed to train/fine-tune config '{cfg_file}': {exc}")
            if not sweep:
                raise exc
        finally:
            # Only delete temp files, not the original config
            if cfg_file != config:
                os.unlink(cfg_file)


@cli.command(
    context_settings={"ignore_unknown_options": True, "allow_extra_args": True}
)
@click.argument("config", type=click.Path(exists=True, path_type=str))
@click.option(
    "--launcher",
    type=click.Choice(["accelerate", "torchrun", "python"]),
    default="accelerate",
    help="Launcher to use for multi-GPU evaluation",
)
@add_options_from_dataclass(EvaluateCliArgs)
@add_options_from_config(AxolotlInputConfig)
@filter_none_kwargs
@click.pass_context
def evaluate(ctx: click.Context, config: str, launcher: str, **kwargs):
    """
    Evaluate a model.

    Args:
        ctx: Click context for extra args.
        config: Path to `axolotl` config YAML file.
        launcher: Launcher to use for multi-GPU evaluation ("accelerate", "torchrun", or "python").
        kwargs: Additional keyword arguments which correspond to CLI args or `axolotl`
            config options.
    """
    # Extract launcher args from extra args (after --)
    launcher_args = ctx.args if ctx.args else []

    if launcher in LAUNCHER_COMMAND_MAPPING:
        base_cmd = (
            LAUNCHER_COMMAND_MAPPING[launcher]
            + launcher_args
            + ["-m", "axolotl.cli.evaluate"]
        )
        if config:
            base_cmd.append(config)
        cmd = build_command(base_cmd, kwargs)
        subprocess.run(cmd, check=True)  # nosec B603
    else:
        from axolotl.cli.evaluate import do_cli

        do_cli(config=config, **kwargs)


@cli.command(
    context_settings={"ignore_unknown_options": True, "allow_extra_args": True}
)
@click.argument("config", type=click.Path(exists=True, path_type=str))
@click.option(
    "--launcher",
    type=click.Choice(["accelerate", "torchrun", "python"]),
    default="accelerate",
    help="Launcher to use for multi-GPU inference",
)
@click.option("--gradio", is_flag=True, help="Launch Gradio interface")
@add_options_from_dataclass(TrainerCliArgs)
@add_options_from_config(AxolotlInputConfig)
@filter_none_kwargs
@click.pass_context
def inference(ctx: click.Context, config: str, launcher: str, gradio: bool, **kwargs):
    """
    Run inference with a trained model.

    Args:
        ctx: Click context for extra args.
        config: Path to `axolotl` config YAML file.
        launcher: Launcher to use for multi-GPU inference ("accelerate", "torchrun", or "python").
        gradio: Whether to use Gradio browser interface or command line for inference.
        kwargs: Additional keyword arguments which correspond to CLI args or `axolotl`
            config options.
    """
    # Extract launcher args from extra args (after --)
    launcher_args = ctx.args if ctx.args else []

    if launcher in LAUNCHER_COMMAND_MAPPING:
        base_cmd = (
            LAUNCHER_COMMAND_MAPPING[launcher]
            + launcher_args
            + ["-m", "axolotl.cli.inference"]
        )
        if config:
            base_cmd.append(config)
        if gradio:
            base_cmd.append("--gradio")
        cmd = build_command(base_cmd, kwargs)
        subprocess.run(cmd, check=True)  # nosec B603
    else:
        from axolotl.cli.inference import do_cli

        do_cli(config=config, gradio=gradio, **kwargs)


@cli.command(
    context_settings={"ignore_unknown_options": True, "allow_extra_args": True}
)
@click.argument("config", type=click.Path(exists=True, path_type=str))
@click.option(
    "--launcher",
    type=click.Choice(["accelerate", "torchrun", "python"]),
    default="accelerate",
    help="Launcher to use for weight merging",
)
@add_options_from_dataclass(TrainerCliArgs)
@add_options_from_config(AxolotlInputConfig)
@filter_none_kwargs
@click.pass_context
def merge_sharded_fsdp_weights(
    ctx: click.Context, config: str, launcher: str, **kwargs
):
    """
    Merge sharded FSDP model weights.

    Args:
        ctx: Click context for extra args.
        config: Path to `axolotl` config YAML file.
        launcher: Launcher to use for weight merging ("accelerate", "torchrun", or "python").
        kwargs: Additional keyword arguments which correspond to CLI args or `axolotl`
            config options.
    """
    # Extract launcher args from extra args (after --)
    launcher_args = ctx.args if ctx.args else []

    if launcher in LAUNCHER_COMMAND_MAPPING:
        base_cmd = (
            LAUNCHER_COMMAND_MAPPING[launcher]
            + launcher_args
            + ["-m", "axolotl.cli.merge_sharded_fsdp_weights"]
        )
        if config:
            base_cmd.append(config)
        cmd = build_command(base_cmd, kwargs)
        subprocess.run(cmd, check=True)  # nosec B603
    else:
        from axolotl.cli.merge_sharded_fsdp_weights import do_cli

        do_cli(config=config, **kwargs)


@cli.command()
@click.argument("config", type=click.Path(exists=True, path_type=str))
@add_options_from_dataclass(TrainerCliArgs)
@add_options_from_config(AxolotlInputConfig)
@filter_none_kwargs
def merge_lora(config: str, **kwargs):
    """
    Merge trained LoRA adapters into a base model.

    Args:
        config: Path to `axolotl` config YAML file.
        kwargs: Additional keyword arguments which correspond to CLI args or `axolotl`
            config options.
    """
    from axolotl.cli.merge_lora import do_cli

    do_cli(config=config, **kwargs)


@cli.command()
@click.argument("directory", type=click.Choice(["examples", "deepspeed_configs"]))
@click.option("--dest", help="Destination directory")
def fetch(directory: str, dest: Optional[str]):
    """
    Fetch example configs or other resources.

    Available directories:
    - examples: Example configuration files
    - deepspeed_configs: DeepSpeed configuration files

    Args:
        directory: One of `examples`, `deepspeed_configs`.
        dest: Optional destination directory.
    """
    fetch_from_github(f"{directory}/", dest)


@cli.command()
@click.argument("config", type=click.Path(exists=True, path_type=str))
@add_options_from_dataclass(VllmServeCliArgs)
@filter_none_kwargs
def vllm_serve(config: str, **cli_args: VllmServeCliArgs):
    from axolotl.cli.vllm_serve import do_vllm_serve

    do_vllm_serve(config, cli_args)


@cli.command()
@click.argument("config", type=click.Path(exists=True, path_type=str))
@add_options_from_dataclass(QuantizeCliArgs)
@filter_none_kwargs
def quantize(config: str, **cli_args: QuantizeCliArgs):
    from axolotl.cli.quantize import do_quantize

    do_quantize(config, cli_args)


@cli.command()
@click.argument("model", type=click.Path(exists=True, path_type=str))
@click.argument("output", type=click.Path(exists=False, path_type=str))
def delinearize_llama4(model: str, output: str):
    from axolotl.cli.delinearize_llama4 import do_cli as do_delinearize_llama4

    do_delinearize_llama4(model, output)


cli.add_command(lm_eval)


def main():
    cli()


if __name__ == "__main__":
    main()


================================================
FILE: src/axolotl/cli/merge_lora.py
================================================
"""CLI to merge a trained LoRA into a base model."""

from pathlib import Path
from typing import Union

import fire

from axolotl.cli.config import load_cfg
from axolotl.cli.utils import load_model_and_tokenizer
from axolotl.telemetry.errors import send_errors
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


@send_errors
def do_merge_lora(*, cfg: DictDefault) -> None:
    """
    Calls `transformers`' `merge_and_unload` on the model given in the `axolotl` config
    along with the LoRA adapters to combine them into a single base model.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
    """
    model, tokenizer, processor = load_model_and_tokenizer(cfg=cfg)

    LOG.info("Running merge of LoRA with base model...")
    model = model.merge_and_unload(progressbar=True)
    try:
        model.to(dtype=cfg.torch_dtype)
    except ValueError as e:
        LOG.warning("Failed to convert model to dtype %s", cfg.torch_dtype)
        LOG.warning("Ignore this if the base_model is pre-quantized.")
        LOG.warning("Error raised: %s", e)

    model.generation_config.do_sample = True
    model.config.use_cache = True

    if cfg.local_rank == 0:
        LOG.info(f"Saving merged model to: {str(Path(cfg.output_dir) / 'merged')}...")
        model.save_pretrained(
            str(Path(cfg.output_dir) / "merged"),
            progressbar=True,
        )
        tokenizer.save_pretrained(
            str(Path(cfg.output_dir) / "merged"),
            save_jinja_files=cfg.tokenizer_save_jinja_files,
        )

        if processor:
            processor.save_pretrained(str(Path(cfg.output_dir) / "merged"))


def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs) -> None:
    """
    Parses `axolotl` config, CLI args, and calls `do_merge_lora`. Note that various
    config values will be overwritten to allow the LoRA merge logic to work as expected
    (`load_in_8bit=False`, `load_in4bit=False`, `flash_attention=False`, etc.).

    Args:
        config: Path to `axolotl` config YAML file.
        kwargs: Additional keyword arguments to override config file values.

    Raises:
        ValueError: If target directory for LoRA merged model does not exist.
    """

    parsed_cfg = load_cfg(
        config,
        merge_lora=True,
        load_in_8bit=False,
        load_in_4bit=False,
        quantize_moe_experts=False,
        flash_attention=False,
        context_parallel_size=None,
        deepspeed=None,
        fsdp=None,
        fsdp_config=None,
        **kwargs,
    )

    if not parsed_cfg.lora_model_dir and parsed_cfg.output_dir:
        parsed_cfg.lora_model_dir = parsed_cfg.output_dir
    if not Path(parsed_cfg.lora_model_dir).exists():
        raise ValueError(
            f"Target directory for merge: `{parsed_cfg.lora_model_dir}` does not exist."
        )

    do_merge_lora(cfg=parsed_cfg)


if __name__ == "__main__":
    fire.Fire(do_cli)


================================================
FILE: src/axolotl/cli/merge_sharded_fsdp_weights.py
================================================
"""CLI to merge sharded FSDP model checkpoints into a single combined checkpoint."""

import json
import os
import shutil
from pathlib import Path
from typing import Dict, Union

import fire
import torch
import torch.distributed.checkpoint as dist_cp
import torch.distributed.checkpoint.format_utils as dist_cp_format_utils
from accelerate import PartialState
from accelerate.utils import (
    SAFE_WEIGHTS_INDEX_NAME,
    SAFE_WEIGHTS_NAME,
    is_torch_version,
)
from huggingface_hub import split_torch_state_dict_into_shards
from safetensors.torch import save_file as safe_save_file
from torch.distributed.checkpoint.format_utils import _EmptyStateDictLoadPlanner

from axolotl.cli.config import load_cfg
from axolotl.telemetry.errors import send_errors
from axolotl.utils.logging import get_logger
from axolotl.utils.train import determine_last_checkpoint

LOG = get_logger(__name__)


class BFloat16CastPlanner(_EmptyStateDictLoadPlanner):
    """A custom planner to cast tensors to bfloat16 on the fly during loading."""

    def commit_tensor(self, read_item, tensor):
        tensor.copy_(tensor.to(torch.bfloat16))


def _distributed_checkpoint_to_merged_weights(
    checkpoint_dir: Union[str, Path],
    save_path: str,
    max_shard_size: str = "5GB",
) -> Path:
    """
    Passthrough to `torch.distributed.checkpoint.format_utils.dcp_to_torch_save`. Will
    save under `save_path` as `model.safetensors`.

    Args:
        checkpoint_dir: Directory where distributed checkpoint is saved.
        save_path: Path to save model to.
        max_shard_size: Max size of model shards to save.

    Returns:
        Path where model is saved.
    """

    state_dict: Dict = {}
    save_path_ = Path(save_path)
    save_path_.mkdir(exist_ok=True)
    dist_cp_format_utils._load_state_dict(
        state_dict,
        storage_reader=dist_cp.FileSystemReader(checkpoint_dir),
        planner=BFloat16CastPlanner(),
        no_dist=True,
    )

    # To handle if state is a dict like {model: {...}}
    if len(state_dict.keys()) == 1:
        state_dict = state_dict[list(state_dict)[0]]

    # Ensure all tensors are in bfloat16
    for key, value in state_dict.items():
        if isinstance(value, torch.Tensor) and value.dtype != torch.bfloat16:
            state_dict[key] = value.to(torch.bfloat16)

    filename_pattern = SAFE_WEIGHTS_NAME.replace(".safetensors", "{suffix}.safetensors")
    state_dict_split = split_torch_state_dict_into_shards(
        state_dict, filename_pattern=filename_pattern, max_shard_size=max_shard_size
    )

    # Save index if sharded
    index = None
    if state_dict_split.is_sharded:
        index = {
            "metadata": state_dict_split.metadata,
            "weight_map": state_dict_split.tensor_to_filename,
        }

    # Save the model
    filename_to_tensors = state_dict_split.filename_to_tensors.items()

    for shard_file, tensors in filename_to_tensors:
        shard = {tensor: state_dict[tensor] for tensor in tensors}
        safe_save_file(
            shard, os.path.join(save_path_, shard_file), metadata={"format": "pt"}
        )

    if index is not None:
        save_index_file = os.path.join(save_path_, SAFE_WEIGHTS_INDEX_NAME)
        # Save the index as well
        with open(save_index_file, "w", encoding="utf-8") as fout:
            content = json.dumps(index, indent=2, sort_keys=True) + "\n"
            fout.write(content)

    return save_path_


@send_errors
def merge_fsdp_weights(
    checkpoint_dir: str,
    output_path: str,
    remove_checkpoint_dir: bool = False,
):
    """
    Merge the weights from sharded FSDP model checkpoints into a single combined checkpoint. Should be used if
    `SHARDED_STATE_DICT` was used for the model. Weights will be saved to `{output_path}/model.safetensors`.

    Note: this is a CPU-bound process.

    Args:
        checkpoint_dir (`str`):
            The directory containing the FSDP checkpoints (can be either the model or optimizer).
        output_path (`str`):
            The path to save the merged checkpoint.
        remove_checkpoint_dir (`bool`, *optional*, defaults to `False`):
            Whether to remove the checkpoint directory after merging.

    Raises:
        ValueError: If torch version < 2.3.0, or if `checkpoint_dir` does not exist.
    """
    checkpoint_dir_ = Path(checkpoint_dir)

    if not is_torch_version(">=", "2.3.0"):
        raise ValueError("`merge_fsdp_weights` requires PyTorch >= 2.3.0`")

    # Verify that the checkpoint directory exists
    if not checkpoint_dir_.exists():
        model_path_exists = (checkpoint_dir_ / "pytorch_model_fsdp_0").exists()
        optimizer_path_exists = (checkpoint_dir_ / "optimizer_0").exists()
        err = f"Tried to load from {checkpoint_dir_} but couldn't find a valid metadata file."
        if model_path_exists and optimizer_path_exists:
            err += (
                " However, potential model and optimizer checkpoint directories exist."
            )
            err += f"Please pass in either {checkpoint_dir_}/pytorch_model_fsdp_0 or {checkpoint_dir_}/optimizer_0"
            err += "instead."
        elif model_path_exists:
            err += " However, a potential model checkpoint directory exists."
            err += (
                f"Please try passing in {checkpoint_dir_}/pytorch_model_fsdp_0 instead."
            )
        elif optimizer_path_exists:
            err += " However, a potential optimizer checkpoint directory exists."
            err += f"Please try passing in {checkpoint_dir_}/optimizer_0 instead."
        raise ValueError(err)

    # To setup `save` to work
    state = PartialState()
    if state.is_main_process:
        LOG.info(f"Merging FSDP weights from {checkpoint_dir_}")
        save_path = _distributed_checkpoint_to_merged_weights(
            checkpoint_dir_, output_path
        )
        LOG.info(f"Successfully merged FSDP weights and saved to {save_path}")
        if remove_checkpoint_dir:
            LOG.info(f"Removing old checkpoint directory {checkpoint_dir_}")
            shutil.rmtree(checkpoint_dir_)


def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
    """
    Parses `axolotl` config, CLI args, and calls `merge_fsdp_weights`.

    Args:
        config: Path to `axolotl` config YAML file.
        kwargs: Additional keyword arguments to override config file values.
    """

    parsed_cfg = load_cfg(config, **kwargs)

    fsdp_dir = Path(parsed_cfg.output_dir) / "pytorch_model_fsdp_0"
    if not fsdp_dir.exists():
        checkpoint_dir = determine_last_checkpoint(parsed_cfg, update=False)
        if checkpoint_dir:
            fsdp_dir = Path(checkpoint_dir) / "pytorch_model_fsdp_0"
        if not fsdp_dir.exists():
            raise ValueError(
                f"Could not find FSDP checkpoint `pytorch_model_fsdp_0` in {checkpoint_dir}"
            )

    output_path = str(Path(parsed_cfg.output_dir) / "merged")
    merge_fsdp_weights(
        checkpoint_dir=str(fsdp_dir),
        output_path=output_path,
    )
    state = PartialState()
    state.wait_for_everyone()
    LOG.info(
        f"FSDP SHARDED_STATE_DICT weights successfully merged to: {output_path}",
    )
    LOG.info(
        "Merged weights are only the safetensors and doesn't include the model configuration "
        f"or tokenizer which may be found in {parsed_cfg.output_dir}.",
    )


if __name__ == "__main__":
    fire.Fire(do_cli)


================================================
FILE: src/axolotl/cli/preprocess.py
================================================
"""CLI to run preprocessing of a dataset."""

import os
import warnings
from pathlib import Path
from typing import Union

import fire
import transformers
from accelerate import init_empty_weights
from colorama import Fore
from transformers import AutoModelForCausalLM

from axolotl.cli.args import PreprocessCliArgs
from axolotl.cli.checks import check_accelerate_default_config, check_user_token
from axolotl.cli.config import load_cfg
from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
from axolotl.common.datasets import load_datasets, load_preference_datasets
from axolotl.integrations.base import PluginManager
from axolotl.telemetry.errors import send_errors
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger
from axolotl.utils.trainer import disable_datasets_caching

LOG = get_logger(__name__)


@send_errors
def do_preprocess(cfg: DictDefault, cli_args: PreprocessCliArgs) -> None:
    """
    Preprocesses dataset specified in axolotl config.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: Preprocessing-specific CLI arguments.
    """
    check_accelerate_default_config()
    check_user_token()

    if cli_args.iterable:
        LOG.error(
            "The --iterable CLI argument for 'axolotl preprocess' is no longer "
            "supported. For training, set 'streaming: true' in your YAML config or "
            "pass '--streaming' in your 'axolotl train' command for on-the-fly "
            "preprocessing."
        )
        return

    for key in ["skip_prepare_dataset", "pretraining_dataset"]:
        if cfg.get(key):
            LOG.error(
                f"You have set `{key}:`. `preprocess` is not needed. Run the 'axolotl "
                "train' CLI directly instead."
            )
            return

    if not cfg.dataset_prepared_path:
        msg = (
            Fore.RED
            + "preprocess CLI called without dataset_prepared_path set, "
            + f"using default path: {DEFAULT_DATASET_PREPARED_PATH}"
            + Fore.RESET
        )
        LOG.warning(msg)
        cfg.dataset_prepared_path = DEFAULT_DATASET_PREPARED_PATH

    with disable_datasets_caching():
        plugin_manager = PluginManager.get_instance()
        if plugin_manager.load_datasets(cfg, preprocess=True):
            pass
        elif cfg.rl:
            load_preference_datasets(cfg=cfg, cli_args=cli_args)
        else:
            load_datasets(cfg=cfg, cli_args=cli_args)

    if cli_args.download:
        model_name = cfg.base_model
        with warnings.catch_warnings():
            # there are a bunch of useless UserWarnings about
            # "copying from a non-meta parameter in the checkpoint to a meta parameter in the current model"
            warnings.simplefilter("ignore")
            with init_empty_weights(include_buffers=True):
                # fmt: off
                try:
                    AutoModelForCausalLM.from_pretrained(
                        model_name, trust_remote_code=True
                    )
                except Exception:  # nosec B110
                    pass
                # fmt: on

    LOG.info(
        Fore.GREEN
        + f"Success! Preprocessed data path: `dataset_prepared_path: {cfg.dataset_prepared_path}`"
        + Fore.RESET
    )


def do_cli(
    config: Union[Path, str] = Path("examples/"),
    **kwargs,
) -> None:
    """
    Parses `axolotl` config, CLI args, and calls `do_preprocess`.

    Args:
        config: Path to `axolotl` config YAML file.
        kwargs: Additional keyword arguments to override config file values.
    """

    os.environ["AXOLOTL_IS_PREPROCESS"] = "1"
    is_preprocess = kwargs.pop("is_preprocess", True)
    parsed_cfg = load_cfg(config, is_preprocess=is_preprocess, **kwargs)
    parsed_cfg.is_preprocess = True
    parser = transformers.HfArgumentParser(PreprocessCliArgs)
    parsed_cli_args, _ = parser.parse_args_into_dataclasses(
        return_remaining_strings=True
    )

    do_preprocess(parsed_cfg, parsed_cli_args)


if __name__ == "__main__":
    fire.Fire(do_cli)


================================================
FILE: src/axolotl/cli/quantize.py
================================================
"""
CLI to post-training quantize a model using torchao
"""

from pathlib import Path
from typing import Union

from transformers import AutoConfig, AutoModelForCausalLM, TorchAoConfig

from axolotl.cli.config import load_cfg
from axolotl.loaders import load_processor, load_tokenizer
from axolotl.utils.logging import get_logger
from axolotl.utils.quantization import (
    TorchAOQuantDType,
    get_quantization_config,
    quantization_config_to_str,
    quantize_model,
)

LOG = get_logger(__name__)


def do_quantize(
    config: Union[Path, str],
    cli_args: dict,
):
    """
    Quantizes a model's model's weights

    Args:
        config (Union[Path, str]): The path to the config file
        cli_args (dict): Additional command-line arguments
    """

    cfg = load_cfg(config)

    if cfg.qat and cfg.quantization:
        raise ValueError(
            "QAT and quantization cannot be used together. Please specify only one of qat or quantization in your config file."
        )

    if cfg.qat:
        quantize_cfg = cfg.qat
    elif cfg.quantization:
        quantize_cfg = cfg.quantization
    else:
        raise ValueError(
            "No quantization configuration found. Please specify either qat or quantization in your config file."
        )

    model_path = cli_args.get("base_model") or cfg.output_dir
    if weight_dtype := cli_args.get("weight_dtype"):
        weight_dtype = TorchAOQuantDType.from_string(weight_dtype)
    else:
        weight_dtype = quantize_cfg.weight_dtype
    if activation_dtype := cli_args.get("activation_dtype"):
        activation_dtype = TorchAOQuantDType.from_string(activation_dtype)
    else:
        activation_dtype = quantize_cfg.activation_dtype
    group_size = cli_args.get("group_size") or quantize_cfg.group_size
    quantize_embedding = (
        cli_args.get("quantize_embedding") or quantize_cfg.quantize_embedding
    )
    output_dir = cli_args.get("output_dir") or cfg.output_dir
    hub_model_id = cli_args.get("hub_model_id") or cfg.hub_model_id

    LOG.info(f"Loading model from {model_path}.")
    tokenizer = load_tokenizer(cfg)

    processor = None
    if cfg.is_multimodal:
        processor = load_processor(cfg, tokenizer)

    config = AutoConfig.from_pretrained(model_path)
    torch_dtype = config.torch_dtype if hasattr(config, "torch_dtype") else None
    model = AutoModelForCausalLM.from_pretrained(
        model_path, device_map="auto", dtype=torch_dtype
    )

    LOG.info(
        f"Quantizing model with configuration: \n"
        f"\tweight_dtype: {weight_dtype}\n"
        f"\tactivation_dtype: {activation_dtype}\n"
        f"\tgroup_size: {group_size}\n"
        f"\tquantize_embedding: {quantize_embedding}"
    )

    quantize_model(
        model, weight_dtype, group_size, activation_dtype, quantize_embedding
    )

    quantization_config = get_quantization_config(
        weight_dtype, activation_dtype, group_size
    )

    ao_config = TorchAoConfig(
        quant_type=quantization_config,
        include_input_output_embeddings=quantize_embedding,
    )
    model.config.quantization_config = ao_config

    LOG.info(f"Saving quantized model to: {str(Path(output_dir) / 'quantized')}.")
    model.save_pretrained(
        str(Path(output_dir) / "quantized"),
        progressbar=True,
    )
    tokenizer.save_pretrained(
        str(Path(output_dir) / "quantized"),
        progressbar=True,
        save_jinja_files=cfg.tokenizer_save_jinja_files,
    )

    if processor:
        LOG.info(f"Saving processor to: {str(Path(output_dir) / 'quantized')}.")
        processor.save_pretrained(str(Path(output_dir) / "quantized"))

    if hub_model_id:
        hub_model_id = (
            hub_model_id.rstrip("-")
            + f"-{quantization_config_to_str[type(quantization_config)]}"
        )
        model.push_to_hub(hub_model_id)
        tokenizer.push_to_hub(hub_model_id)
        if processor:
            processor.push_to_hub(hub_model_id)
        LOG.info(f"Quantized model pushed to: {hub_model_id}.")

    LOG.info(f"Quantized model saved to: {str(Path(output_dir) / 'quantized')}.")


================================================
FILE: src/axolotl/cli/train.py
================================================
"""CLI to run training on a model."""

import gc
import os
from pathlib import Path
from typing import Union

import fire
from accelerate import Accelerator
from transformers.hf_argparser import HfArgumentParser

from axolotl.cli.args import TrainerCliArgs
from axolotl.cli.checks import check_accelerate_default_config, check_user_token
from axolotl.cli.config import load_cfg
from axolotl.common.datasets import load_datasets, load_preference_datasets
from axolotl.integrations.base import PluginManager
from axolotl.train import train
from axolotl.utils.config import normalize_config, resolve_dtype
from axolotl.utils.dict import DictDefault
from axolotl.utils.trainer import prepare_optim_env


def do_train(cfg: DictDefault, cli_args: TrainerCliArgs):
    """
    Trains a `transformers` model by first loading the dataset(s) specified in the
    `axolotl` config, and then calling `axolotl.train.train`. Also runs the plugin
    manager's `post_train_unload` once training completes.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: Training-specific CLI arguments.
    """
    check_accelerate_default_config()
    if int(os.getenv("LOCAL_RANK", "0")) == 0:
        check_user_token()

    plugin_manager = PluginManager.get_instance()
    dataset_meta = plugin_manager.load_datasets(cfg, preprocess=False)
    if not dataset_meta:
        if cfg.rl:
            dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)
        else:
            dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

    model, tokenizer, trainer = train(cfg=cfg, dataset_meta=dataset_meta)

    del model, tokenizer, trainer

    gc.collect()

    plugin_manager = PluginManager.get_instance()
    plugin_manager.post_train_unload(cfg)


def do_cli(config: Union[Path, str] = Path("examples/"), **kwargs):
    """
    Parses `axolotl` config, CLI args, and calls `do_train`.

    Args:
        config: Path to `axolotl` config YAML file.
        kwargs: Additional keyword arguments to override config file values.
    """
    parsed_cfg = load_cfg(config, **kwargs)
    parser = HfArgumentParser(TrainerCliArgs)
    parsed_cli_args, _ = parser.parse_args_into_dataclasses(
        return_remaining_strings=True
    )

    if parsed_cfg.use_ray:
        from ray.train import RunConfig, ScalingConfig
        from ray.train.torch import TorchTrainer

        train_loop_config = {"cfg": parsed_cfg.to_dict(), "cli_args": parsed_cli_args}
        trainer = TorchTrainer(
            ray_train_func,
            train_loop_config=train_loop_config,
            scaling_config=ScalingConfig(
                num_workers=parsed_cfg.ray_num_workers,
                resources_per_worker=parsed_cfg.resources_per_worker.to_dict(),
                use_gpu=True,
            ),
            run_config=RunConfig(
                name=parsed_cfg.ray_run_name,
                storage_path=Path(parsed_cfg.output_dir).absolute().as_posix(),
            ),
        )
        return trainer.fit()
    return do_train(parsed_cfg, parsed_cli_args)


def ray_train_func(kwargs: dict):
    # cast `cfg` back to DictDefault (ray tune deepcopy has issues with DictDefault so needed it to be dict)
    # also renormalize the config now that TorchTrainer has spawned distributed workers
    cfg = DictDefault(kwargs["cfg"])
    prepare_optim_env(cfg)
    normalize_config(cfg)

    # now that we are on the worker node, we can check `is_torch_bf16_gpu_available` to resolve dtype
    resolve_dtype(cfg)

    # ray serializing objects gets rid of frozen attribute - HF expects dict not DefaultDict
    if cfg.deepspeed and hasattr(cfg.deepspeed, "to_dict"):
        cfg.deepspeed = cfg.deepspeed.to_dict()

    # initialize accelerator before model instantiation
    Accelerator(gradient_accumulation_steps=cfg.gradient_accumulation_steps)

    # Register plugins in Ray workers
    if cfg.get("plugins"):
        from axolotl.cli.config import plugin_set_cfg, prepare_plugins

        prepare_plugins(cfg)
        plugin_set_cfg(cfg)

    kwargs["cfg"] = cfg

    do_train(**kwargs)


if __name__ == "__main__":
    fire.Fire(do_cli)


================================================
FILE: src/axolotl/cli/utils/__init__.py
================================================
"""Init for axolotl.cli.utils module."""

from .args import (
    add_options_from_config,
    add_options_from_dataclass,
    filter_none_kwargs,
)
from .fetch import fetch_from_github
from .load import load_model_and_tokenizer
from .sweeps import generate_sweep_configs
from .train import build_command, generate_config_files, launch_training

__all__ = [
    "filter_none_kwargs",
    "add_options_from_dataclass",
    "add_options_from_config",
    "build_command",
    "generate_config_files",
    "generate_sweep_configs",
    "load_model_and_tokenizer",
    "launch_training",
    "fetch_from_github",
]


================================================
FILE: src/axolotl/cli/utils/args.py
================================================
"""Utilities for axolotl CLI args."""

import dataclasses
from functools import wraps
from types import NoneType, UnionType
from typing import Any, Callable, Type, Union, get_args, get_origin

import click
from pydantic import BaseModel


def _strip_optional_type(field_type: type | str | None):
    """
    Extracts the non-`None` type from an `Optional` / `Union` type.

    Args:
        field_type: Type of field for Axolotl CLI command.

    Returns:
        If the input type is `Union[T, None]` or `Optional[T]`, returns `T`. Otherwise
            returns the input type unchanged.
    """
    is_union = get_origin(field_type) is Union or isinstance(field_type, UnionType)
    if is_union and type(None) in get_args(field_type):
        field_type = next(
            t for t in get_args(field_type) if not isinstance(t, NoneType)
        )

    return field_type


def filter_none_kwargs(func: Callable) -> Callable:
    """
    Wraps function to remove `None`-valued `kwargs`.

    Args:
        func: Function to wrap.

    Returns:
        Wrapped function.
    """

    @wraps(func)
    def wrapper(*args, **kwargs) -> Callable:
        """Filters out `None`-valued `kwargs`."""
        filtered_kwargs = {k: v for k, v in kwargs.items() if v is not None}

        return func(*args, **filtered_kwargs)

    return wrapper


def add_options_from_dataclass(config_class: Type[Any]) -> Callable:
    """
    Create Click options from the fields of a dataclass.

    Args:
        config_class: Dataclass with fields to parse from the CLI.

    Returns:
        Function decorator for Axolotl CLI command.
    """

    def decorator(function: Callable) -> Callable:
        # Process dataclass fields in reverse order for correct option ordering
        for field in reversed(dataclasses.fields(config_class)):
            field_type = _strip_optional_type(field.type)

            if field_type is bool:
                field_name = field.name.replace("_", "-")
                option_name = f"--{field_name}/--no-{field_name}"
                function = click.option(
                    option_name,
                    default=field.default,
                    help=field.metadata.get("description"),
                )(function)
            else:
                option_name = f"--{field.name.replace('_', '-')}"
                function = click.option(
                    option_name,
                    type=field_type,
                    default=field.default,
                    help=field.metadata.get("description"),
                )(function)

        return function

    return decorator


def _is_pydantic_model(field_type: type) -> bool:
    """Check if a type is a Pydantic BaseModel subclass."""
    try:
        return isinstance(field_type, type) and issubclass(field_type, BaseModel)
    except TypeError:
        return False


def _get_field_description(field) -> str | None:
    """Get description from a Pydantic field, checking both .description and json_schema_extra."""
    if field.description:
        return field.description
    if field.json_schema_extra and isinstance(field.json_schema_extra, dict):
        return field.json_schema_extra.get("description")
    return None


def _add_nested_model_options(
    function: Callable, parent_name: str, model_class: Type[BaseModel]
) -> Callable:
    """
    Add Click options for all fields of a nested Pydantic model using dot-notation.

    Note: Only single-level nesting is supported (e.g., ``--trl.beta``).
    Deeper nesting (e.g., ``--trl.scheduler.warmup``) is not handled.

    Args:
        function: Click command function to add options to.
        parent_name: Parent field name (e.g., "trl").
        model_class: Nested Pydantic model class.

    Returns:
        Function with added Click options.
    """
    for sub_name, sub_field in reversed(model_class.model_fields.items()):
        sub_type = _strip_optional_type(sub_field.annotation)
        # Use dot notation: --parent.sub_field
        cli_name = f"{parent_name}.{sub_name}".replace("_", "-")
        # The kwarg name uses double-underscore as separator
        param_name = f"{parent_name}__{sub_name}"
        description = _get_field_description(sub_field)

        if sub_type is bool:
            option_name = f"--{cli_name}/--no-{cli_name}"
            function = click.option(
                option_name, param_name, default=None, help=description
            )(function)
        else:
            option_name = f"--{cli_name}"
            click_type = {str: str, int: int, float: float}.get(sub_type)
            function = click.option(
                option_name, param_name, default=None, type=click_type, help=description
            )(function)

    return function


def add_options_from_config(config_class: Type[BaseModel]) -> Callable:
    """
    Create Click options from the fields of a Pydantic model.

    For fields whose type is itself a Pydantic BaseModel, dot-notation CLI options are
    generated for each sub-field (e.g., ``--trl.beta=0.1``).

    Args:
        config_class: PyDantic model with fields to parse from the CLI

    Returns:
        Function decorator for Axolotl CLI command.
    """

    def decorator(function: Callable) -> Callable:
        # Process model fields in reverse order for correct option ordering
        for name, field in reversed(config_class.model_fields.items()):
            field_type = _strip_optional_type(field.annotation)

            # Handle nested Pydantic models with dot-notation options
            if _is_pydantic_model(field_type):
                function = _add_nested_model_options(function, name, field_type)
                continue

            if field_type is bool:
                field_name = name.replace("_", "-")
                option_name = f"--{field_name}/--no-{field_name}"
                function = click.option(
                    option_name, default=None, help=field.description
                )(function)
            else:
                option_name = f"--{name.replace('_', '-')}"
                function = click.option(
                    option_name, default=None, help=field.description
                )(function)

        return function

    return decorator


================================================
FILE: src/axolotl/cli/utils/diffusion.py
================================================
"""Helpers for diffusion-mode inference in CLI and Gradio."""

from __future__ import annotations

import gradio as gr
from colorama import Fore, Style

from axolotl.integrations.diffusion import generate, resolve_mask_token_id
from axolotl.utils.dict import DictDefault


def diffusion_inference(
    model,
    tokenizer,
    cfg,
    prompt: str,
    chat_template_str: str | None = None,
):
    """Diffusion inference helper method."""
    mode = "random"
    completion_tokens = 0
    target_mask_ratio = None
    mode, completion_tokens, target_mask_ratio, cleaned = _parse_commands(prompt)

    if cleaned:
        prompt = cleaned

    info = run_diffusion(
        model=model,
        tokenizer=tokenizer,
        cfg=cfg,
        prompt=prompt,
        chat_template_str=chat_template_str,
        mode=mode,
        target_mask_ratio=target_mask_ratio,
        completion_tokens=completion_tokens,
    )
    masked_text = info["masked_text"]
    mask_ratio = info["mask_ratio"]
    generated_ids = info["generated_ids"]
    masked_positions = info["masked_positions"]
    orig_ids = info["orig_ids"]

    # Display with masked preview and colored diff
    if masked_text is not None and mask_ratio is not None:
        print(f"Masked ({mask_ratio:.1%}):\n{masked_text}\n")
    if generated_ids is not None:
        # Compute per-token style
        styles: list[str] = []
        for i, tid in enumerate(generated_ids):
            if i in masked_positions:
                if i < len(orig_ids) and tid == orig_ids[i]:
                    styles.append("green")  # correct fill
                elif i < len(orig_ids):
                    styles.append("red")  # incorrect fill
                else:
                    styles.append("normal")  # appended
            else:
                same = i < len(orig_ids) and tid == orig_ids[i]
                styles.append("dim" if same else "normal")

        # Group contiguous spans by style
        styled_spans: list[tuple[str, int, int]] = []
        if generated_ids:
            current_style = styles[0]
            start = 0
            for i in range(1, len(generated_ids)):
                s = styles[i]
                if s != current_style:
                    styled_spans.append((current_style, start, i))
                    current_style, start = s, i
            styled_spans.append((current_style, start, len(generated_ids)))

        out_parts = []
        for style_name, a, b in styled_spans:
            chunk_text = tokenizer.decode(generated_ids[a:b], skip_special_tokens=False)
            if style_name == "green":
                out_parts.append(Fore.GREEN + chunk_text + Style.RESET_ALL)
            elif style_name == "red":
                out_parts.append(Fore.RED + chunk_text + Style.RESET_ALL)
            else:
                if style_name == "dim":
                    out_parts.append(Style.DIM + chunk_text + Style.RESET_ALL)
                else:
                    out_parts.append(chunk_text)
        print("Generated:\n" + "".join(out_parts))
    else:
        print("Generated:\n(no output)")


def _parse_commands(text: str):
    """
    Parse leading diffusion commands.

    Supported at start of input (can be chained):
      :complete N  -> completion mode with N tokens (default 64)
      :mask R      -> random masking with ratio R in [0, 1]
    """
    tokens = text.strip().split()
    i = 0
    mode = "random"
    completion_tokens = 0
    target_mask_ratio = None
    consumed = 0
    while i < len(tokens) and tokens[i].startswith(":"):
        cmd = tokens[i]
        i += 1
        consumed = i
        if cmd == ":complete":
            mode = "completion"
            if i < len(tokens):
                try:
                    completion_tokens = int(tokens[i])
                    i += 1
                    consumed = i
                except Exception:
                    completion_tokens = 64
            else:
                completion_tokens = 64
        elif cmd == ":mask":
            mode = "random"
            if i < len(tokens):
                try:
                    target_mask_ratio = float(tokens[i])
                    i += 1
                    consumed = i
                except Exception:
                    target_mask_ratio = None
        else:
            i -= 1
            consumed = i
            break

    cleaned = " ".join(tokens[consumed:])

    return mode, completion_tokens, target_mask_ratio, cleaned


def run_diffusion(
    *,
    model,
    tokenizer,
    cfg: DictDefault,
    prompt: str,
    chat_template_str: str | None,
    mode: str = "random",
    target_mask_ratio: float | None = None,
    completion_tokens: int = 0,
):
    """Run a single diffusion generation and return a structured result dict."""
    if chat_template_str:
        batch = tokenizer.apply_chat_template(
            [{"role": "user", "content": prompt}],
            return_tensors="pt",
            add_special_tokens=True,
            add_generation_prompt=True,
            chat_template=chat_template_str,
            tokenize=True,
            return_dict=True,
        )
    else:
        batch = tokenizer(prompt, return_tensors="pt", add_special_tokens=True)

    mask_token_id = resolve_mask_token_id(tokenizer, cfg, allow_add=False)

    seq = batch["input_ids"].to(cfg.device)
    gen_mode = "completion" if mode == "completion" else "random"
    comp_tokens = int(completion_tokens) if gen_mode == "completion" else 0

    result = generate(
        model,
        tokenizer,
        original_sequence=seq[:1],
        num_diffusion_steps=cfg.diffusion.num_diffusion_steps,
        temperature=cfg.diffusion.generation_temperature,
        mask_token_id=int(mask_token_id),
        mode=gen_mode,  # type: ignore[arg-type]
        completion_tokens=comp_tokens,
        target_mask_ratio=target_mask_ratio,
    )

    masked_text = result.get("masked") if isinstance(result, dict) else None
    mask_ratio = result.get("mask_ratio") if isinstance(result, dict) else None
    generated_ids = result.get("generated_ids") if isinstance(result, dict) else None
    masked_positions = (
        set(result.get("masked_positions") or []) if isinstance(result, dict) else set()
    )
    orig_ids = seq[0].detach().cpu().tolist()

    return {
        "masked_text": masked_text,
        "mask_ratio": mask_ratio,
        "generated_ids": generated_ids,
        "masked_positions": masked_positions,
        "orig_ids": orig_ids,
    }


def render_html(
    *,
    generated_ids: list[int] | None,
    orig_ids: list[int],
    masked_positions: set[int],
    tokenizer,
) -> str:
    """Render HTML visualizing diffusion outputs."""
    if not generated_ids:
        return "<pre>Generated:\n(no output)</pre>"

    def _style_for(i: int, tid: int) -> str:
        if i in masked_positions:
            if i < len(orig_ids) and tid == orig_ids[i]:
                return "green"
            if i < len(orig_ids):
                return "red"
            return "normal"
        same = i < len(orig_ids) and tid == orig_ids[i]
        return "dim" if same else "normal"

    # Group contiguous spans by style to reduce HTML size
    spans: list[tuple[str, int, int]] = []
    if generated_ids:
        cur = _style_for(0, generated_ids[0])
        start = 0
        for i in range(1, len(generated_ids)):
            s = _style_for(i, generated_ids[i])
            if s != cur:
                spans.append((cur, start, i))
                cur, start = s, i
        spans.append((cur, start, len(generated_ids)))

    html_parts = []
    for style_name, a, b in spans:
        txt = tokenizer.decode(generated_ids[a:b], skip_special_tokens=False)
        if style_name == "green":
            html_parts.append(f'<span style="color:#2e7d32">{txt}</span>')
        elif style_name == "red":
            html_parts.append(f'<span style="color:#c62828">{txt}</span>')
        elif style_name == "dim":
            html_parts.append(f'<span style="opacity:0.6">{txt}</span>')
        else:
            html_parts.append(txt)

    legend = (
        '<div style="font-size:0.9em;margin-bottom:4px">'
        '<span style="color:#2e7d32">correct</span>, '
        '<span style="color:#c62828">incorrect</span>, '
        '<span style="opacity:0.6">unchanged</span>'
        "</div>"
    )

    return (
        legend
        + '<pre style="white-space:pre-wrap">Generated:\n'
        + "".join(html_parts)
        + "</pre>"
    )


def launch_diffusion_gradio_ui(
    *,
    model,
    tokenizer,
    cfg: DictDefault,
    prompter_module=None,
    chat_template_str: str | None = None,
):
    """Build and launch a simple Gradio UI for diffusion inference."""
    with gr.Blocks(
        title=cfg.get("gradio_title", "Axolotl Diffusion Interface")
    ) as demo:
        gr.Markdown(
            """
            ## Axolotl Diffusion Inference
            - Mode "Random" masks tokens at a target ratio and fills them.
            - Mode "Completion" appends N masked tokens at the end and fills them.
            """
        )

        with gr.Row():
            mode = gr.Radio(
                choices=["random", "completion"],
                value="random",
                label="Mode",
            )
            mask_ratio = gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.05,
                value=0.4,
                label="Mask ratio (random mode)",
                interactive=True,
            )
            completion_tokens = gr.Number(
                value=64,
                precision=0,
                label="Completion tokens (completion mode)",
                interactive=True,
                visible=False,
            )

        instruction = gr.Textbox(label="Instruction", lines=6)
        run_btn = gr.Button("Generate")

        masked_preview = gr.Textbox(label="Masked preview", lines=6)
        html_out = gr.HTML(label="Generated")

        def _toggle_controls(selected_mode: str):
            return (
                gr.update(visible=(selected_mode == "random")),
                gr.update(visible=(selected_mode == "completion")),
            )

        mode.change(
            _toggle_controls,
            inputs=[mode],
            outputs=[mask_ratio, completion_tokens],
        )

        def _gen(instruction_text: str, selected_mode: str, mratio: float, ctoks: int):
            if not instruction_text:
                return "", "<pre>Generated:\n(no output)</pre>"

            if prompter_module:
                prompt: str = next(
                    prompter_module().build_prompt(
                        instruction=instruction_text.strip("\n")
                    )
                )
            else:
                prompt = instruction_text.strip()

            info = run_diffusion(
                model=model,
                tokenizer=tokenizer,
                cfg=cfg,
                prompt=prompt,
                chat_template_str=chat_template_str,
                mode=selected_mode,
                target_mask_ratio=mratio if selected_mode == "random" else None,
                completion_tokens=int(ctoks) if selected_mode == "completion" else 0,
            )

            masked_text = info.get("masked_text")
            mask_ratio_val = info.get("mask_ratio")
            generated_ids = info.get("generated_ids")
            masked_positions = info.get("masked_positions") or set()
            orig_ids = info.get("orig_ids") or []

            preview = (
                f"Masked ({mask_ratio_val:.1%}):\n{masked_text}"
                if masked_text is not None and mask_ratio_val is not None
                else ""
            )
            html = render_html(
                generated_ids=generated_ids,
                orig_ids=orig_ids,
                masked_positions=masked_positions,
                tokenizer=tokenizer,
            )
            return preview, html

        run_btn.click(
            _gen,
            inputs=[instruction, mode, mask_ratio, completion_tokens],
            outputs=[masked_preview, html_out],
        )

        demo.launch(
            footer_links=["gradio", "settings"],
            share=cfg.get("gradio_share", True),
            server_name=cfg.get("gradio_server_name", "127.0.0.1"),
            server_port=cfg.get("gradio_server_port", None),
        )


================================================
FILE: src/axolotl/cli/utils/fetch.py
================================================
"""Utilities for axolotl fetch CLI command."""

import concurrent.futures
import hashlib
import json
from pathlib import Path

import click
import requests

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def _download_file(
    file_info: tuple, raw_base_url: str, dest_path: Path, dir_prefix: str
) -> tuple[str, str]:
    """
    Download a single file and return its processing status.

    Args:
        file_info: Tuple of (file_path, remote_sha).
        raw_base_url: Base URL for raw GitHub content.
        dest_path: Local destination directory.
        dir_prefix: Directory prefix to filter files.

    Returns:
        Tuple of (file_path, status) where status is 'new', 'updated', or 'unchanged'.
    """
    file_path, remote_sha = file_info
    raw_url = f"{raw_base_url}/{file_path}"
    dest_file = dest_path / file_path.split(dir_prefix)[-1]

    # Check if file exists and needs updating
    if dest_file.exists():
        with open(dest_file, "rb") as file:
            content = file.read()
            # Calculate git blob SHA
            blob = b"blob " + str(len(content)).encode() + b"\0" + content
            local_sha = hashlib.sha1(blob, usedforsecurity=False).hexdigest()

        if local_sha == remote_sha:
            print(f"Skipping {file_path} (unchanged)")
            return file_path, "unchanged"

        print(f"Updating {file_path}")
        status = "updated"
    else:
        print(f"Downloading {file_path}")
        status = "new"

    # Create directories if needed
    dest_file.parent.mkdir(parents=True, exist_ok=True)

    # Download and save file
    try:
        response = requests.get(raw_url, timeout=30)
        response.raise_for_status()

        with open(dest_file, "wb") as file:
            file.write(response.content)

        return file_path, status
    except (requests.RequestException, IOError) as request_error:
        print(f"Error downloading {file_path}: {str(request_error)}")
        return file_path, "error"


def fetch_from_github(
    dir_prefix: str, dest_dir: str | None = None, max_workers: int = 5
) -> None:
    """
    Sync files from a specific directory in the GitHub repository.
    Only downloads files that don't exist locally or have changed.

    Args:
        dir_prefix: Directory prefix to filter files (e.g., 'examples/',
            'deepspeed_configs/').
        dest_dir: Local destination directory.
        max_workers: Maximum number of concurrent downloads.
    """
    api_url = "https://api.github.com/repos/axolotl-ai-cloud/axolotl/git/trees/main?recursive=1"
    raw_base_url = "https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main"

    # Get repository tree with timeout
    response = requests.get(api_url, timeout=30)
    response.raise_for_status()
    tree = json.loads(response.text)

    # Filter for files and get their SHA
    files = {
        item["path"]: item["sha"]
        for item in tree["tree"]
        if item["type"] == "blob" and item["path"].startswith(dir_prefix)
    }

    if not files:
        raise click.ClickException(f"No files found in {dir_prefix}")

    # Default destination directory is the last part of dir_prefix
    default_dest = Path(dir_prefix.rstrip("/"))
    dest_path = Path(dest_dir) if dest_dir else default_dest

    # Keep track of processed files for summary
    files_processed: dict[str, list[str]] = {
        "new": [],
        "updated": [],
        "unchanged": [],
        "error": [],
    }

    # Process files in parallel using ThreadPoolExecutor
    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_file = {
            executor.submit(
                _download_file,
                (file_path, remote_sha),
                raw_base_url,
                dest_path,
                dir_prefix,
            ): file_path
            for file_path, remote_sha in files.items()
        }

        # Process completed tasks as they finish
        for future in concurrent.futures.as_completed(future_to_file):
            file_path = future_to_file[future]
            try:
                file_path, status = future.result()
                files_processed[status].append(file_path)
            except (requests.RequestException, IOError) as request_error:
                print(f"Error processing {file_path}: {str(request_error)}")
                files_processed["error"].append(file_path)

    # Log summary
    LOG.info("\nSync Summary:")
    LOG.info(f"New files: {len(files_processed['new'])}")
    LOG.info(f"Updated files: {len(files_processed['updated'])}")
    LOG.info(f"Unchanged files: {len(files_processed['unchanged'])}")
    if files_processed["error"]:
        LOG.info(f"Failed files: {len(files_processed['error'])}")


================================================
FILE: src/axolotl/cli/utils/load.py
================================================
"""Utilities for model, tokenizer, etc. loading."""

from typing import Any

from transformers import (
    PreTrainedModel,
    PreTrainedTokenizer,
    PreTrainedTokenizerFast,
    ProcessorMixin,
)

from axolotl.loaders import load_processor, load_tokenizer
from axolotl.loaders.model import ModelLoader
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def load_model_and_tokenizer(
    *,
    cfg: DictDefault,
    inference: bool = False,
) -> tuple[
    PreTrainedModel,
    PreTrainedTokenizer | PreTrainedTokenizerFast | Any,
    ProcessorMixin | None,
]:
    """
    Helper function for loading a model, tokenizer, and processor specified in the
    given `axolotl` config.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        inference: Boolean denoting inference mode.

    Returns:
        Tuple of (PreTrainedModel, PreTrainedTokenizer, ProcessorMixin).
    """
    LOG.info(f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}")
    tokenizer = load_tokenizer(cfg)

    LOG.info("loading model...")
    model_loader = ModelLoader(cfg, tokenizer, inference=inference)
    model, _ = model_loader.load()

    processor = None
    if cfg.is_multimodal:
        LOG.info("loading processor...")
        processor = load_processor(cfg, tokenizer)

    return model, tokenizer, processor


================================================
FILE: src/axolotl/cli/utils/sweeps.py
================================================
"""Utilities for handling sweeps over configs for axolotl train CLI command"""

import random
from copy import deepcopy
from itertools import product
from typing import Any


def generate_sweep_configs(
    base_config: dict[str, list], sweeps_config: dict[str, list]
) -> list[dict[str, Any]]:
    """
    Recursively generates all possible configurations by applying sweeps to the base config.

    Args:
        base_config (dict): The original configuration dictionary
        sweeps_config (dict): Dictionary where keys are parameters and values are either:
            - lists of values to sweep independently
            - or for paired values, a list of dicts under the '_' key

    Returns:
        list: List of all possible configuration dictionaries

    Example:
        sweeps_config = {
            'learning_rate': [0.1, 0.01],
            '_': [
                {'load_in_8bit': True, 'adapter': 'lora'},
                {'load_in_4bit': True, 'adapter': 'qlora'}
            ]
        }
    """
    # Separate paired values from regular sweeps
    paired_values = sweeps_config.get("_", [])
    regular_sweeps = {k: v for k, v in sweeps_config.items() if k != "_"}

    # Process regular sweeps
    param_names = list(regular_sweeps.keys())
    param_values = list(regular_sweeps.values())

    # Generate combinations for regular sweeps
    regular_combinations = list(product(*param_values)) if param_values else [()]

    # Combine regular sweeps with paired values
    all_combinations = []
    for reg_combo in regular_combinations:
        if paired_values:
            for paired_set in paired_values:
                new_config = {}
                # new_config = deepcopy(base_config)
                # Combine regular parameters with paired parameters
                full_combo = {
                    **dict(zip(param_names, reg_combo, strict=False)),
                    **paired_set,
                }
                for param_name, param_value in full_combo.items():
                    new_config[param_name] = param_value
                print(new_config)
                all_combinations.append(new_config)
        else:
            # If no paired values, just use regular combinations
            # new_config = deepcopy(base_config)
            new_config = {}
            for param_name, param_value in zip(param_names, reg_combo, strict=False):
                new_config[param_name] = param_value
            print(new_config)
            all_combinations.append(new_config)

    # randomize the order of trials
    random.seed(42)
    random.shuffle(all_combinations)

    # Generate a new config for each combination
    result_configs = []
    for combination in all_combinations:
        new_config = deepcopy(base_config)
        for param_name, param_value in combination.items():
            new_config[param_name] = param_value
        result_configs.append(new_config)

    return result_configs


================================================
FILE: src/axolotl/cli/utils/train.py
================================================
"""Utilities for axolotl train CLI command."""

import os
import subprocess  # nosec
import sys
import tempfile
from pathlib import Path
from typing import Any, Iterator, Literal

import yaml

from axolotl.cli.utils.sweeps import generate_sweep_configs


def _add_default_rdzv_args(launcher_args: list[str]) -> list[str]:
    """
    Add default RDZV arguments if rdzv_endpoint is set but rdzv_backend/rdzv_id are missing.

    Args:
        launcher_args: List of launcher arguments

    Returns:
        Updated launcher args with defaults added if needed
    """
    args = launcher_args.copy()

    # Check if rdzv_endpoint is present
    has_rdzv_endpoint = any("--rdzv_endpoint" in arg for arg in args)

    if has_rdzv_endpoint:
        # Check if rdzv_backend is already provided
        has_rdzv_backend = any("--rdzv_backend" in arg for arg in args)
        if not has_rdzv_backend:
            args.extend(["--rdzv_backend", "c10d"])

        # Check if rdzv_id is already provided
        has_rdzv_id = any("--rdzv_id" in arg for arg in args)
        if not has_rdzv_id:
            import uuid

            args.extend(["--rdzv_id", str(uuid.uuid4())[:8]])

    return args


def build_command(base_cmd: list[str], options: dict[str, Any]) -> list[str]:
    """
    Build command list from base command and options.

    Args:
        base_cmd: Command without options.
        options: Options to parse and append to base command.

    Returns:
        List of strings giving shell command.
    """
    cmd = base_cmd.copy()

    for key, value in options.items():
        if value is None:
            continue

        key = key.replace("_", "-")
        cmd.append(f"--{key}={value}")

    return cmd


def generate_config_files(config: str, sweep: str | None) -> Iterator[tuple[str, bool]]:
    """
    Generate list of configuration files to process. Yields a tuple of the configuration file name and a boolean indicating
    whether this is a group of configurations (i.e., a sweep).

    Args:
        config: Base configuration file
        sweep: Sweep configuration file
    """

    if not sweep:
        yield config, False
        return

    # Load sweep and base configurations
    with open(sweep, "r", encoding="utf-8") as fin:
        sweep_config: dict[str, list] = yaml.safe_load(fin)
    with open(config, "r", encoding="utf-8") as fin:
        base_config: dict[str, list] = yaml.safe_load(fin)

    # Generate all possible configurations
    permutations = generate_sweep_configs(base_config, sweep_config)
    is_group = len(permutations) > 1
    base_output_dir = base_config.get("output_dir", "./model-out")
    for idx, permutation in enumerate(permutations, start=1):
        permutation_dir = Path(permutation.get("output_dir", base_output_dir))
        permutation_id = f"sweep{idx:04d}"
        permutation["output_dir"] = str(permutation_dir / permutation_id)

        temp_file = tempfile.NamedTemporaryFile(
            mode="w",
            suffix=".yaml",
            delete=False,
            encoding="utf-8",
        )
        yaml.dump(permutation, temp_file)
        temp_file.close()
        yield temp_file.name, is_group


def launch_training(
    cfg_file: str,
    launcher: Literal["accelerate", "torchrun", "python"] | None,
    cloud: str | None,
    kwargs: dict,
    launcher_args: list[str] | None = None,
    use_exec: bool = False,
) -> None:
    """Execute training with the given configuration."""
    launcher_args = launcher_args or []

    if cloud:
        _launch_cloud_training(cloud, cfg_file, launcher, kwargs, launcher_args)
    elif launcher:
        if launcher == "accelerate":
            _launch_accelerate_training(cfg_file, kwargs, launcher_args, use_exec)
        elif launcher == "torchrun":
            _launch_torchrun_training(cfg_file, kwargs, launcher_args, use_exec)
        elif launcher == "python":
            _launch_python_training(cfg_file, kwargs)
    elif launcher is None:
        # handle ray train launch
        _launch_python_training(cfg_file, kwargs)


def _launch_cloud_training(
    cloud: str,
    cfg_file: str,
    launcher: Literal["accelerate", "torchrun", "python"] | None,
    kwargs: dict,
    launcher_args: list[str] | None = None,
) -> None:
    """Execute training via cloud launcher."""
    from axolotl.cli.cloud import do_cli_train

    launcher_args = launcher_args or []
    cwd = os.getcwd() if launcher else None

    do_cli_train(
        cloud_config=cloud,
        config=cfg_file,
        launcher=launcher or "accelerate",
        launcher_args=launcher_args,
        cwd=cwd,
        **kwargs,
    )


def _launch_accelerate_training(
    cfg_file: str,
    kwargs: dict,
    launcher_args: list[str] | None = None,
    use_exec: bool = False,
) -> None:
    """Execute training via accelerate launcher."""
    launcher_args = launcher_args or []
    internal_launcher_args = []

    # Extract launcher-specific arguments from kwargs (legacy support)
    if "main_process_port" in kwargs:
        main_process_port = kwargs.pop("main_process_port")
        internal_launcher_args.extend(["--main_process_port", str(main_process_port)])

    if "num_processes" in kwargs:
        num_processes = kwargs.pop("num_processes")
        internal_launcher_args.extend(["--num_processes", str(num_processes)])

    # Combine internal args with user-provided launcher args
    all_launcher_args = internal_launcher_args + launcher_args

    base_cmd = (
        ["accelerate", "launch"] + all_launcher_args + ["-m", "axolotl.cli.train"]
    )
    if cfg_file:
        base_cmd.append(cfg_file)

    cmd = build_command(base_cmd, kwargs)
    if use_exec:
        # make sure to flush stdout and stderr before replacing the process
        sys.stdout.flush()
        sys.stderr.flush()
        os.execvpe(cmd[0], cmd, os.environ)  # nosec B606
    else:
        subprocess.run(cmd, check=True)  # nosec B603


def _launch_torchrun_training(
    cfg_file: str,
    kwargs: dict,
    launcher_args: list[str] | None = None,
    use_exec: bool = False,
) -> None:
    """Execute training via torchrun launcher."""
    launcher_args = launcher_args or []

    # Add default RDZV arguments if rdzv_endpoint is set
    launcher_args = _add_default_rdzv_args(launcher_args)

    base_cmd = ["torchrun"] + launcher_args + ["-m", "axolotl.cli.train"]
    if cfg_file:
        base_cmd.append(cfg_file)

    cmd = build_command(base_cmd, kwargs)
    if use_exec:
        # make sure to flush stdout and stderr before replacing the process
        sys.stdout.flush()
        sys.stderr.flush()
        os.execvpe(cmd[0], cmd, os.environ)  # nosec B606
    else:
        subprocess.run(cmd, check=True)  # nosec B603


def _launch_python_training(cfg_file: str, kwargs: dict) -> None:
    """Execute training via python launcher."""
    from axolotl.cli.train import do_cli

    do_cli(config=cfg_file, **kwargs)


================================================
FILE: src/axolotl/cli/vllm_serve.py
================================================
"""
CLI to start the vllm server for online RL
"""

from dataclasses import dataclass, field
from pathlib import Path
from typing import Union

from trl.scripts.vllm_serve import ScriptArguments

from axolotl.cli.config import load_cfg


@dataclass
class AxolotlScriptArguments(ScriptArguments):
    """
    Additional arguments for the VLLM server
    """

    reasoning_parser: str = field(default="", kw_only=True)
    enable_reasoning: bool | None = field(default=None, kw_only=True)


def do_vllm_serve(
    config: Union[Path, str],
    cli_args: dict,
):
    """
    Starts the VLLM server for serving LLM models used for online RL

    Args
        :param cfg: Parsed doct of the YAML config
        :param cli_args: dict of additional command-line arguments of type VllmServeCliArgs

    Returns:
        process_id: the process id of the started VLLM server
    """
    cfg = load_cfg(config)
    model = cfg.base_model

    # Determine serve module: explicit CLI/config > auto-select from vllm_lora_sync > default
    serve_module = cli_args.get("serve_module") or getattr(
        cfg.vllm, "serve_module", None
    )
    if (
        serve_module is None
        and getattr(cfg, "trl", None)
        and getattr(cfg.trl, "vllm_lora_sync", False)
    ):
        serve_module = "axolotl.scripts.vllm_serve_lora"
    if serve_module is None:
        serve_module = "trl.scripts.vllm_serve"
    vllm_serve_main = __import__(serve_module, fromlist=["main"]).main
    tensor_parallel_size = 1
    data_parallel_size = 1

    if cli_args.get("tensor_parallel_size") or cfg.vllm.tensor_parallel_size:
        tensor_parallel_size = (
            cli_args.get("tensor_parallel_size") or cfg.vllm.tensor_parallel_size
        )
    if cli_args.get("data_parallel_size") or cfg.vllm.data_parallel_size:
        data_parallel_size = (
            cli_args.get("data_parallel_size") or cfg.vllm.data_parallel_size
        )
    host = cli_args.get("host") or cfg.vllm.host
    port = cli_args.get("port") or cfg.vllm.port
    gpu_memory_utilization = (
        cli_args.get("gpu_memory_utilization") or cfg.vllm.gpu_memory_utilization
    )
    dtype = cli_args.get("dtype") or cfg.vllm.dtype
    max_model_len = cli_args.get("max_model_len") or cfg.vllm.max_model_len
    enable_prefix_caching = (
        cli_args.get("enable_prefix_caching") or cfg.vllm.enable_prefix_caching
    )
    reasoning_parser = (
        cli_args.get("reasoning_parser") or cfg.vllm.reasoning_parser or ""
    )
    enable_reasoning = (
        cli_args.get("enable_reasoning") or cfg.vllm.enable_reasoning or False
    )

    base_kwargs = dict(
        model=model,
        tensor_parallel_size=tensor_parallel_size,
        data_parallel_size=data_parallel_size,
        host=host,
        port=port,
        gpu_memory_utilization=gpu_memory_utilization,
        dtype=dtype,
        max_model_len=max_model_len,
        enable_prefix_caching=enable_prefix_caching,
    )

    # Use LoRAScriptArguments when serving with native LoRA support
    if serve_module == "axolotl.scripts.vllm_serve_lora":
        from axolotl.scripts.vllm_serve_lora import LoRAScriptArguments

        lora_kwargs = {}
        if hasattr(cfg, "lora_r") and cfg.lora_r:
            lora_kwargs["max_lora_rank"] = cfg.lora_r
        vllm_script_args = LoRAScriptArguments(**base_kwargs, **lora_kwargs)
    else:
        vllm_script_args = AxolotlScriptArguments(
            **base_kwargs,
            reasoning_parser=reasoning_parser,
            enable_reasoning=enable_reasoning,
        )

    vllm_serve_main(vllm_script_args)


================================================
FILE: src/axolotl/common/__init__.py
================================================


================================================
FILE: src/axolotl/common/architectures.py
================================================
"""
Common architecture specific constants
"""

MOE_ARCH_BLOCK = {
    "dbrx": "DbrxFFN",
    "jamba": "JambaSparseMoeBlock",
    "jetmoe": [
        "JetMoeMoA",
        "JetMoeMoE",
    ],
    "mixtral": "MixtralSparseMoeBlock",
    "qwen2_moe": "Qwen2MoeSparseMoeBlock",
    "qwen3_moe": "Qwen3MoeSparseMoeBlock",
    "qwen3_5_moe": "Qwen3_5MoeSparseMoeBlock",
    "qwen3_vl_moe": "Qwen3VLMoeTextSparseMoeBlock",
    "deepseek_v2": "DeepseekV2MoE",
    "deepseek_v3": "DeepseekV3MoE",
    "mistral4": "Mistral4MoE",
    "gpt_oss": "GptOssDecoderLayer",
    "lfm2_moe": "Lfm2MoeSparseMoeBlock",
    "afmoe": "AfmoeMoE",
    "glm4_moe": "Glm4MoeDecoderLayer",
    "glm4_moe_lite": "Glm4MoeLiteDecoderLayer",
    "glm_moe_dsa": "GlmMoeDsaDecoderLayer",
}


================================================
FILE: src/axolotl/common/const.py
================================================
"""Various shared constants"""

DEFAULT_DATASET_PREPARED_PATH = "last_run_prepared"


================================================
FILE: src/axolotl/common/datasets.py
================================================
"""Dataset loading utilities."""

import math
import random
from dataclasses import dataclass

from datasets import Dataset

import axolotl.monkeypatch.data.batch_dataset_fetcher  # noqa: F401
from axolotl.cli.args import PreprocessCliArgs, TrainerCliArgs
from axolotl.loaders import load_processor, load_tokenizer
from axolotl.telemetry.errors import send_errors
from axolotl.utils.data import prepare_datasets, prepare_preference_datasets
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.enums import RLType
from axolotl.utils.tokenization import check_dataset_labels

LOG = get_logger(__name__)


@dataclass
class TrainDatasetMeta:
    """Dataclass with fields for training and validation datasets and metadata."""

    train_dataset: Dataset
    eval_dataset: Dataset | None = None
    total_num_steps: int | None = None


def sample_dataset(dataset: Dataset, num_samples: int) -> Dataset:
    """Randomly sample `num_samples` samples with replacement from `dataset`."""
    return dataset.select(
        [random.randrange(0, len(dataset) - 1) for _ in range(num_samples)]  # nosec
    )


@send_errors
def load_datasets(
    *,
    cfg: DictDefault,
    cli_args: PreprocessCliArgs | TrainerCliArgs | None = None,
    debug: bool = False,
) -> TrainDatasetMeta:
    """Loads one or more training or evaluation datasets, calling
    `axolotl.utils.data.prepare_datasets`. Optionally, logs out debug information.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: Command-specific CLI arguments.
        debug: Whether to print out tokenization of sample. This is duplicated in
            `cfg` and `cli_args`, but is kept due to use in our Colab notebooks.

    Returns:
        Dataclass with fields for training and evaluation datasets and the computed
            `total_num_steps`.
    """
    tokenizer = load_tokenizer(cfg)
    processor = load_processor(cfg, tokenizer=tokenizer) if cfg.processor_type else None

    train_dataset, eval_dataset, total_num_steps, prompters = prepare_datasets(
        cfg,
        tokenizer,
        processor=processor,
    )

    if (
        cfg.debug
        or getattr(cli_args, "debug", False)
        or getattr(cli_args, "debug_text_only", False)
        or getattr(cli_args, "debug_num_examples", 0) > 0
        or debug
    ):
        LOG.info("check_dataset_labels...")

        num_examples = cli_args.debug_num_examples if cli_args else 1
        text_only = cli_args.debug_text_only if cli_args else False
        try:
            train_samples = sample_dataset(train_dataset, num_examples)
            check_dataset_labels(
                train_samples,
                tokenizer,
                num_examples=num_examples,
                text_only=text_only,
            )
        except AttributeError:
            # can't sample iterable datasets
            pass

        LOG.info("printing prompters...")
        for prompter in prompters:
            LOG.info(prompter)

    return TrainDatasetMeta(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        total_num_steps=total_num_steps,
    )


@send_errors
def load_preference_datasets(
    *, cfg: DictDefault, cli_args: PreprocessCliArgs | TrainerCliArgs | None = None
) -> TrainDatasetMeta:
    """Loads one or more training or evaluation datasets for RL training using paired
    preference data, calling `axolotl.utils.data.rl.prepare_preference_datasets`.
    Optionally, logs out debug information.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        cli_args: Command-specific CLI arguments.

    Returns:
        Dataclass with fields for training and evaluation datasets and the computed
        `total_num_steps`.
    """
    tokenizer = load_tokenizer(cfg)
    train_dataset, eval_dataset = prepare_preference_datasets(cfg, tokenizer)

    total_num_steps: int | None = None
    if cfg.rl is not RLType.GRPO:
        total_num_steps = int(
            math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
        )

    if ((cli_args and cli_args.debug) or cfg.debug) and cfg.rl != RLType.ORPO:
        LOG.info("check_dataset_labels...")

        num_examples = cli_args.debug_num_examples if cli_args else 1
        text_only = cli_args.debug_text_only if cli_args else False

        tokenizer = load_tokenizer(cfg)
        train_samples = sample_dataset(train_dataset, num_examples)
        check_dataset_labels(
            dataset=train_samples,
            tokenizer=tokenizer,
            num_examples=num_examples,
            text_only=text_only,
            rl_mode=True,
        )

    return TrainDatasetMeta(
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        total_num_steps=total_num_steps,
    )


================================================
FILE: src/axolotl/convert.py
================================================
"""Module containing File Reader, File Writer, Json Parser, and Jsonl Serializer classes"""

import json
import sys


class FileReader:
    """
    Reads a file and returns its contents as a string
    """

    def read(self, file_path):
        with open(file_path, encoding="utf-8") as file:
            return file.read()


class FileWriter:
    """
    Writes a string to a file
    """

    def __init__(self, file_path):
        self.file_path = file_path

    def write(self, content):
        with open(self.file_path, "w", encoding="utf-8") as file:
            file.write(content)


class StdoutWriter:
    """
    Writes a string to stdout
    """

    def write(self, content):
        sys.stdout.write(content)
        sys.stdout.write("\n")


class JsonParser:
    """
    Parses a string as JSON and returns the result
    """

    def parse(self, content):
        return json.loads(content)


class JsonlSerializer:
    """
    Serializes a list of JSON objects into a JSONL string
    """

    def serialize(self, data):
        lines = [json.dumps(item) for item in data]
        return "\n".join(lines)


class JsonToJsonlConverter:
    """
    Converts a JSON file to JSONL
    """

    def __init__(self, file_reader, file_writer, json_parser, jsonl_serializer):
        self.file_reader = file_reader
        self.file_writer = file_writer
        self.json_parser = json_parser
        self.jsonl_serializer = jsonl_serializer

    def convert(self, input_file_path):
        content = self.file_reader.read(input_file_path)
        data = self.json_parser.parse(content)
        # data = [r for r in data if r["conversations"]]  # vicuna cleaned has rows with empty conversations
        jsonl_content = self.jsonl_serializer.serialize(data)
        self.file_writer.write(jsonl_content)


================================================
FILE: src/axolotl/core/__init__.py
================================================


================================================
FILE: src/axolotl/core/attention/__init__.py
================================================


================================================
FILE: src/axolotl/core/builders/__init__.py
================================================
"""Trainer builder classes"""

from .causal import HFCausalTrainerBuilder
from .rl import HFRLTrainerBuilder

__all__ = ["HFCausalTrainerBuilder", "HFRLTrainerBuilder"]


================================================
FILE: src/axolotl/core/builders/base.py
================================================
# Copyright 2024 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Base class for trainer builder"""

import abc
import importlib
import logging
import sys
from abc import abstractmethod
from contextlib import suppress
from pathlib import Path
from typing import Any

import torch
from transformers import TrainerCallback
from transformers.trainer_pt_utils import AcceleratorConfig

from axolotl.integrations.base import PluginManager
from axolotl.monkeypatch.trainer.lr import patch_trainer_get_lr
from axolotl.telemetry.callbacks import TelemetryCallback
from axolotl.telemetry.manager import TelemetryManager
from axolotl.utils import (
    is_comet_available,
    is_mlflow_available,
    is_opentelemetry_available,
    is_trackio_available,
)
from axolotl.utils.callbacks import (
    GCCallback,
    SaveAxolotlConfigtoWandBCallback,
    SaveModelOnFirstStepCallback,
)
from axolotl.utils.callbacks.profiler import PytorchProfilerCallback
from axolotl.utils.distributed import build_parallelism_config
from axolotl.utils.schemas.enums import CustomSupportedOptimizers

LOG = logging.getLogger(__name__)

with suppress(ImportError):
    import torch._dynamo


class TrainerBuilderBase(abc.ABC):
    """Base class for trainer builder."""

    def __init__(self, cfg, model, tokenizer, processor=None):
        self.cfg = cfg
        self.model = model
        self.tokenizer = tokenizer
        self.processor = processor

        self._train_dataset = None
        self._eval_dataset = None
        self._model_ref = None
        self._peft_config = None

        # If the model supports tagging, add the axolotl tag.
        # This makes sure the tag is correctly pushed even if a user calls
        # model.push_to_hub instead of trainer.push_to_hub.
        if hasattr(model, "add_model_tags"):
            model.add_model_tags(["axolotl"])

        patch_trainer_get_lr()

    @property
    def model_ref(self):
        return self._model_ref

    @model_ref.setter
    def model_ref(self, model):
        self._model_ref = model

    @property
    def train_dataset(self):
        return self._train_dataset

    @train_dataset.setter
    def train_dataset(self, dataset):
        self._train_dataset = dataset

    @property
    def eval_dataset(self):
        return self._eval_dataset

    @eval_dataset.setter
    def eval_dataset(self, dataset):
        self._eval_dataset = dataset

    @property
    def peft_config(self):
        return self._peft_config

    @peft_config.setter
    def peft_config(self, peft_config):
        self._peft_config = peft_config

    @abstractmethod
    def build(self, total_num_steps):
        pass

    def get_callbacks(self) -> list[TrainerCallback]:
        callbacks = []

        plugin_manager = PluginManager.get_instance()
        callbacks.extend(
            plugin_manager.add_callbacks_pre_trainer(cfg=self.cfg, model=self.model)
        )

        if self.cfg.gc_steps:
            callbacks.append(GCCallback(gc_steps=self.cfg.gc_steps))

        if self.cfg.dynamic_checkpoint and self.cfg.dynamic_checkpoint.enabled:
            from axolotl.utils.callbacks.dynamic_checkpoint import (
                DynamicCheckpointCallback,
            )

            callbacks.append(DynamicCheckpointCallback(self.cfg))

        if self.cfg.use_wandb:
            callbacks.append(
                SaveAxolotlConfigtoWandBCallback(self.cfg.axolotl_config_path)
            )
        if self.cfg.use_mlflow and is_mlflow_available():
            from axolotl.utils.callbacks.mlflow_ import (
                SaveAxolotlConfigtoMlflowCallback,
            )

            callbacks.extend(
                [
                    SaveAxolotlConfigtoMlflowCallback(self.cfg.axolotl_config_path),
                ]
            )
        if self.cfg.use_comet and is_comet_available():
            from axolotl.utils.callbacks.comet_ import SaveAxolotlConfigtoCometCallback

            callbacks.append(
                SaveAxolotlConfigtoCometCallback(self.cfg.axolotl_config_path)
            )
        if self.cfg.use_trackio and is_trackio_available():
            from axolotl.utils.callbacks.trackio_ import (
                SaveAxolotlConfigtoTrackioCallback,
            )

            callbacks.append(
                SaveAxolotlConfigtoTrackioCallback(self.cfg.axolotl_config_path)
            )
        if self.cfg.use_otel_metrics and is_opentelemetry_available():
            from axolotl.utils.callbacks.opentelemetry import (
                OpenTelemetryMetricsCallback,
            )

            callbacks.append(OpenTelemetryMetricsCallback(self.cfg))
        if self.cfg.save_first_step:
            callbacks.append(SaveModelOnFirstStepCallback())

        if self.cfg.profiler_steps:
            callbacks.append(
                PytorchProfilerCallback(
                    steps_to_profile=self.cfg.profiler_steps,
                    profiler_steps_start=self.cfg.profiler_steps_start,
                )
            )

        telemetry_manager = TelemetryManager.get_instance()
        if telemetry_manager.enabled:
            callbacks.append(TelemetryCallback())

        return callbacks

    def get_post_trainer_create_callbacks(self, trainer):
        """
        Callbacks added after the trainer is created, usually b/c these need access to the trainer
        """
        callbacks = []
        if self.cfg.plugins:
            plugin_manager = PluginManager.get_instance()
            callbacks.extend(
                [
                    cb
                    for cb in plugin_manager.add_callbacks_post_trainer(
                        self.cfg, trainer
                    )
                    if cb
                ]
            )
        return callbacks

    def hook_pre_create_training_args(self, training_arguments_kwargs):
        # TODO
        return training_arguments_kwargs

    def hook_post_create_training_args(self, training_arguments):
        # TODO
        return training_arguments

    def hook_pre_create_trainer(self, trainer_kwargs, trainer_cls):
        # TODO
        return trainer_kwargs, trainer_cls

    def hook_post_create_trainer(self, trainer):
        # TODO
        return trainer

    def _configure_warmup_and_logging(
        self, total_num_steps: int, training_args_kwargs: dict
    ):
        warmup_steps: int | float = 0
        warmup_ratio = 0.0
        if self.cfg.warmup_steps is not None:
            warmup_steps = self.cfg.warmup_steps
        elif self.cfg.warmup_ratio is not None:
            if total_num_steps:
                warmup_steps = max(int(self.cfg.warmup_ratio * total_num_steps), 0)
            else:
                warmup_ratio = self.cfg.warmup_ratio
        elif total_num_steps:
            warmup_steps = min(int(0.03 * total_num_steps), 100)
        else:
            warmup_ratio = 0.03

        # transformers v5
        if warmup_ratio > 0.0 and warmup_steps == 0:
            warmup_steps = warmup_ratio

        if warmup_steps == 1:
            warmup_steps = 2

        if self.cfg.logging_steps is not None:
            training_args_kwargs["logging_steps"] = self.cfg.logging_steps
        else:
            training_args_kwargs["logging_steps"] = (
                500  # transformers defaults to 500
                if not total_num_steps
                else max(min(int(0.005 * total_num_steps), 10), 1)
            )

        training_args_kwargs["warmup_steps"] = warmup_steps

    def _configure_precision_settings(self, training_args_kwargs: dict):
        training_args_kwargs["fp16"] = (self.cfg.fp16 and not self.cfg.bf16) or False
        training_args_kwargs["tf32"] = True if self.cfg.tf32 is True else False
        if self.cfg.bf16 == "full":
            training_args_kwargs["bf16_full_eval"] = True
        else:
            bf16 = self.cfg.bf16 or self.cfg.bfloat16
            bf16 = bf16 if bf16 is not None else False
            training_args_kwargs["bf16"] = bf16

    def _configure_scheduler(self, training_args_kwargs: dict):
        if self.cfg.lr_scheduler in ["one_cycle", "rex"]:
            training_args_kwargs["lr_scheduler_type"] = "cosine"
            training_args_kwargs["alternate_lr_scheduler_type"] = self.cfg.lr_scheduler
        else:
            training_args_kwargs["lr_scheduler_type"] = (
                self.cfg.lr_scheduler if self.cfg.lr_scheduler else "cosine"
            )
        training_args_kwargs["lr_scheduler_kwargs"] = (
            self.cfg.lr_scheduler_kwargs if self.cfg.lr_scheduler_kwargs else {}
        )

    def _configure_optimizer(self, training_args_kwargs: dict, trainer_kwargs: dict):
        def _configure_custom_optimizer(
            training_args_kwargs: dict, trainer_kwargs: dict
        ):
            # Common optimizer kwargs
            optimizer_kwargs = {
                "lr": training_args_kwargs["learning_rate"],
                "weight_decay": training_args_kwargs["weight_decay"],
            }

            # Adam-specific kwargs
            adam_kwargs: dict = {}
            if training_args_kwargs.get("adam_beta1") and training_args_kwargs.get(
                "adam_beta2"
            ):
                adam_kwargs["betas"] = (
                    training_args_kwargs.get("adam_beta1"),
                    training_args_kwargs.get("adam_beta2"),
                )
            if training_args_kwargs.get("adam_epsilon"):
                adam_kwargs["eps"] = training_args_kwargs.get("adam_epsilon")

            if self.cfg.optimizer == "muon":
                _, device_mesh = build_parallelism_config(self.cfg)

                if device_mesh is not None:
                    from axolotl.contribs.mit.muon.dist_muon import (
                        DistMuonOptimizerFactory,
                    )

                    optimizer_cls = DistMuonOptimizerFactory
                    optimizer_kwargs["device_mesh"] = device_mesh
                else:
                    from axolotl.contribs.mit.muon import (
                        MuonOptimizerFactory,
                    )

                    optimizer_cls = MuonOptimizerFactory

                optimizer_kwargs.update(adam_kwargs)
            elif self.cfg.optimizer == "dion":
                from axolotl.contribs.mit.dion import (
                    DionOptimizerFactory,
                )

                optimizer_cls = DionOptimizerFactory
                optimizer_kwargs["dion_lr"] = training_args_kwargs["dion_learning_rate"]
                optimizer_kwargs["dion_mu"] = training_args_kwargs["dion_momentum"]
                optimizer_kwargs.update(adam_kwargs)
                _, device_mesh = build_parallelism_config(self.cfg)
                if device_mesh is not None:
                    optimizer_kwargs["device_mesh"] = device_mesh
            elif self.cfg.optimizer == "optimi_adamw":
                from optimi import AdamW

                optimizer_kwargs["foreach"] = False
                optimizer_cls = AdamW
                optimizer_kwargs.update(adam_kwargs)
            elif self.cfg.optimizer == "ao_adamw_fp8":
                from torchao.prototype.low_bit_optim import AdamWFp8

                optimizer_cls = AdamWFp8
                optimizer_kwargs.update(adam_kwargs)
            elif self.cfg.optimizer == "adopt_adamw":
                from axolotl.utils.optimizers.adopt import ADOPT

                optimizer_cls = ADOPT
                adam_kwargs["decouple"] = True
                optimizer_kwargs.update(adam_kwargs)
            elif self.cfg.optimizer == "came_pytorch":
                from came_pytorch import CAME

                optimizer_cls = CAME

                beta1 = training_args_kwargs.get("adam_beta1", 0.9)
                beta2 = training_args_kwargs.get("adam_beta2", 0.999)
                beta3 = training_args_kwargs.get("adam_beta3", 0.9999)
                eps1 = training_args_kwargs.get("adam_epsilon", 1e-30)
                eps2 = training_args_kwargs.get("adam_epsilon2", 1e-16)
                adam_kwargs["betas"] = (beta1, beta2, beta3)
                adam_kwargs["eps"] = (eps1, eps2)

                optimizer_kwargs.update(adam_kwargs)
            elif self.cfg.optimizer == "flash_adamw":
                from flashoptim import FlashAdamW

                optimizer_cls = FlashAdamW
                optimizer_kwargs.update(adam_kwargs)
            elif self.cfg.optimizer == "flash_adam":
                from flashoptim import FlashAdam

                optimizer_cls = FlashAdam
                optimizer_kwargs.update(adam_kwargs)
            elif self.cfg.optimizer == "flash_sgd":
                from flashoptim import FlashSGD

                optimizer_cls = FlashSGD
            elif self.cfg.optimizer == "flash_sgdw":
                from flashoptim import FlashSGDW

                optimizer_cls = FlashSGDW
            elif self.cfg.optimizer == "flash_lion":
                from flashoptim import FlashLion

                optimizer_cls = FlashLion
                if "betas" in adam_kwargs:
                    optimizer_kwargs["betas"] = adam_kwargs["betas"]
            else:
                raise ValueError(
                    f"Unhandled optimizer: {self.cfg.optimizer}. Please raise an Issue."
                )

            # Parse any additional optimizer args from config
            if self.cfg.optim_args:
                if isinstance(self.cfg.optim_args, dict):
                    optimizer_kwargs.update(self.cfg.optim_args)
                else:
                    # Parse string format "key1=value1,key2=value2"
                    for mapping in self.cfg.optim_args.replace(" ", "").split(","):
                        key, value = mapping.split("=")
                        optimizer_kwargs[key] = value

            # Note: This is not used in training_args_kwargs, but in trainer_kwargs
            trainer_kwargs["optimizer_cls_and_kwargs"] = (
                optimizer_cls,
                optimizer_kwargs,
            )

        # Handle custom optimizer
        custom_supported_optimizers = [opt.value for opt in CustomSupportedOptimizers]
        if self.cfg.optimizer in custom_supported_optimizers:
            _configure_custom_optimizer(training_args_kwargs, trainer_kwargs)
        else:
            # Use transformers' optimizer
            training_args_kwargs["optim"] = self.cfg.optimizer

            # Parse any additional optimizer args from config
            if self.cfg.optim_args:
                if isinstance(self.cfg.optim_args, dict):
                    optim_args = ",".join(
                        [f"{key}={value}" for key, value in self.cfg.optim_args.items()]
                    )
                else:
                    optim_args = self.cfg.optim_args
                training_args_kwargs["optim_args"] = optim_args

            if (
                self.cfg.optimizer == "adamw_anyprecision"
                and Path(self.cfg.torchdistx_path).exists()
            ):
                sys.path.append(self.cfg.torchdistx_path)
                importlib.import_module("torchdistx")

    def _configure_hub_parameters(self, training_args_kwargs: dict):
        if self.cfg.hub_model_id:
            training_args_kwargs["hub_model_id"] = self.cfg.hub_model_id
            training_args_kwargs["push_to_hub"] = True
            training_args_kwargs["hub_private_repo"] = True
            training_args_kwargs["hub_always_push"] = True

            if self.cfg.hub_strategy:
                training_args_kwargs["hub_strategy"] = self.cfg.hub_strategy

            if self.cfg.hub_revision:
                training_args_kwargs["hub_revision"] = self.cfg.hub_revision

    def _configure_save_and_eval_strategy(self, training_args_kwargs: dict):
        # save_strategy and save_steps
        if self.cfg.save_steps:
            training_args_kwargs["save_strategy"] = "steps"
            training_args_kwargs["save_steps"] = self.cfg.save_steps
        elif self.cfg.save_strategy:
            training_args_kwargs["save_strategy"] = self.cfg.save_strategy
        else:
            # default to saving each epoch if not defined
            training_args_kwargs["save_strategy"] = "epoch"

        training_args_kwargs["save_total_limit"] = (
            self.cfg.save_total_limit if self.cfg.save_total_limit else 4
        )

        # eval_strategy and eval_steps
        if not self.eval_dataset and self.cfg.val_set_size == 0:
            # do not eval if no eval_dataset and val_set_size=0
            training_args_kwargs["eval_strategy"] = "no"
        elif self.cfg.eval_steps:
            training_args_kwargs["eval_strategy"] = "steps"
            training_args_kwargs["eval_steps"] = self.cfg.eval_steps
            training_args_kwargs["eval_on_start"] = True
        elif self.cfg.eval_strategy:
            training_args_kwargs["eval_strategy"] = self.cfg.eval_strategy
            training_args_kwargs["eval_on_start"] = True

    def _configure_reporting(self, training_args_kwargs: dict):
        report_to = []
        if self.cfg.use_wandb:
            report_to.append("wandb")
        if self.cfg.use_mlflow:
            report_to.append("mlflow")
        if self.cfg.use_tensorboard:
            report_to.append("tensorboard")
        if self.cfg.use_comet:
            report_to.append("comet_ml")
        if self.cfg.use_trackio:
            report_to.append("trackio")

        training_args_kwargs["report_to"] = report_to

        if self.cfg.use_wandb:
            training_args_kwargs["run_name"] = self.cfg.wandb_name
        elif self.cfg.use_mlflow:
            training_args_kwargs["run_name"] = self.cfg.mlflow_run_name
        elif self.cfg.use_trackio:
            training_args_kwargs["run_name"] = self.cfg.trackio_run_name
        else:
            training_args_kwargs["run_name"] = None

    def _configure_torch_compile(self, training_args_kwargs: dict):
        if self.cfg.torch_compile and getattr(torch, "_dynamo", None):
            torch._dynamo.config.suppress_errors = True
            torch._dynamo.config.accumulated_cache_size_limit = 256
            training_args_kwargs["torch_compile"] = self.cfg.torch_compile
            if self.cfg.torch_compile_backend:
                training_args_kwargs["torch_compile_backend"] = (
                    self.cfg.torch_compile_backend
                )
            if self.cfg.torch_compile_mode:
                training_args_kwargs["torch_compile_mode"] = self.cfg.torch_compile_mode

    def _configure_accelerator_config(self, training_args_kwargs: dict):
        if self.cfg.accelerator_config:
            training_args_kwargs["accelerator_config"] = AcceleratorConfig(
                **self.cfg.accelerator_config
            )
        else:
            training_args_kwargs["accelerator_config"] = AcceleratorConfig()

    def _configure_gradient_checkpointing(self, training_args_kwargs: dict):
        if self.cfg.activation_offloading is True:
            # don't use the HF gradient checkpointing, manually wrap
            training_args_kwargs["gradient_checkpointing"] = False
            training_args_kwargs["activation_offloading"] = True
        elif self.cfg.gradient_checkpointing is not None:
            training_args_kwargs["gradient_checkpointing"] = (
                self.cfg.gradient_checkpointing
            )
            if self.cfg.gradient_checkpointing_kwargs is not None:
                training_args_kwargs["gradient_checkpointing_kwargs"] = (
                    self.cfg.gradient_checkpointing_kwargs
                )
            else:
                training_args_kwargs["gradient_checkpointing_kwargs"] = {
                    "use_reentrant": False
                }

    def _set_base_training_args(
        self, total_num_steps
    ) -> tuple[dict[str, Any], dict[str, Any]]:
        training_args_kwargs: dict[str, Any] = {}
        trainer_kwargs: dict[str, Any] = {}

        self._configure_warmup_and_logging(total_num_steps, training_args_kwargs)
        self._configure_precision_settings(training_args_kwargs)
        self._configure_save_and_eval_strategy(training_args_kwargs)
        self._configure_gradient_checkpointing(training_args_kwargs)

        # set arg into trainer_args_kwargs with same name if value not None
        for arg in [
            # optim/scheduler
            "adam_beta1",
            "adam_beta2",
            "adam_beta3",
            "adam_epsilon",
            "adam_epsilon2",
            "cosine_min_lr_ratio",
            "cosine_constant_lr_ratio",
            "optim_target_modules",
            # trainer
            "max_grad_norm",
            "dataloader_num_workers",
            "dataloader_pin_memory",
            "dataloader_prefetch_factor",
            "gradient_accumulation_steps",
            "learning_rate",
            "embedding_lr",
            "embedding_lr_scale",
            "lr_groups",
            "loraplus_lr_ratio",
            "loraplus_lr_embedding",
            "output_dir",
            "save_only_model",
            "weight_decay",
            "seed",
            "dion_momentum",
            "dion_rank_fraction",
            "dion_rank_multiple_of",
            "dataset_num_proc",
        ]:
            if hasattr(self.cfg, arg) and getattr(self.cfg, arg) is not None:
                training_args_kwargs[arg] = getattr(self.cfg, arg)

        arg_map = {
            "dion_learning_rate": "dion_lr",
            "include_num_input_tokens_seen": "include_tokens_per_second",
        }
        for kwarg, cfg_arg in arg_map.items():
            if hasattr(self.cfg, cfg_arg) and getattr(self.cfg, cfg_arg) is not None:
                training_args_kwargs[kwarg] = getattr(self.cfg, cfg_arg)

        training_args_kwargs["per_device_train_batch_size"] = self.cfg.micro_batch_size
        training_args_kwargs["average_tokens_across_devices"] = False

        if self.cfg.eval_batch_size:
            training_args_kwargs["per_device_eval_batch_size"] = (
                self.cfg.eval_batch_size
            )

        training_args_kwargs["include_tkps"] = self.cfg.include_tkps
        training_args_kwargs["max_steps"] = self.cfg.max_steps or total_num_steps or -1
        training_args_kwargs["num_train_epochs"] = self.cfg.num_epochs

        # max_length is not used in CausalTrainer
        if self.cfg.reward_model or self.cfg.rl:
            training_args_kwargs["max_length"] = self.cfg.sequence_len

        if self.cfg.fsdp_config or self.cfg.fsdp:
            training_args_kwargs["fsdp_config"] = self.cfg.fsdp_config
            training_args_kwargs["fsdp"] = self.cfg.fsdp if self.cfg.fsdp else True

        self._configure_reporting(training_args_kwargs)
        self._configure_hub_parameters(training_args_kwargs)
        self._configure_scheduler(training_args_kwargs)
        self._configure_optimizer(training_args_kwargs, trainer_kwargs)
        self._configure_torch_compile(training_args_kwargs)
        self._configure_accelerator_config(training_args_kwargs)

        return training_args_kwargs, trainer_kwargs


================================================
FILE: src/axolotl/core/builders/causal.py
================================================
"""Builder for causal trainers"""

import inspect
import math
import os
from pathlib import Path
from typing import Type, Union

import transformers
from transformers import (
    DataCollatorWithFlattening,
    EarlyStoppingCallback,
    Trainer,
)
from trl.trainer.reward_trainer import DataCollatorForPreference

from axolotl.core.builders.base import TrainerBuilderBase
from axolotl.core.trainers import (
    AxolotlMambaTrainer,
    AxolotlPRMTrainer,
    AxolotlRewardTrainer,
    AxolotlTrainer,
)
from axolotl.integrations.base import PluginManager
from axolotl.monkeypatch.multipack import SUPPORTED_MULTIPACK_MODEL_TYPES
from axolotl.monkeypatch.relora import ReLoRACallback
from axolotl.processing_strategies import get_processing_strategy
from axolotl.utils import is_comet_available, is_mlflow_available
from axolotl.utils.callbacks import (
    LossWatchDogCallback,
    bench_eval_callback_factory,
    causal_lm_bench_eval_callback_factory,
    colab_inference_post_train_callback,
    log_prediction_callback_factory,
)
from axolotl.utils.callbacks.lisa import lisa_callback_factory
from axolotl.utils.callbacks.qat import QATCallback
from axolotl.utils.callbacks.tokens_per_second import TokensPerSecondCallback
from axolotl.utils.chat_templates import get_chat_template_from_config
from axolotl.utils.collators import (
    BatchSamplerDataCollatorForSeq2Seq,
    DataCollatorForSeq2Seq,
    MambaDataCollator,
    V2BatchSamplerDataCollatorForSeq2Seq,
)
from axolotl.utils.collators.mm_chat import MultiModalChatDataCollator
from axolotl.utils.import_helper import get_cls_from_module_str
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


class HFCausalTrainerBuilder(TrainerBuilderBase):
    """
    Build the HuggingFace training args/trainer for causal models and reward modeling
    using TRL.
    """

    def get_callbacks(self):
        callbacks = super().get_callbacks()

        if self.cfg.relora:
            callbacks.append(ReLoRACallback(self.cfg))

        # TODO: check if can move to base class
        if self.cfg.loss_watchdog_threshold is not None:
            callbacks.append(LossWatchDogCallback(self.cfg))

        if self.cfg.qat:
            callbacks.append(QATCallback(self.cfg.qat))

        if self.cfg.include_tkps:
            callbacks.append(
                TokensPerSecondCallback(
                    self.cfg.tensor_parallel_size,
                    self.cfg.context_parallel_size,
                    resume_from_checkpoint=self.cfg.resume_from_checkpoint,
                )
            )
        return callbacks

    def get_post_trainer_create_callbacks(self, trainer):
        callbacks = []
        if self.cfg.use_wandb and self.cfg.eval_table_size > 0:
            LogPredictionCallback = log_prediction_callback_factory(
                trainer, self.tokenizer, "wandb"
            )
            callbacks.append(LogPredictionCallback(self.cfg))
        if (
            self.cfg.use_mlflow
            and is_mlflow_available()
            and self.cfg.eval_table_size > 0
        ):
            LogPredictionCallback = log_prediction_callback_factory(
                trainer, self.tokenizer, "mlflow"
            )
            callbacks.append(LogPredictionCallback(self.cfg))
        if self.cfg.use_comet and is_comet_available() and self.cfg.eval_table_size > 0:
            LogPredictionCallback = log_prediction_callback_factory(
                trainer, self.tokenizer, "comet_ml"
            )
            callbacks.append(LogPredictionCallback(self.cfg))

        if self.cfg.do_bench_eval:
            callbacks.append(bench_eval_callback_factory(trainer, self.tokenizer))
        if self.cfg.do_causal_lm_eval:
            CausalLMBenchEvalCallback = causal_lm_bench_eval_callback_factory(
                trainer, self.tokenizer
            )
            callbacks.append(CausalLMBenchEvalCallback(self.cfg))

        if self.cfg.early_stopping_patience:
            early_stop_cb = EarlyStoppingCallback(
                self.cfg.early_stopping_patience,
            )
            callbacks.append(early_stop_cb)

        if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
            callbacks.append(lisa_callback_factory(trainer))

        if any("COLAB_" in key for key in os.environ):
            ColabCallback = colab_inference_post_train_callback(trainer)
            callbacks.append(ColabCallback(self.cfg))

        if getattr(self.cfg, "generate_samples", False):
            from axolotl.utils.callbacks.generation import SFTGenerationCallback

            callbacks.append(SFTGenerationCallback(trainer))
            LOG.info("SFT sample generation enabled")

        callbacks.extend(super().get_post_trainer_create_callbacks(trainer=trainer))
        return callbacks

    def _get_trainer_cls(self):
        """
        Gets the trainer class for the given configuration.
        """
        if self.cfg.plugins:
            plugin_manager = PluginManager.get_instance()
            trainer_cls = plugin_manager.get_trainer_cls(self.cfg)
            if trainer_cls:
                return trainer_cls
        if self.cfg.model_config_type == "mamba":
            return AxolotlMambaTrainer
        if self.cfg.reward_model:
            return AxolotlRewardTrainer
        if self.cfg.process_reward_model:
            return AxolotlPRMTrainer

        if self.cfg.trainer_cls:
            # override the trainer cls
            try:
                trainer_cls = get_cls_from_module_str(self.cfg.trainer_cls)
                LOG.debug(f"Using custom trainer class: {self.cfg.trainer_cls}")
                return trainer_cls
            except (ImportError, AttributeError, ValueError) as e:
                raise ValueError(
                    f"Failed to load custom trainer class '{self.cfg.trainer_cls}': {e}"
                ) from e

        return AxolotlTrainer

    def build(self, total_num_steps):
        from axolotl.core.training_args import (
            AxolotlPRMConfig,
            AxolotlRewardConfig,
            AxolotlTrainingArguments,
        )

        training_arguments_kwargs, trainer_kwargs = self._set_base_training_args(
            total_num_steps
        )
        if self.cfg.adapter == "qlora":
            training_arguments_kwargs["qlora"] = True

        # deepspeed
        if self.cfg.deepspeed:
            training_arguments_kwargs["deepspeed"] = self.cfg.deepspeed

        if self.cfg.lr_quadratic_warmup is not None:
            training_arguments_kwargs["lr_quadratic_warmup"] = (
                self.cfg.lr_quadratic_warmup
            )

        if self.cfg.dataloader_drop_last is not None:
            training_arguments_kwargs["dataloader_drop_last"] = (
                self.cfg.dataloader_drop_last
            )
        elif self.cfg.sample_packing and self.cfg.eval_sample_packing is False:
            training_arguments_kwargs["dataloader_drop_last"] = True

        if self.cfg.remove_unused_columns is not None:
            training_arguments_kwargs["remove_unused_columns"] = (
                self.cfg.remove_unused_columns
            )

        if self.cfg.do_bench_eval:
            training_arguments_kwargs["do_bench_eval"] = self.cfg.do_bench_eval
            if self.cfg.bench_dataset:
                training_arguments_kwargs["bench_dataset"] = self.cfg.bench_dataset
        if self.cfg.do_causal_lm_eval:
            training_arguments_kwargs["do_causal_lm_eval"] = self.cfg.do_causal_lm_eval
        if self.cfg.metric_for_best_model:
            training_arguments_kwargs["metric_for_best_model"] = (
                self.cfg.metric_for_best_model
            )
        if self.cfg.greater_is_better:
            training_arguments_kwargs["greater_is_better"] = self.cfg.greater_is_better

        # DDP Config
        if self.cfg.ddp_timeout:
            training_arguments_kwargs["ddp_timeout"] = self.cfg.ddp_timeout
        # see https://pytorch.org/docs/stable/generated/torch.nn.parallel.DistributedDataParallel.html
        if self.cfg.ddp_bucket_cap_mb:
            training_arguments_kwargs["ddp_bucket_cap_mb"] = self.cfg.ddp_bucket_cap_mb
        if self.cfg.ddp_broadcast_buffers is not None:
            training_arguments_kwargs["ddp_broadcast_buffers"] = (
                self.cfg.ddp_broadcast_buffers
            )

        # these are all the "standard" kwargs that are def used
        training_arguments_kwargs["max_seq_length"] = self.cfg.sequence_len

        if self.cfg.auto_find_batch_size is not None:
            training_arguments_kwargs["auto_find_batch_size"] = (
                self.cfg.auto_find_batch_size
            )

        training_arguments_kwargs["eval_accumulation_steps"] = (
            self.cfg.gradient_accumulation_steps
        )

        training_arguments_kwargs["load_best_model_at_end"] = (
            (
                self.cfg.load_best_model_at_end is not False
                or self.cfg.early_stopping_patience
            )
            and (
                (not self.cfg.test_datasets and self.cfg.val_set_size > 0)
                or (self.cfg.test_datasets and self.cfg.val_set_size == 0)
            )
            and self.cfg.save_steps
            and self.cfg.eval_steps
            and self.cfg.save_steps % self.cfg.eval_steps == 0
        ) or False

        # handle ddp
        ddp_find_unused_parameters = None
        if self.cfg.ddp:
            ddp_find_unused_parameters = bool(self.cfg.ddp_find_unused_parameters)
        training_arguments_kwargs["ddp_find_unused_parameters"] = (
            ddp_find_unused_parameters
        )

        if self.cfg.group_by_length:
            training_arguments_kwargs["train_sampling_strategy"] = "group_by_length"
        training_arguments_kwargs["curriculum_sampling"] = self.cfg.curriculum_sampling

        training_arguments_kwargs["sample_packing"] = bool(self.cfg.sample_packing)
        training_arguments_kwargs["sample_packing_drop_attention_mask"] = bool(
            self.cfg.flash_attention
            or self.cfg.xformers_attention
            or self.cfg.flex_attention
        )
        training_arguments_kwargs["multipack_real_batches"] = (
            self.cfg.multipack_real_batches
            if self.cfg.multipack_real_batches is not None
            else not (
                self.cfg.flash_attention
                or self.cfg.flex_attention
                or self.cfg.xformers_attention
            )
        )
        training_arguments_kwargs["eval_sample_packing"] = bool(
            self.cfg.eval_sample_packing
        )
        if self.cfg.sample_packing_sequentially is not None:
            training_arguments_kwargs["sample_packing_sequentially"] = (
                self.cfg.sample_packing_sequentially
            )
        if self.cfg.sample_packing_bin_size is not None:
            training_arguments_kwargs["sample_packing_bin_size"] = (
                self.cfg.sample_packing_bin_size
            )
        if self.cfg.sample_packing_group_size is not None:
            training_arguments_kwargs["sample_packing_group_size"] = (
                self.cfg.sample_packing_group_size
            )
        if self.cfg.sample_packing_eff_est:
            training_arguments_kwargs["sample_packing_efficiency"] = (
                self.cfg.sample_packing_eff_est
            )

        if self.cfg.relora and self.cfg.jagged_restart_steps:
            if self.cfg.relora_prune_ratio:
                training_arguments_kwargs["relora_prune_ratio"] = (
                    self.cfg.relora_prune_ratio
                )

        if self.cfg.jagged_restart_steps:
            training_arguments_kwargs["jagged_restart_steps"] = (
                self.cfg.jagged_restart_steps
            )
            if self.cfg.jagged_restart_warmup_steps:
                training_arguments_kwargs["jagged_restart_warmup_steps"] = (
                    self.cfg.jagged_restart_warmup_steps
                )
            if self.cfg.jagged_restart_anneal_steps:
                training_arguments_kwargs["jagged_restart_anneal_steps"] = (
                    self.cfg.jagged_restart_anneal_steps
                )

        if self.cfg.lisa_step_interval and self.cfg.lisa_n_layers:
            training_arguments_kwargs["lisa_n_layers"] = self.cfg.lisa_n_layers
            training_arguments_kwargs["lisa_step_interval"] = (
                self.cfg.lisa_step_interval
            )
            training_arguments_kwargs["lisa_layers_attribute"] = (
                self.cfg.lisa_layers_attribute
            )

        training_arguments_kwargs = self.hook_pre_create_training_args(
            training_arguments_kwargs
        )
        training_arguments_kwargs["model_type"] = self.cfg.model_config_type
        training_arguments_kwargs["pretraining"] = bool(self.cfg.pretraining_dataset)
        if self.cfg.chat_template:
            training_arguments_kwargs["chat_template"] = get_chat_template_from_config(
                cfg=self.cfg,
                tokenizer=self.tokenizer,
            )

        if self.cfg.neftune_noise_alpha is not None:
            training_arguments_kwargs["neftune_noise_alpha"] = (
                self.cfg.neftune_noise_alpha
            )

        if self.cfg.image_size:
            training_arguments_kwargs["image_size"] = self.cfg.image_size
        if self.cfg.image_resize_algorithm:
            training_arguments_kwargs["image_resize_algorithm"] = (
                self.cfg.image_resize_algorithm
            )

        if self.cfg.plugins:
            plugin_manager = PluginManager.get_instance()
            plugin_training_args = plugin_manager.get_training_args(self.cfg)
            if plugin_training_args:
                training_arguments_kwargs.update(plugin_training_args)

        if self.cfg.reward_model:
            training_args_cls = AxolotlRewardConfig
            if self.cfg.center_rewards_coefficient is not None:
                training_arguments_kwargs["center_rewards_coefficient"] = (
                    self.cfg.center_rewards_coefficient
                )
        elif self.cfg.process_reward_model:
            training_args_cls = AxolotlPRMConfig
        else:
            training_args_cls = AxolotlTrainingArguments
        training_args = training_args_cls(
            **training_arguments_kwargs,
        )
        training_args = self.hook_post_create_training_args(training_args)

        # unset run_name so wandb sets up experiment names
        if self.cfg.use_wandb and training_args.run_name == training_args.output_dir:
            training_args.run_name = None

        data_collator_kwargs = {
            "padding": True,  # True/"longest" is the default
        }
        multiple = 64
        if self.cfg.pad_to_sequence_len:
            data_collator_kwargs["pad_to_multiple_of"] = multiple * math.ceil(
                self.cfg.sequence_len / multiple
            )
        elif self.cfg.pad_to_sequence_len is None:
            # A100 is best at 64, while others at 8. Let's use the larger so we don't have to check
            # https://docs.nvidia.com/deeplearning/performance/dl-performance-matrix-multiplication/index.html
            data_collator_kwargs["pad_to_multiple_of"] = multiple

        if self.cfg.use_eaft:
            from functools import partial

            from axolotl.monkeypatch.loss.eaft import eaft_loss

            configured_eaft_loss = partial(
                eaft_loss,
                alpha=self.cfg.eaft_alpha if self.cfg.eaft_alpha is not None else 1.0,
                k=self.cfg.eaft_k if self.cfg.eaft_k is not None else 20,
            )
            trainer_kwargs["compute_loss_func"] = configured_eaft_loss

        trainer_cls = self._get_trainer_cls()

        trainer_kwargs, trainer_cls = self.hook_pre_create_trainer(
            trainer_kwargs, trainer_cls
        )
        if eval_data_collator := self.build_collator(
            training_args, is_eval=True, **data_collator_kwargs
        ):
            if not (self.cfg.reward_model or self.cfg.process_reward_model):
                trainer_kwargs["eval_data_collator"] = eval_data_collator
        if not (self.cfg.reward_model or self.cfg.process_reward_model):
            trainer_kwargs["bench_data_collator"] = transformers.DataCollatorForSeq2Seq(
                self.tokenizer,
                return_tensors="pt",
                **data_collator_kwargs,
            )
        sig = inspect.signature(trainer_cls)
        if "processing_class" in sig.parameters or issubclass(trainer_cls, Trainer):
            trainer_kwargs["processing_class"] = self.tokenizer
        elif "tokenizer" in sig.parameters:
            trainer_kwargs["tokenizer"] = self.tokenizer

        if (
            trainer_cls not in [AxolotlRewardTrainer, AxolotlPRMTrainer]
            and self.cfg.datasets is not None
        ):
            trainer_kwargs["dataset_tags"] = [
                d["path"] for d in self.cfg.datasets if not Path(d["path"]).is_dir()
            ]
        # TRL's RewardTrainer validates num_labels=1 on pre-loaded models; ensure the
        # config reflects this regardless of how the model was instantiated.
        if (
            self.cfg.reward_model
            and getattr(self.model.config, "num_labels", None) != 1
        ):
            self.model.config.num_labels = 1
        trainer = trainer_cls(
            model=self.model,
            train_dataset=self.train_dataset,
            eval_dataset=self.eval_dataset,
            args=training_args,
            data_collator=self.build_collator(training_args, **data_collator_kwargs),
            callbacks=self.get_callbacks(),
            **trainer_kwargs,
        )
        trainer = self.hook_post_create_trainer(trainer)
        # if the trainer has the `axolotl_cfg` property, set it
        if hasattr(trainer, "axolotl_cfg"):
            trainer.axolotl_cfg = self.cfg
        for callback in self.get_post_trainer_create_callbacks(trainer):
            trainer.add_callback(callback)

        if self.cfg.deepspeed and self.cfg.sample_packing:
            trainer.accelerator.state.deepspeed_plugin.deepspeed_config[
                "train_micro_batch_size_per_gpu"
            ] = self.cfg.micro_batch_size

        return trainer

    def build_collator(
        self,
        training_args,  # type: "AxolotlTrainingArguments"  # type: ignore
        is_eval=False,
        **kwargs,
    ):
        if training_args.pretraining:
            if (
                self.cfg.pretraining_sample_concatenation is False
                or self.cfg.micro_batch_size > 1
            ):
                return DataCollatorForSeq2Seq(self.tokenizer, **kwargs)
            if not (self.cfg.sample_packing and self.cfg.pretrain_multipack_attn) or (
                self.cfg.micro_batch_size == 1 and is_eval is False
            ):
                return None

        if self.cfg.model_config_type == "mamba":
            return MambaDataCollator(tokenizer=self.tokenizer)

        use_batch_sampler_collator = False
        if is_eval is False and training_args.sample_packing:
            use_batch_sampler_collator = True
        if is_eval and training_args.eval_sample_packing:
            use_batch_sampler_collator = True

        collator: Type[
            Union[
                V2BatchSamplerDataCollatorForSeq2Seq,
                BatchSamplerDataCollatorForSeq2Seq,
                DataCollatorForSeq2Seq,
                DataCollatorWithFlattening,
                DataCollatorForPreference,
            ]
        ]
        collator_args = [self.tokenizer]

        collator_cls_and_kwargs = None
        if self.cfg.plugins:
            plugin_manager = PluginManager.get_instance()
            collator_cls_and_kwargs = plugin_manager.get_collator_cls_and_kwargs(
                self.cfg, is_eval=is_eval
            )

        if collator_cls_and_kwargs:
            collator = collator_cls_and_kwargs[0]
            if kwargs and isinstance(kwargs, dict):
                kwargs.update(collator_cls_and_kwargs[1])
        elif self.cfg.reward_model:
            collator = DataCollatorForPreference
            tokenizer = collator_args.pop(0)
            kwargs["pad_token_id"] = tokenizer.pad_token_id
            kwargs.pop("padding")
        elif use_batch_sampler_collator:
            # Use V2BatchSamplerDataCollatorForSeq2Seq for flex attention,
            # supported multipack models, or non-flash-attention llama
            if (
                self.cfg.flex_attention
                or self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES
                or (
                    self.cfg.model_config_type in ["llama"]
                    and self.cfg.flash_attention is not True
                )
            ):
                collator = V2BatchSamplerDataCollatorForSeq2Seq
            else:
                collator = BatchSamplerDataCollatorForSeq2Seq
        else:
            if self.cfg.processor_type and self.processor:
                collator = MultiModalChatDataCollator
                kwargs["processing_strategy"] = get_processing_strategy(
                    self.processor,
                    training_args.chat_template,
                    self.cfg.chat_template,
                    image_size=training_args.image_size,
                    image_resize_algorithm=training_args.image_resize_algorithm,
                )
            elif self.cfg.batch_flattening:
                collator = DataCollatorWithFlattening
                collator_args.pop(0)
                kwargs.pop("pad_to_multiple_of", None)
                kwargs.pop("padding", None)
            else:
                collator = DataCollatorForSeq2Seq

        kwargs["return_tensors"] = "pt"

        return collator(
            *collator_args,
            **kwargs,
        )


================================================
FILE: src/axolotl/core/builders/rl.py
================================================
"""Builder for RLHF trainers"""

import inspect
from pathlib import Path

from axolotl.core.builders.base import TrainerBuilderBase
from axolotl.core.trainers import (
    AxolotlCPOTrainer,
    AxolotlKTOTrainer,
    AxolotlORPOTrainer,
)
from axolotl.core.trainers.dpo import DPOStrategy
from axolotl.core.trainers.dpo.args import AxolotlDPOConfig
from axolotl.integrations.base import PluginManager
from axolotl.loaders.utils import ensure_dtype
from axolotl.utils.callbacks.qat import QATCallback
from axolotl.utils.import_helper import get_cls_from_module_str
from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.enums import RLType

LOG = get_logger(__name__)


class HFRLTrainerBuilder(TrainerBuilderBase):
    """Trainer factory class for TRL-based RLHF trainers (e.g. DPO)"""

    def get_callbacks(self):
        callbacks = super().get_callbacks()

        if self.cfg.qat:
            callbacks.append(QATCallback(self.cfg.qat))

        return callbacks

    def get_post_trainer_create_callbacks(self, trainer):
        callbacks = super().get_post_trainer_create_callbacks(trainer=trainer)
        return callbacks

    def _get_trainer_cls(self, trainer_kwargs: dict):
        """
        Returns trainer_cls and trainer_cls_args
        """
        if self.cfg.plugins:
            plugin_manager = PluginManager.get_instance()
            trainer_cls = plugin_manager.get_trainer_cls(self.cfg)
            trainer_cls_args = []  # type: ignore

            if trainer_cls is not None:
                return trainer_cls, trainer_cls_args

        trainer_cls = None
        trainer_cls_args = [self.model]

        if self.cfg.rl in {RLType.GRPO, RLType.GDPO}:
            from axolotl.core.trainers.grpo import GRPOStrategy

            async_grpo = bool(
                self.cfg.trl
                and (
                    getattr(self.cfg.trl, "async_prefetch", False)
                    or getattr(self.cfg.trl, "use_data_producer", False)
                )
            )
            trainer_cls = GRPOStrategy.get_trainer_class(
                sequence_parallel=self.cfg.context_parallel_size > 1,
                async_grpo=async_grpo,
            )
            trainer_cls_args.extend(GRPOStrategy.set_trainer_args(self.cfg))
            trainer_kwargs.update(GRPOStrategy.set_trainer_kwargs(self.cfg))

        elif self.cfg.rl in [RLType.DPO, RLType.IPO]:
            trainer_cls = DPOStrategy.get_trainer_class()
            trainer_cls_args.append(self.model_ref)

        elif self.cfg.rl is RLType.ORPO:
            trainer_cls = AxolotlORPOTrainer
        elif self.cfg.rl is RLType.KTO:
            trainer_cls = AxolotlKTOTrainer
        elif self.cfg.rl is RLType.SIMPO:
            trainer_cls = AxolotlCPOTrainer
        else:
            raise ValueError(f"Unsupported RL: {self.cfg.rl}")

        if self.cfg.trainer_cls:
            # override the trainer cls
            try:
                trainer_cls = get_cls_from_module_str(self.cfg.trainer_cls)
                LOG.debug(f"Using custom trainer class: {self.cfg.trainer_cls}")
            except (ImportError, AttributeError, ValueError) as e:
                raise ValueError(
                    f"Failed to load custom trainer class '{self.cfg.trainer_cls}': {e}"
                ) from e

        return trainer_cls, trainer_cls_args

    def _build_training_arguments(self, total_num_steps):
        """
        Returns training_args and trainer_kwargs
        """
        from axolotl.core.training_args import (
            AxolotlCPOConfig,
            AxolotlKTOConfig,
            AxolotlORPOConfig,
        )

        training_args_kwargs, trainer_kwargs = self._set_base_training_args(
            total_num_steps=total_num_steps
        )

        if self.cfg.remove_unused_columns is not None:
            training_args_kwargs["remove_unused_columns"] = (
                self.cfg.remove_unused_columns
            )
        else:
            training_args_kwargs["remove_unused_columns"] = False

        if self.cfg.trl and self.cfg.trl.beta is not None:
            training_args_kwargs["beta"] = self.cfg.trl.beta
        elif self.cfg.rl_beta is not None:
            training_args_kwargs["beta"] = self.cfg.rl_beta
        elif self.cfg.orpo_alpha is not None:
            # trl does some odd mapping of alpha to beta to reuse the beta parameter ???
            training_args_kwargs["beta"] = self.cfg.orpo_alpha

        if self.cfg.rpo_alpha is not None:
            training_args_kwargs["rpo_alpha"] = self.cfg.rpo_alpha

        if self.cfg.use_wandb:
            training_args_kwargs["run_name"] = self.cfg.wandb_name

        training_args_cls = None
        blocklist_args_kwargs = []
        if self.cfg.rl is RLType.SIMPO:
            training_args_cls = AxolotlCPOConfig
            training_args_kwargs["loss_type"] = "simpo"
            training_args_kwargs["simpo_gamma"] = self.cfg.simpo_gamma
            if self.cfg.cpo_alpha is not None:
                training_args_kwargs["cpo_alpha"] = self.cfg.cpo_alpha

            blocklist_args_kwargs.append("max_prompt_length")

        elif self.cfg.rl is RLType.ORPO:
            training_args_cls = AxolotlORPOConfig

            blocklist_args_kwargs.append("max_prompt_length")

        elif self.cfg.rl is RLType.KTO:
            training_args_cls = AxolotlKTOConfig
            # KTOConfig in TRL >= 0.27.0 no longer accepts max_prompt_length
            blocklist_args_kwargs.append("max_prompt_length")

            training_args_kwargs["desirable_weight"] = (
                self.cfg.kto_desirable_weight or 1.0
            )
            training_args_kwargs["undesirable_weight"] = (
                self.cfg.kto_undesirable_weight or 1.0
            )

        elif self.cfg.rl in {RLType.GRPO, RLType.GDPO}:
            from axolotl.core.trainers.grpo import GRPOStrategy

            async_grpo = bool(
                self.cfg.trl
                and (
                    getattr(self.cfg.trl, "async_prefetch", False)
                    or getattr(self.cfg.trl, "use_data_producer", False)
                )
            )
            training_args_cls = GRPOStrategy.get_training_args_class(
                async_grpo=async_grpo
            )
            training_args_kwargs.update(GRPOStrategy.set_training_args_kwargs(self.cfg))
            blocklist_args_kwargs = GRPOStrategy.get_blocklist_args_kwargs()
            if self.cfg.rl is RLType.GDPO:
                training_args_kwargs.setdefault(
                    "multi_objective_aggregation", "normalize_then_sum"
                )

        elif self.cfg.rl in [RLType.DPO, RLType.IPO]:
            training_args_cls = AxolotlDPOConfig
            training_args_kwargs.update(DPOStrategy.set_training_args_kwargs(self.cfg))
        else:
            raise ValueError(f"Unsupported RL: {self.cfg.rl}")

        for blocklist_key in blocklist_args_kwargs:
            if blocklist_key in training_args_kwargs:
                del training_args_kwargs[blocklist_key]

        if self.cfg.plugins:
            plugin_manager = PluginManager.get_instance()
            plugin_training_args = plugin_manager.get_training_args(self.cfg)
            if plugin_training_args:
                training_args_kwargs.update(plugin_training_args)

        training_args = training_args_cls(
            logging_first_step=True,
            **training_args_kwargs,
        )

        # unset run_name so wandb sets up experiment names
        if self.cfg.use_wandb and training_args.run_name == training_args.output_dir:
            training_args.run_name = None

        return training_args, trainer_kwargs

    def build(self, total_num_steps):
        training_args, trainer_kwargs = self._build_training_arguments(total_num_steps)

        if self.eval_dataset:
            trainer_kwargs["eval_dataset"] = self.eval_dataset
        if self.cfg.adapter and self.peft_config and self.cfg.rl is not RLType.GRPO:
            trainer_kwargs["peft_config"] = self.peft_config
        if self.cfg.precompute_ref_log_probs is not None:
            trainer_kwargs["precompute_ref_log_probs"] = (
                self.cfg.precompute_ref_log_probs
            )

        trainer_cls, trainer_cls_args = self._get_trainer_cls(trainer_kwargs)

        sig = inspect.signature(trainer_cls)
        if "tokenizer" in sig.parameters:
            trainer_kwargs["tokenizer"] = self.tokenizer
        else:
            trainer_kwargs["processing_class"] = self.tokenizer

        if self.cfg.datasets is not None and (
            trainer_cls is DPOStrategy.get_trainer_class()
        ):
            trainer_kwargs["dataset_tags"] = [
                d["path"] for d in self.cfg.datasets if not Path(d["path"]).is_dir()
            ]

        trainer_kwargs, trainer_cls = self.hook_pre_create_trainer(
            trainer_kwargs, trainer_cls
        )

        # Allow FP8-quantized models to be fine-tuned with LoRA adapters.
        # transformers' validate_quantization_for_training blocks FP8 because
        # hf_quantizer.is_trainable is False, but LoRA only trains the adapters
        # (base weights stay frozen in FP8).
        _orig_validate_quant = None
        if (
            self.cfg.adapter
            and hasattr(self.model, "is_quantized")
            and self.model.is_quantized
        ):
            import transformers.trainer as _trainer_module

            _orig_validate_quant = _trainer_module.validate_quantization_for_training
            _trainer_module.validate_quantization_for_training = lambda model: None

        try:
            trainer = trainer_cls(
                *trainer_cls_args,
                args=training_args,
                train_dataset=self.train_dataset,
                callbacks=self.get_callbacks(),
                **trainer_kwargs,
            )
        finally:
            if _orig_validate_quant is not None:
                import transformers.trainer as _trainer_module

                _trainer_module.validate_quantization_for_training = (
                    _orig_validate_quant
                )
        if self.cfg.fsdp_config or self.cfg.fsdp:
            ensure_dtype(trainer.model, dtype=self.cfg.torch_dtype)
            if self.cfg.rl in [RLType.DPO, RLType.IPO] and trainer.ref_model:
                ensure_dtype(trainer.ref_model, dtype=self.cfg.torch_dtype)

        trainer = self.hook_post_create_trainer(trainer)
        for callback in self.get_post_trainer_create_callbacks(trainer):
            trainer.add_callback(callback)

        return trainer


================================================
FILE: src/axolotl/core/chat/__init__.py
================================================


================================================
FILE: src/axolotl/core/chat/format/__init__.py
================================================


================================================
FILE: src/axolotl/core/chat/format/chatml.py
================================================
"""
ChatML transformation functions for MessageContents
"""

from typing import Optional

from ..messages import MessageContents, Messages
from .shared import wrap_tools


def format_message(
    message: Messages,
    message_index: Optional[int] = None,
) -> Messages:
    if message.is_chat_formatted:
        return message

    # prepend the role prefix within a MessageContents to message.content
    message.content.insert(
        0,
        MessageContents(
            type="text",
            value=f"<|im_start|>{message.role}\n",
            weight=0,
        ),
    )
    message.content.append(
        MessageContents(type="text", value="<|im_end|>", weight=message.weight)
    )
    message.content.append(MessageContents(type="text", value="\n", weight=0))

    message = wrap_tools(message)

    message.is_chat_formatted = True
    return message


================================================
FILE: src/axolotl/core/chat/format/llama3x.py
================================================
"""
Llama 3.x chat formatting functions for MessageContents
"""

from typing import Optional

from ..messages import MessageContents, Messages
from .shared import wrap_tools


def format_message(message: Messages, message_index: Optional[int] = None) -> Messages:
    if message.is_chat_formatted:
        return message

    message_role = message.role
    if message.role == "tool":
        message_role = "ipython"

    # prepend the role prefix within a MessageContents to message.content
    message.content.insert(
        0,
        MessageContents(
            type="text",
            value=f"<|start_header_id|>{message_role}<|end_header_id|>\n\n",
            weight=0,
        ),
    )

    message.content.append(
        MessageContents(type="text", value="<|eot_id|>", weight=message.weight)
    )

    message = wrap_tools(message)

    if message_index == 0:
        message.content.insert(
            0,
            MessageContents(
                type="text",
                value="<|begin_of_text|>",
                weight=0,
            ),
        )

    message.is_chat_formatted = True
    return message


================================================
FILE: src/axolotl/core/chat/format/shared.py
================================================
"""
shared functions for format transforms
"""

from axolotl.core.chat.messages import MessageContents, Messages


def wrap_tools(message: Messages):
    # loop over message.content by index to find tool calls, we need to wrap each with tags,
    # so be wary of indexing issues when changing the list while iterating.
    # iterate over the range in reverse order to avoid index shifting
    for i in range(len(message.content) - 1, -1, -1):
        if message.content[i].type == "tool_call":
            # append a </tool_call> MessageContents text tag after
            message.content.insert(
                i + 1,
                MessageContents(
                    type="text", value="</tool_call>\n", weight=message.weight
                ),
            )
            # make sure the actual tool call content ends with a newline
            message.content[i].has_newline = True
            # prepend a <tool_call> MessageContents text tag before
            message.content.insert(
                i,
                MessageContents(
                    type="text", value="<tool_call>\n", weight=message.weight
                ),
            )
        elif message.content[i].type == "tool_response":
            # append a </tool_call> MessageContents text tag after
            message.content.insert(
                i + 1,
                MessageContents(
                    type="text", value="</tool_response>\n", weight=message.weight
                ),
            )
            # make sure the actual tool response content ends with a newline
            message.content[i].has_newline = True
            # prepend a <tool_call> MessageContents text tag before
            message.content.insert(
                i,
                MessageContents(
                    type="text", value="<tool_response>\n", weight=message.weight
                ),
            )

    return message


================================================
FILE: src/axolotl/core/chat/messages.py
================================================
"""
internal message representations of chat messages
"""

import json
from enum import Enum
from typing import Any, Callable, List, Optional, Union

from pydantic import BaseModel
from transformers import PreTrainedTokenizer


class MessageRoles(str, Enum):
    """
    Message roles for the system, user, assistant, and tools
    """

    system = "system"
    user = "user"
    assistant = "assistant"
    tool = "tool"
    ipython = (
        # for responses from builtin tools
        "ipython"
    )


class MessageContentTypes(str, Enum):
    """
    Message content types for text, image, audio, tool calls, and tool responses
    """

    special_token = "special_token"  # nosec B105
    text = "text"
    image = "image"
    audio = "audio"
    tool_call = "tool_call"
    tool_response = "tool_response"


class SpecialToken(str, Enum):
    """
    Special tokens for beginning of string and end of string
    """

    bos_token = "bos_token"  # nosec B105
    eos_token = "eos_token"  # nosec B105


class ToolCallFunction(BaseModel):
    """
    Tool call function with name and arguments
    """

    name: str
    arguments: dict[str, str]


class Tool(BaseModel):
    """
    Tool with description, function, and parameters
    """

    description: str
    function: ToolCallFunction
    parameters: dict[str, str]  # .properties


class ToolCallContents(BaseModel):
    """
    Tool call contents with name, arguments, and optional id
    """

    name: str
    arguments: dict[str, Union[str, int]]
    id: Optional[str] = None

    def __str__(self) -> str:
        data = {"name": self.name, "arguments": self.arguments}
        if self.id is not None:
            data["id"] = self.id
        return json.dumps(data)


class ToolResponseContents(BaseModel):
    """
    Tool response contents with name, content, and optional id
    """

    name: str
    content: Union[str, dict[str, Union[str, int, float]]]
    id: Optional[str] = None

    def __str__(self) -> str:
        data = {"name": self.name, "content": self.content}
        if self.id is not None:
            data["id"] = self.id
        return json.dumps(data)


class MessageContents(BaseModel):
    """
    Message contents with type, value, metadata, weight, newline, and end of contents
    """

    type: Union[str, MessageContentTypes]
    value: Union[str, ToolCallContents, ToolResponseContents, SpecialToken]
    meta: Optional[dict[str, Any]] = None  # support additional arbitrary metadata
    weight: Optional[Union[int, float]] = None
    has_newline: bool = False
    eoc: bool = False  # end of contents

    def __str__(self) -> str:
        str_val = str(self.value)
        if self.has_newline and not str_val.endswith("\n"):
            str_val += "\n"
        return str_val


class Messages(BaseModel):
    """
    Messages with role, content, metadata, weight, and chat formatting
    """

    role: Union[MessageRoles, str]  # allows for arbitrary roles
    content: List["MessageContents"]
    meta: Optional[dict[str, Any]] = None  # support additional arbitrary metadata
    weight: Optional[Union[int, float]] = None
    is_chat_formatted: bool = False

    def __str__(self) -> str:
        return "".join(str(c) for c in self.content)

    def tokenized(
        self, tokenizer: PreTrainedTokenizer, ignore_index=-100
    ) -> dict[str, List[int]]:
        # iterate over the contents, tokenizing the concatenated string values up to the current MessageContents
        # returns a dictionary mapping w input_ids, attention_mask, and labels
        input_ids: List[int] = []
        labels: List[int] = []
        pending_input_ids: List[int] = []
        pending_weight = self.weight
        running_content = ""
        for _, msg_content in enumerate(self.content):
            # TODO also handle non-text content types
            if msg_content.type in [
                MessageContentTypes.text.value,
                MessageContentTypes.tool_call.value,
                MessageContentTypes.tool_response.value,
            ]:
                running_content += str(msg_content)
                tok_results = tokenizer(running_content, add_special_tokens=False)
                tok_input_ids = tok_results["input_ids"]
                if pending_input_ids:
                    new_pending_inputs = tok_input_ids[
                        len(input_ids) : len(input_ids) + len(pending_input_ids)
                    ]
                    if new_pending_inputs != pending_input_ids:
                        pending_input_ids = new_pending_inputs
                    input_ids.extend(pending_input_ids)
                    if pending_weight:
                        labels.extend(pending_input_ids)
                    else:
                        labels.extend([ignore_index] * len(pending_input_ids))
                pending_input_ids = tok_results["input_ids"][len(input_ids) :]
                pending_weight = self.weight and msg_content.weight not in [0, 0.0]
        input_ids.extend(pending_input_ids)
        if pending_weight:
            labels.extend(pending_input_ids)
        else:
            labels.extend([ignore_index] * len(pending_input_ids))
        attention_mask = [1] * len(input_ids)
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }


class Chats(BaseModel):
    """
    top level data structure for chat conversations
    """

    conversation: List[Messages]

    def __str__(self) -> str:
        return "".join(str(c) for c in self.conversation)

    def tokenized(
        self, tokenizer: Callable[[str], dict[str, List[int]]], ignore_index=-100
    ) -> dict[str, List[int]]:
        input_ids = []
        attention_mask = []
        labels = []
        for msg in self.conversation:
            msg_results = msg.tokenized(tokenizer, ignore_index)
            input_ids.extend(msg_results["input_ids"])
            attention_mask.extend(msg_results["attention_mask"])
            labels.extend(msg_results["labels"])
        return {
            "input_ids": input_ids,
            "attention_mask": attention_mask,
            "labels": labels,
        }


class ChatFormattedChats(Chats):
    """
    Chat formatted chats with formatter and optional train on inputs
    """

    formatter: Callable  # [[Union[dict, Chats]], Chats]
    train_on_inputs: bool = False

    def model_post_init(self, __context):
        for i, msg in enumerate(self.conversation):
            self.conversation[i] = self.formatter(msg, message_index=i)
            if self.train_on_inputs:
                self.conversation[i].weight = 1


class PreferenceChats(BaseModel):
    """
    representation for preference data for chat
    """

    prompt: List[Messages]
    chosen: Messages
    rejected: Messages


================================================
FILE: src/axolotl/core/datasets/__init__.py
================================================


================================================
FILE: src/axolotl/core/datasets/chat.py
================================================
"""
chat dataset module
"""

from typing import Callable, Optional, Union

from datasets import Dataset
from transformers import PreTrainedTokenizer

from axolotl.core.chat.messages import ChatFormattedChats


class TokenizedChatDataset(Dataset):
    """
    Tokenized chat dataset
    """

    def __init__(
        self,
        data: Dataset,
        model_transform: Union[PreTrainedTokenizer, Callable],
        *args,
        message_transform: Optional[Callable] = None,
        formatter=None,
        process_count: Optional[int] = None,
        keep_in_memory: Optional[bool] = False,
        **kwargs,
    ):
        def map_fn(ex):
            if message_transform is not None:
                ex = message_transform(ex)
            if formatter is not None:
                ex = ChatFormattedChats(
                    formatter=formatter,
                    **ex,
                )
            else:
                ex = ChatFormattedChats(
                    **ex,
                )
            return ex.tokenized(model_transform)

        features = data.features.keys()
        tokenized_data = data.map(
            map_fn,
            num_proc=process_count,
            keep_in_memory=keep_in_memory,
            remove_columns=features,
            desc="Tokenizing Chats",
        )
        super().__init__(tokenized_data.data, *args, **kwargs)


================================================
FILE: src/axolotl/core/datasets/transforms/__init__.py
================================================


================================================
FILE: src/axolotl/core/datasets/transforms/chat_builder.py
================================================
"""
This module contains a function that builds a transform that takes a row from the
dataset and converts it to a Chat.
"""

from typing import Any, Mapping


def chat_message_transform_builder(
    train_on_inputs=False,
    conversations_field: str = "messages",
    message_field_role: str | list[str] | None = None,  # commonly "role"
    message_field_content: str | list[str] | None = None,  # commonly "content"
    message_field_training: str | list[str] | None = None,  # commonly "weight"
):
    """Builds a transform that takes a row from the dataset and converts it to a Chat

    Args:
        train_on_inputs (bool, optional):
            If True, the transform will train on the inputs. If False, the transform will train on the targets.
            Defaults to False.
        conversations_field (str, optional):
            The field name of the conversations. Defaults to "messages".
        message_field_role (str | list[str], optional):
            The field name of the role.
        message_field_content (str | list[str], optional):
            The field name of the message content.
        message_field_training (str | list[str], optional):
            The field name of the train/weight.

    Returns:
        Callable:
            A function that takes a list of conversations and returns a list of messages.
    """

    if message_field_training is None:
        message_field_training = ["train", "weight"]
    if message_field_content is None:
        message_field_content = ["value", "text", "content"]
    if message_field_role is None:
        message_field_role = ["role", "from"]
    message_field_role = (
        [message_field_role]
        if isinstance(message_field_role, str)
        else message_field_role
    )
    message_field_content = (
        [message_field_content]
        if isinstance(message_field_content, str)
        else message_field_content
    )
    message_weight_fields = (
        [message_field_training]
        if isinstance(message_field_training, str)
        else message_field_training
    )

    role_value_mappings = {
        "system": "system",
        "user": "user",
        "human": "user",
        "assistant": "assistant",
        "gpt": "assistant",
        "tool": "tool",
        "ipython": "ipython",
    }
    if train_on_inputs:
        role_default_weights_mappings = {
            "system": 1,
            "user": 1,
            "assistant": 1,
            "tool": 1,
            "ipython": 1,
        }
    else:
        role_default_weights_mappings = {
            "system": 0,
            "user": 0,
            "assistant": 1,
            "tool": 0,
            "ipython": 0,
        }

    def transform_builder(sample: Mapping[str, Any]):
        if conversations_field not in sample:
            raise ValueError(f"Field '{conversations_field}' not found in sample.")
        # if none of the role fields are in the message, raise an error
        if not any(
            role in sample[conversations_field][0] for role in message_field_role
        ):
            raise ValueError("No role field found in message.")
        role_field = next(
            role
            for role in message_field_role
            if role in sample[conversations_field][0]
        )
        if not any(
            field in sample[conversations_field][0] for field in message_field_content
        ):
            raise ValueError("No message_content field found in message.")
        message_content_field = next(
            field
            for field in message_field_content
            if field in sample[conversations_field][0]
        )
        if not any(
            field in sample[conversations_field][0] for field in message_field_training
        ):
            message_weight_field = None
        else:
            message_weight_field = next(
                field
                for field in message_weight_fields
                if field in sample[conversations_field][0]
            )

        messages = []
        for message in sample[conversations_field]:
            role = role_value_mappings[message[role_field]]
            weight = (
                int(message[message_weight_field])
                if message_weight_field
                else role_default_weights_mappings[role]
            )

            # TODO if "tool_calls" in message[message_content_field]: then convert tool call to ToolCallContents
            if isinstance(message[message_content_field], str):
                messages.append(
                    {
                        "role": role,
                        "content": [
                            {
                                "type": "text",
                                "value": message[message_content_field],
                            }
                        ],
                        "weight": weight,
                    }
                )
            else:
                messages.append(
                    {
                        "role": role,
                        "content": message[message_content_field],
                        "weight": weight,
                    }
                )

        return {"conversation": messages}

    return transform_builder


================================================
FILE: src/axolotl/core/trainers/__init__.py
================================================
"""Init for axolotl.core.trainers"""

# flake8: noqa

from .base import AxolotlTrainer
from .dpo.trainer import AxolotlDPOTrainer
from .mamba import AxolotlMambaTrainer
from .trl import (
    AxolotlCPOTrainer,
    AxolotlKTOTrainer,
    AxolotlORPOTrainer,
    AxolotlPRMTrainer,
    AxolotlRewardTrainer,
)


================================================
FILE: src/axolotl/core/trainers/base.py
================================================
"""Module for customized trainers"""

from __future__ import annotations

import json
import math
import os
from collections import defaultdict
from functools import partial, wraps
from typing import Any, Callable, Literal, Optional

import datasets
import safetensors
import torch
from accelerate.state import AcceleratorState
from datasets import Dataset
from peft import PeftModel
from torch.utils.data import (
    BatchSampler,
    DataLoader,
    RandomSampler,
    Sampler,
    SequentialSampler,
)
from transformers import PreTrainedModel, Trainer
from transformers.trainer import TRAINING_ARGS_NAME
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR, has_length, seed_worker
from transformers.utils import SAFE_WEIGHTS_NAME, is_peft_available
from trl.experimental.utils import pad_to_length
from typing_extensions import override

from axolotl.core.trainers.mixins import (
    ActivationOffloadingMixin,
    CheckpointSaveMixin,
    DistributedParallelMixin,
    OptimizerMixin,
    PackingMixin,
    RngLoaderMixin,
    SchedulerMixin,
)
from axolotl.core.trainers.utils import (
    sanitize_kwargs_for_ds_tagging,
    sanitize_kwargs_for_tagging,
)
from axolotl.utils import get_not_null
from axolotl.utils.bench import get_gpu_memory_usage
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import is_distributed, is_main_process
from axolotl.utils.logging import get_logger
from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths

LOG = get_logger(__name__)

TOKENS_STATE_FILE = "tokens_state."

REDUCTION_FNS = {
    "mean": torch.mean,
    "min": torch.min,
    "max": torch.max,
    "sum": torch.sum,
}


class AxolotlTrainer(
    PackingMixin,
    SchedulerMixin,
    OptimizerMixin,
    RngLoaderMixin,
    CheckpointSaveMixin,
    ActivationOffloadingMixin,
    DistributedParallelMixin,
    Trainer,
):
    """Extend the base Trainer for axolotl helpers"""

    args = None  # type: "AxolotlTrainingArguments"  # type: ignore[name-defined]
    tag_names = ["axolotl"]
    _axolotl_cfg: DictDefault | None = None

    @property
    def axolotl_cfg(self):
        return self._axolotl_cfg

    @axolotl_cfg.setter
    def axolotl_cfg(self, cfg):
        self._axolotl_cfg = cfg

    def __init__(
        self,
        *_args,
        bench_data_collator=None,
        eval_data_collator=None,
        dataset_tags=None,
        **kwargs,
    ):
        self.bench_data_collator = bench_data_collator
        self.eval_data_collator = eval_data_collator
        self.dataset_tags = dataset_tags
        self._signature_columns = None  # workaround for pylint

        super().__init__(*_args, **kwargs)
        self.train_data_collator = self.data_collator
        self._stored_metrics = defaultdict(
            lambda: defaultdict(lambda: {"values": [], "reduction": "mean"})
        )
        if self.args.orpo_alpha:
            self.loss_fct = torch.nn.CrossEntropyLoss(reduction="none")

    def _create_multipack_sampler(
        self, base_sampler: Sampler, dataset: Dataset
    ) -> MultipackBatchSampler:
        """
        Helper method to create a `MultipackBatchSampler` for multipacking sequences
        for training.

        Args:
            base_sampler: Sampler to wrap with `MultipackBatchSampler`.
            dataset: Dataset to sample from.

        Returns:
            Multipack (sample packing) batch sampler.
        """
        if self.args.multipack_real_batches:
            batch_size = self.args.per_device_train_batch_size
            batch_max_len = self.args.max_seq_length
        else:
            batch_size = 1
            train_batch_size = (
                self.state.train_batch_size or self.args.per_device_train_batch_size
            )
            batch_max_len = train_batch_size * self.args.max_seq_length

        sampler = MultipackBatchSampler(
            base_sampler,
            lengths=get_dataset_lengths(dataset),
            packing_efficiency_estimate=self.args.sample_packing_efficiency,
            batch_max_len=batch_max_len,
            batch_size=batch_size,
            group_size=self.args.sample_packing_group_size,
            bin_size=self.args.sample_packing_bin_size,
            sequential=self.args.sample_packing_sequentially,
            drop_last=True,
            num_processes=self.args.dataset_num_proc,
            mp_start_method=self.args.sample_packing_mp_start_method or "fork",
        )

        len(sampler)
        return sampler

    def _get_train_sampler(
        self, train_dataset: Dataset | None = None
    ) -> Sampler | None:
        """
        Helper method to get the sampler for training. Handles cases for sample packing
        and curriculum sampling (sequential).

        Returns:
            If the dataset is non-empty, a sampler is returned, the type of which
                depends on the passed training args.
        """
        # from https://github.com/huggingface/transformers/blob/2166b6b4ff09f6dd3867ab982f262f66482aa968/src/transformers/trainer.py#L969C1-L972C24
        if train_dataset is None:
            train_dataset = self.train_dataset
        if train_dataset is None or not has_length(train_dataset):
            return None

        use_sample_packing = self.args.sample_packing and not self.args.pretraining

        # Determine the base sampler first
        if self.args.curriculum_sampling:
            base_sampler = SequentialSampler(train_dataset)
        elif use_sample_packing:
            base_sampler = RandomSampler(train_dataset)
        else:
            # Default to parent class implementation for standard random sampling
            return super()._get_train_sampler(train_dataset)

        # Apply multipack wrapper if needed
        if use_sample_packing:
            return self._create_multipack_sampler(
                base_sampler=base_sampler,
                dataset=train_dataset,
            )

        return base_sampler

    def _get_eval_sampler(self, eval_dataset: Dataset | None = None) -> Sampler | None:
        """
        Helper method to get the sampler for evaluation. Handles sample packing case.

        Returns:
            If the dataset is non-empty, a sampler is returned, the type of which
                depends on the passed training args.
        """
        # from https://github.com/huggingface/transformers/blob/2166b6b4ff09f6dd3867ab982f262f66482aa968/src/transformers/trainer.py#L1065C9-L1066C24
        if eval_dataset is None or not has_length(eval_dataset):
            return None

        # Multipacking enabled if training is enabled and eval is not explicitly disabled
        use_multipack = (
            self.args.sample_packing and self.args.eval_sample_packing is not False
        )

        # Determine the base sampler
        if use_multipack:
            base_sampler = SequentialSampler(eval_dataset)
        else:
            return super()._get_eval_sampler(eval_dataset)

        # Apply multipack wrapper if needed
        if use_multipack:
            return self._create_multipack_sampler(
                base_sampler=base_sampler,
                dataset=eval_dataset,
            )

        return base_sampler

    def _get_dataloader(
        self,
        dataset: Dataset,
        description: str,
        batch_size: int,
        sampler_fn: Optional[Callable[[Dataset], torch.utils.data.Sampler]] = None,
        is_training: bool = False,
        dataloader_key: Optional[str] = None,
    ) -> DataLoader:
        """Create a [`~torch.utils.data.DataLoader`] from the given dataset."""

        data_collator = self.data_collator if is_training else self.eval_data_collator

        if isinstance(dataset, datasets.Dataset):
            if is_training:
                if not self.args.sample_packing or self.args.pretraining:
                    dataset = self._remove_unused_columns(
                        dataset, description="training"
                    )
            elif (
                not is_training
                and self.args.sample_packing
                and self.args.eval_sample_packing is not False
            ):
                batch_size = (
                    batch_size
                    if self.args.sample_packing
                    else self.args.per_device_eval_batch_size
                )
            else:
                dataset = self._remove_unused_columns(dataset, description=description)
        else:
            data_collator = self._get_collator_with_removed_columns(
                self.data_collator, description=description
            )

        dataloader_params = {
            "batch_size": batch_size,
            "collate_fn": data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
            "persistent_workers": self.args.dataloader_persistent_workers,
        }

        if not isinstance(dataset, torch.utils.data.IterableDataset):
            dataloader_params["drop_last"] = get_not_null(
                self.args.dataloader_drop_last, True
            )
            if sampler_fn is not None:
                sampler = sampler_fn(dataset)
                if isinstance(sampler, BatchSampler):
                    # batch_size and batch_sampler are mutually exclusive
                    dataloader_params["batch_sampler"] = sampler
                    del dataloader_params["batch_size"]
                    del dataloader_params["drop_last"]
                else:
                    dataloader_params["sampler"] = sampler

            dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor
            if is_training:
                dataloader_params["worker_init_fn"] = partial(
                    seed_worker,
                    num_workers=self.args.dataloader_num_workers,
                    rank=self.args.process_index,
                )
        if self.args.sample_packing and (
            (is_training and not self.args.pretraining)
            or (not is_training and self.args.eval_sample_packing is not False)
        ):
            self.accelerator.even_batches = False

        if dataset.column_names and "length" in dataset.column_names:
            dataset = dataset.remove_columns(["length"])

        if (
            dataset.column_names
            and "position_ids" in dataset.column_names
            and "attention_mask" in dataset.column_names
            and self.args.sample_packing
            and self.args.sample_packing_drop_attention_mask
        ):
            dataset = dataset.remove_columns(["attention_mask"])

        dataloader = DataLoader(dataset, **dataloader_params)

        # Accelerator.free_memory() will destroy the references, so
        # we need to store the non-prepared version for eval dataloaders.
        # fmt: off
        if dataloader_key is not None and self.args.dataloader_persistent_workers:
            if hasattr(self, "_eval_dataloaders"):
                self._eval_dataloaders[dataloader_key] = dataloader  # type: ignore
            else:
                self._eval_dataloaders = {dataloader_key: dataloader}
        # fmt: on

        return self.accelerator.prepare(dataloader)

    def _get_bench_sampler(
        self, bench_dataset: Dataset
    ) -> torch.utils.data.Sampler | None:
        if self.args.world_size <= 1:
            return SequentialSampler(bench_dataset)
        return None

    def get_bench_dataloader(
        self,
        bench_dataset: Dataset,
    ) -> DataLoader:
        dataloader_params = {
            "batch_size": self.args.eval_batch_size,
            "collate_fn": self.bench_data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
        }
        if self.args.dataloader_prefetch_factor:
            dataloader_params["prefetch_factor"] = self.args.dataloader_prefetch_factor

        if not isinstance(bench_dataset, torch.utils.data.IterableDataset):
            dataloader_params["sampler"] = self._get_bench_sampler(bench_dataset)
            dataloader_params["drop_last"] = self.args.dataloader_drop_last

        return DataLoader(bench_dataset, **dataloader_params)
        # return self.accelerator.prepare(DataLoader(bench_dataset, **dataloader_params))

    @override
    def compute_loss(
        self, model, inputs, return_outputs=False, num_items_in_batch=None
    ):
        # use one's weighted cross entropy loss calc
        # if self.args.sample_packing:
        #     labels = inputs.pop("labels")
        #     outputs = model(**inputs)
        #     loss = trainer_weighted_loss(outputs, labels, shift_labels=True)
        #     return (loss, outputs) if return_outputs else loss

        # track number of tokens for tokens per second calculation
        if self.args.include_tkps and model.training:
            inputs_key = "labels" if "labels" in inputs else "input_ids"
            trainable_tokens = (inputs[inputs_key] != -100).sum()
            total_tokens = inputs[inputs_key].numel()
            total_tokens = torch.tensor(total_tokens, device=inputs[inputs_key].device)

            if is_distributed():
                torch.distributed.all_reduce(
                    trainable_tokens, op=torch.distributed.ReduceOp.SUM
                )
                torch.distributed.all_reduce(
                    total_tokens, op=torch.distributed.ReduceOp.SUM
                )

            if not hasattr(self.state, "tokens"):
                self.state.tokens = {
                    "trainable": torch.zeros(1),
                    "total": torch.zeros(1),
                }

            # trainable tokens for throughput and total token slots for summaries
            self.state.tokens["trainable"] = (
                self.state.tokens["trainable"] + trainable_tokens.detach().cpu()
            )
            self.state.tokens["total"] = self.state.tokens["total"] + total_tokens.cpu()
            # Store per-step trainable tokens for throughput calculation
            self.state.tokens["trainable_tokens"] = trainable_tokens.detach().cpu()

        if self.args.orpo_alpha:
            return self.orpo_compute_loss(
                model,
                inputs,
                return_outputs=return_outputs,
                num_items_in_batch=num_items_in_batch,
            )

        return super().compute_loss(
            model,
            inputs,
            return_outputs=return_outputs,
            num_items_in_batch=num_items_in_batch,
        )

    @override
    def evaluate(self, *args, **kwargs):
        LOG.info("Running evaluation step...")
        return super().evaluate(*args, **kwargs)

    @staticmethod
    def orpo_concatenate_inputs(inputs, label_pad_token=-100, pad_token=0, device=None):
        concatenated_batch = {}

        max_length = max(
            inputs["input_ids"].shape[1], inputs["rejected_input_ids"].shape[1]
        )
        # Concatenate positive and negative inputs
        concatenated_batch["input_ids"] = pad_to_length(
            inputs["input_ids"], max_length, pad_token
        )
        concatenated_batch["rejected_input_ids"] = pad_to_length(
            inputs["rejected_input_ids"], max_length, pad_token
        )
        concatenated_batch["labels"] = pad_to_length(
            inputs["labels"], max_length, label_pad_token
        )
        concatenated_batch["rejected_labels"] = pad_to_length(
            inputs["rejected_labels"], max_length, label_pad_token
        )
        concatenated_batch["attention_mask"] = pad_to_length(
            inputs["attention_mask"], max_length, 0
        )
        concatenated_batch["rejected_attention_mask"] = pad_to_length(
            inputs["rejected_attention_mask"], max_length, 0
        )
        concatenated_batch["prompt_attention_mask"] = pad_to_length(
            inputs["prompt_attention_mask"], max_length, 0
        ).to(device=device)

        input_ids = torch.cat(
            [concatenated_batch["input_ids"], concatenated_batch["rejected_input_ids"]],
            dim=0,
        ).to(device=device)
        attention_mask = torch.cat(
            [
                concatenated_batch["attention_mask"],
                concatenated_batch["rejected_attention_mask"],
            ],
            dim=0,
        ).to(device=device)
        labels = torch.cat(
            [concatenated_batch["labels"], concatenated_batch["rejected_labels"]], dim=0
        ).to(device=device)

        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": attention_mask,
            "prompt_attention_mask": concatenated_batch["prompt_attention_mask"],
        }

    def orpo_compute_custom_loss(self, logits, labels):
        logits = logits.contiguous()
        loss = 0.0

        if labels is not None:
            # move labels to correct device to enable model parallelism
            labels = labels.to(logits.device)
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()

            # Flatten the tokens
            loss = self.loss_fct(shift_logits.transpose(2, 1), shift_labels).mean(
                dim=-1
            )

        return loss

    def orpo_compute_logps(
        self, prompt_attention_mask, chosen_inputs, chosen_attention_mask, logits
    ):
        # Get the shape of chosen_attention_mask[:, :-1]
        chosen_shape = chosen_attention_mask[:, :-1].shape

        # Calculate the padding size
        pad_length = chosen_shape[1] - (prompt_attention_mask.shape[1] - 1)

        # Pad prompt_attention_mask with zeros to match the desired shape
        prompt_attention_mask_padded = torch.nn.functional.pad(
            prompt_attention_mask[:, 1:], (0, pad_length), mode="constant", value=0
        )

        # Perform the subtraction operation
        mask = chosen_attention_mask[:, :-1] > prompt_attention_mask_padded

        per_token_logps = torch.gather(
            logits[:, :-1, :].log_softmax(-1),
            dim=2,
            index=(mask * chosen_inputs[:, 1:]).unsqueeze(2),
        ).squeeze(2)
        return torch.mul(per_token_logps, mask).sum(dim=1) / mask.sum(dim=1)

    def orpo_compute_loss(
        self,
        model,
        inputs,
        return_outputs=False,
        num_items_in_batch=None,
    ):
        concat_inputs = AxolotlTrainer.orpo_concatenate_inputs(
            inputs,
            label_pad_token=-100,
            pad_token=self.tokenizer.pad_token_id,
            device=self.accelerator.device,
        )

        # Perform a single forward pass
        outputs = model(
            **{
                "input_ids": concat_inputs["input_ids"],
                "attention_mask": concat_inputs["attention_mask"],
                "labels": concat_inputs["labels"],
            },
            output_hidden_states=True,
        )

        # Split the outputs for positive and negative examples
        outputs_pos, outputs_neg = outputs.logits.chunk(2)

        # Calculate NLL loss
        pos_loss = self.orpo_compute_custom_loss(
            logits=outputs_pos, labels=concat_inputs["input_ids"].chunk(2)[0]
        )

        # Calculate Log Probability
        pos_prob = self.orpo_compute_logps(
            prompt_attention_mask=concat_inputs["prompt_attention_mask"],
            chosen_inputs=concat_inputs["input_ids"].chunk(2)[0],
            chosen_attention_mask=concat_inputs["attention_mask"].chunk(2)[0],
            logits=outputs_pos,
        )
        neg_prob = self.orpo_compute_logps(
            prompt_attention_mask=concat_inputs["prompt_attention_mask"],
            chosen_inputs=concat_inputs["input_ids"].chunk(2)[1],
            chosen_attention_mask=concat_inputs["attention_mask"].chunk(2)[1],
            logits=outputs_neg,
        )

        # Calculate log odds
        log_odds = (pos_prob - neg_prob) - (
            torch.log(1 - torch.exp(pos_prob)) - torch.log(1 - torch.exp(neg_prob))
        )
        sig_ratio = torch.nn.functional.sigmoid(log_odds)
        ratio = torch.log(sig_ratio)

        # Calculate the Final Loss
        loss = torch.mean(pos_loss - self.args.orpo_alpha * ratio).to(
            dtype=torch.bfloat16
        )

        metrics = {}
        metrics["chosen_geometric_mean"] = torch.mean(pos_prob).cpu().item()
        metrics["rejected_geometric_mean"] = torch.mean(neg_prob).cpu().item()
        metrics["log_odds_ratio"] = torch.mean(ratio).cpu().item()
        metrics["log_odds"] = torch.mean(log_odds).cpu().item()
        self.store_metrics(metrics, train_eval="train")

        return (loss, outputs_pos) if return_outputs else loss

    @wraps(Trainer.push_to_hub)
    def push_to_hub(self, *args, **kwargs) -> str:
        """
        Overwrite the `push_to_hub` method in order to force-add the tags when pushing the
        model on the Hub. Please refer to `~transformers.Trainer.push_to_hub` for more details.
        """
        kwargs = sanitize_kwargs_for_ds_tagging(
            dataset_tags=self.dataset_tags, kwargs=kwargs
        )
        kwargs = sanitize_kwargs_for_tagging(tag_names=self.tag_names, kwargs=kwargs)

        return super().push_to_hub(*args, **kwargs)

    @wraps(Trainer.create_accelerator_and_postprocess)
    def create_accelerator_and_postprocess(self):
        # cleanup the PartialState states so Accelerate automatically configures everything from the env vars
        accelerator_config = self.args.accelerator_config.to_dict()
        use_configured_state = accelerator_config.get("use_configured_state", False)
        if not use_configured_state:
            AcceleratorState._reset_state(reset_partial_state=True)

        super().create_accelerator_and_postprocess()

    def additional_accelerator_args(
        self, fp8: bool = False, enable_fsdp_float8_all_gather: bool = False, **kwargs
    ) -> dict[str, Any]:
        ret_kwargs = {}
        if fp8:
            from accelerate.utils import AORecipeKwargs
            from torchao.float8 import Float8LinearConfig

            # By default, Float8LinearConfig is instantiated using the "tensorwise"
            # scaling strategy. See more details here:
            # https://github.com/pytorch/ao/tree/main/torchao/float8.
            config = Float8LinearConfig(
                enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather,
                force_recompute_fp8_weight_in_bwd=enable_fsdp_float8_all_gather is True,
            )

            ret_kwargs["mixed_precision"] = "fp8"
            ret_kwargs["kwargs_handlers"] = [AORecipeKwargs(config=config)]  # type: ignore
            os.environ["ACCELERATE_MIXED_PRECISION"] = "fp8"

        return ret_kwargs

    def log(self, logs: dict[str, float], start_time: float | None = None) -> None:
        """
        Log `logs` on the various objects watching training, including stored metrics.

        Args:
            logs: The values to log.
            start_time: The start of training.
        """
        # logs either has 'loss' or 'eval_loss'
        train_eval = "train" if "loss" in logs else "eval"
        metric_ndigits = int(os.getenv("AXOLOTL_METRIC_NDIGITS", "5"))

        for key, metric_data in self._stored_metrics[train_eval].items():
            values = torch.tensor(metric_data["values"])  # type: ignore[arg-type]
            reduction_type = metric_data["reduction"]

            fn = REDUCTION_FNS.get(reduction_type)
            if fn is None:
                raise NotImplementedError(
                    "Metric reduction must be one of [mean, min, max, sum]"
                )
            logs[key] = round(fn(values).item(), metric_ndigits)

        if "loss" in logs:
            try:
                logs["ppl"] = round(math.exp(logs["loss"]), metric_ndigits)
            except OverflowError:
                logs["ppl"] = float("inf")
        if "eval_loss" in logs:
            try:
                logs["eval_ppl"] = round(math.exp(logs["eval_loss"]), metric_ndigits)
            except OverflowError:
                logs["eval_ppl"] = float("inf")

        if is_main_process():
            # Add memory usage
            try:
                active, allocated, reserved = get_gpu_memory_usage()
                logs["memory/max_active (GiB)"] = round(active, 2)
                logs["memory/max_allocated (GiB)"] = round(allocated, 2)
                logs["memory/device_reserved (GiB)"] = round(reserved, 2)
            except (ValueError, TypeError, FileNotFoundError):
                pass

        if (
            self.args.include_tkps
            and train_eval == "train"
            and hasattr(self.state, "tokens")
        ):
            # each rank will log its own tokens per second
            # for logging_steps > 1 we obtain a moving average of this metric
            logs["tokens/train_per_sec_per_gpu"] = round(
                self.state.last_tokens_per_second.item() / self.args.logging_steps, 2
            )
            if "total" in self.state.tokens:
                logs["tokens/total"] = int(self.state.tokens["total"].item())
            if "trainable" in self.state.tokens:
                logs["tokens/trainable"] = int(self.state.tokens["trainable"].item())

        del self._stored_metrics[train_eval]

        return super().log(logs, start_time)

    def store_metrics(
        self,
        metrics: dict[str, float] | dict[str, tuple[int | float, str]],
        train_eval: Literal["train", "eval"] = "train",
        reduction: Literal["mean", "min", "max", "sum"] = "mean",
    ) -> None:
        """
        Store metrics with specified reduction type.

        Args:
            metrics: Dictionary of metric names to values, or metric names to (value,
                reduction_type) tuples.
            train_eval: Whether this is for training or evaluation.
        """
        for key, value in metrics.items():
            if isinstance(value, tuple):
                value, _reduction = value  # type: ignore[assignment]
            else:
                value, _reduction = value, reduction

            self._stored_metrics[train_eval][key]["values"].append(value)
            self._stored_metrics[train_eval][key]["reduction"] = _reduction

    def _save_checkpoint(self, model, trial, **kwargs):
        # make sure the checkpoint dir exists, since trainer is flakey
        checkpoint_folder = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}"
        run_dir = self._get_output_dir(trial=trial)
        output_dir = os.path.join(run_dir, checkpoint_folder)
        os.makedirs(output_dir, exist_ok=True)

        # Save total_tokens state if tracking is enabled
        if self.args.include_tkps and hasattr(self.state, "tokens"):
            tokens_state = {
                "total": int(torch.as_tensor(self.state.tokens.get("total", 0)).item()),
                "trainable": int(
                    torch.as_tensor(self.state.tokens.get("trainable", 0)).item()
                ),
            }
            tokens_state_path = os.path.join(output_dir, TOKENS_STATE_FILE)
            with open(tokens_state_path, "w", encoding="utf-8") as f:
                json.dump(tokens_state, f)

        return super()._save_checkpoint(model, trial, **kwargs)

    # TODO(wing): remove once https://github.com/huggingface/transformers/pull/39866/files is merged
    def _save(self, output_dir: Optional[str] = None, state_dict=None):
        # If we are executing this function, we are the process zero, so we don't check for that.
        output_dir = output_dir if output_dir is not None else self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        LOG.info(f"Saving model checkpoint to {output_dir}")

        # fix for Context Parallel save: CP eval invalidates tensor storage
        # pointers, so clone to CPU to get fresh valid storage for safetensors
        if (
            state_dict is not None
            and self.axolotl_cfg
            and self.axolotl_cfg.context_parallel_size
            and self.axolotl_cfg.context_parallel_size > 1
        ):
            state_dict = {
                k: v.detach().cpu() if isinstance(v, torch.Tensor) else v
                for k, v in state_dict.items()
            }

        supported_classes = (
            (PreTrainedModel,)
            if not is_peft_available()
            else (PreTrainedModel, PeftModel)
        )
        # Save a trained model and configuration using `save_pretrained()`.
        # They can then be reloaded using `from_pretrained()`
        if not isinstance(self.model, supported_classes):
            if state_dict is None:
                state_dict = self.model.state_dict()

            if isinstance(
                self.accelerator.unwrap_model(self.model, keep_torch_compile=False),
                supported_classes,
            ):
                self.accelerator.unwrap_model(
                    self.model, keep_torch_compile=False
                ).save_pretrained(
                    output_dir,
                    state_dict=state_dict,
                    is_main_process=self.accelerator.is_main_process,
                )
            else:
                LOG.info(
                    "Trainer.model is not a `PreTrainedModel`, only saving its state dict."
                )
                safetensors.torch.save_file(
                    state_dict,
                    os.path.join(output_dir, SAFE_WEIGHTS_NAME),
                    metadata={"format": "pt"},
                )
        else:
            self.model.save_pretrained(
                output_dir,
                state_dict=state_dict,
                is_main_process=self.accelerator.is_main_process,
            )

        if self.processing_class is not None:
            self.processing_class.save_pretrained(output_dir)
        elif (
            self.data_collator is not None
            and hasattr(self.data_collator, "tokenizer")
            and self.data_collator.tokenizer is not None
        ):
            LOG.info(
                "Saving Trainer.data_collator.tokenizer by default as Trainer.processing_class is `None`"
            )
            self.data_collator.tokenizer.save_pretrained(output_dir)

        # Good practice: save your training arguments together with the trained model
        torch.save(self.args, os.path.join(output_dir, TRAINING_ARGS_NAME))


================================================
FILE: src/axolotl/core/trainers/dpo/__init__.py
================================================
"""DPO Specific Strategy for training"""

from axolotl.core.trainers.dpo.trainer import AxolotlDPOTrainer
from axolotl.utils.schemas.enums import RLType


class DPOStrategy:
    """Strategy for DPO training"""

    @classmethod
    def get_trainer_class(cls):
        return AxolotlDPOTrainer

    @classmethod
    def get_training_args_class(cls):
        from axolotl.core.trainers.dpo.args import AxolotlDPOConfig

        return AxolotlDPOConfig

    @classmethod
    def set_training_args_kwargs(cls, cfg):
        training_args_kwargs = {}
        if cfg.rl is RLType.IPO:
            training_args_kwargs["loss_type"] = "ipo"
        # Label smoothing is not compatible with IPO
        if cfg.rl is RLType.DPO and cfg.dpo_label_smoothing:
            training_args_kwargs["label_smoothing"] = cfg.dpo_label_smoothing
        training_args_kwargs["max_length"] = cfg.sequence_len
        if cfg.dpo_use_weighting is not None:
            training_args_kwargs["use_weighting"] = cfg.dpo_use_weighting
        if cfg.dpo_padding_free is not None:
            training_args_kwargs["padding_free"] = cfg.dpo_padding_free
        if cfg.dpo_norm_loss is not None:
            training_args_kwargs["dpo_norm_loss"] = cfg.dpo_norm_loss
        if cfg.dpo_use_liger_kernel is not None:
            training_args_kwargs["use_liger_kernel"] = cfg.dpo_use_liger_kernel
        return training_args_kwargs


================================================
FILE: src/axolotl/core/trainers/dpo/args.py
================================================
"""
Axolotl specific DPO args
"""

from dataclasses import dataclass

from trl import DPOConfig

from axolotl.core.training_args import AxolotlTrainingMixins


@dataclass
class AxolotlDPOConfig(AxolotlTrainingMixins, DPOConfig):
    """
    DPO config for DPO training
    """

    dpo_norm_loss: bool | None = False


================================================
FILE: src/axolotl/core/trainers/dpo/trainer.py
================================================
"""DPO trainer for axolotl"""

import gc
from functools import wraps
from typing import Any, Dict, Union

import torch
from torch import nn
from trl import DPOTrainer

from axolotl.core.trainers.mixins import (
    DistributedParallelMixin,
    RngLoaderMixin,
    SchedulerMixin,
)
from axolotl.core.trainers.mixins.optimizer import OptimizerInitMixin, OptimizerMixin
from axolotl.core.trainers.utils import (
    sanitize_kwargs_for_ds_tagging,
    sanitize_kwargs_for_tagging,
)


class AxolotlDPOTrainer(
    RngLoaderMixin,
    SchedulerMixin,
    OptimizerMixin,
    OptimizerInitMixin,
    DPOTrainer,
    DistributedParallelMixin,
):
    """Extend the base DPOTrainer for axolotl helpers."""

    tag_names = ["axolotl", "dpo"]

    def __init__(self, *args, dataset_tags=None, **kwargs):
        super().__init__(*args, **kwargs)

        self.dataset_tags = dataset_tags
        self.optimizer = None
        self.model_accepts_loss_kwargs = False

    @wraps(DPOTrainer.push_to_hub)
    def push_to_hub(self, *args, **kwargs) -> str:
        """
        Overwrite the `push_to_hub` method in order to force-add the tags when pushing
        the model on the Hub. Please refer to `~transformers.Trainer.push_to_hub`
        for more details.
        """
        kwargs = sanitize_kwargs_for_ds_tagging(
            dataset_tags=self.dataset_tags, kwargs=kwargs
        )
        kwargs = sanitize_kwargs_for_tagging(tag_names=self.tag_names, kwargs=kwargs)

        return super().push_to_hub(*args, **kwargs)

    @staticmethod
    def tokenize_row(
        features,
        processing_class,
        max_prompt_length: int | None = None,
        max_completion_length: int | None = None,
        add_special_tokens: bool = True,
        is_chat: bool = False,
    ) -> Dict:
        res = DPOTrainer.tokenize_row(
            features,
            processing_class,
            max_prompt_length=max_prompt_length,
            max_completion_length=max_completion_length,
            add_special_tokens=add_special_tokens,
            is_chat=is_chat,
        )
        # fix when the tokenizer doesn't have a bos_token_id, e.g. Qwen
        if processing_class.bos_token is None and res["prompt_input_ids"][0] is None:
            for key in res.keys():
                res[key] = res[key][1:]

        if processing_class.bos_token and processing_class.bos_token_id is not None:
            # dpo trainer may incorrectly prepend the bos_token_id to the dpo outputs
            if res["chosen_input_ids"][0] == processing_class.bos_token_id:
                res["chosen_input_ids"] = res["chosen_input_ids"][1:]
            if res["rejected_input_ids"][0] == processing_class.bos_token_id:
                res["rejected_input_ids"] = res["rejected_input_ids"][1:]

        return res

    def training_step(
        self,
        model: nn.Module,
        inputs: Dict[str, Union[torch.Tensor, Any]],
        num_items_in_batch=None,
    ) -> torch.Tensor:
        loss: torch.Tensor = super().training_step(model, inputs, num_items_in_batch)
        gc.collect()
        torch.cuda.empty_cache()
        return loss

    def concatenated_forward(
        self,
        model: nn.Module,
        batch: dict[str, Union[list, torch.LongTensor]],
        is_ref_model: bool = False,
    ) -> dict[str, torch.Tensor]:
        if self.args.dpo_norm_loss:
            # fmt: off
            loss_type: list[str] = self.loss_type  # type: ignore[has-type]
            # fmt: on
            # concatenated_forward handles avg token logprob for ipo case already
            self.loss_type = ["ipo"]
            res = super().concatenated_forward(model, batch, is_ref_model=is_ref_model)
            self.loss_type = loss_type
            return res
        return super().concatenated_forward(model, batch, is_ref_model=is_ref_model)


================================================
FILE: src/axolotl/core/trainers/grpo/__init__.py
================================================
"""GRPO Specific Strategy for training"""

import importlib
import inspect
import os
from typing import Any

from huggingface_hub import snapshot_download
from requests import HTTPError
from trl.trainer.grpo_trainer import RewardFunc

from axolotl.core.trainers.grpo.args import AxolotlAsyncGRPOConfig, AxolotlGRPOConfig
from axolotl.core.trainers.grpo.trainer import (
    AxolotlAsyncGRPOTrainer,
    AxolotlGRPOSequenceParallelTrainer,
    AxolotlGRPOTrainer,
)
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.trl import TRLConfig
from axolotl.utils.schemas.vllm import VllmConfig

LOG = get_logger(__name__)


class GRPOStrategy:
    """Strategy for GRPO training"""

    @classmethod
    def get_trainer_class(
        cls,
        sequence_parallel: bool,
        async_grpo: bool = False,
    ) -> (
        type[AxolotlGRPOTrainer]
        | type[AxolotlGRPOSequenceParallelTrainer]
        | type[AxolotlAsyncGRPOTrainer]
    ):
        if sequence_parallel and async_grpo:
            raise ValueError(
                "sequence_parallel and async_grpo cannot both be enabled. "
                "Disable one of context_parallel_size > 1 or async_prefetch/use_data_producer."
            )
        if sequence_parallel:
            return AxolotlGRPOSequenceParallelTrainer
        if async_grpo:
            return AxolotlAsyncGRPOTrainer
        return AxolotlGRPOTrainer

    @classmethod
    def get_training_args_class(
        cls, async_grpo: bool = False
    ) -> type[AxolotlGRPOConfig] | type[AxolotlAsyncGRPOConfig]:
        if async_grpo:
            return AxolotlAsyncGRPOConfig
        return AxolotlGRPOConfig

    @classmethod
    def set_training_args_kwargs(cls, cfg: DictDefault) -> dict[str, Any]:
        grpo_args_kwargs: dict[str, Any] = {}

        if not hasattr(cfg, "trl") or not cfg.trl:
            return grpo_args_kwargs

        trl: TRLConfig = cfg.trl  # type: ignore
        vllm_cfg: VllmConfig = cfg.vllm  # type: ignore

        if trl.use_vllm:
            grpo_args_kwargs["use_vllm"] = trl.use_vllm
            if trl.vllm_mode:
                grpo_args_kwargs["vllm_mode"] = trl.vllm_mode
            if trl.vllm_mode == "colocate":
                grpo_args_kwargs["vllm_enable_sleep_mode"] = trl.vllm_enable_sleep_mode  # type: ignore[attr-defined]
                grpo_args_kwargs["vllm_gpu_memory_utilization"] = (
                    vllm_cfg.gpu_memory_utilization
                )
                grpo_args_kwargs["vllm_tensor_parallel_size"] = (
                    vllm_cfg.tensor_parallel_size
                )
            grpo_args_kwargs["vllm_server_host"] = trl.vllm_server_host or trl.vllm.host  # type: ignore[attr-defined]
            grpo_args_kwargs["vllm_server_port"] = trl.vllm_server_port or trl.vllm.port  # type: ignore[attr-defined]
            if trl.vllm_server_timeout:
                grpo_args_kwargs["vllm_server_timeout"] = trl.vllm_server_timeout
            if trl.vllm_guided_decoding_regex:
                grpo_args_kwargs["vllm_guided_decoding_regex"] = (
                    trl.vllm_guided_decoding_regex
                )

        if trl.num_generations:
            grpo_args_kwargs["num_generations"] = trl.num_generations

        if trl.sync_ref_model:
            grpo_args_kwargs["sync_ref_model"] = trl.sync_ref_model

            if trl.ref_model_mixup_alpha:
                grpo_args_kwargs["ref_model_mixup_alpha"] = trl.ref_model_mixup_alpha

            if trl.ref_model_sync_steps:
                grpo_args_kwargs["ref_model_sync_steps"] = trl.ref_model_sync_steps

        grpo_args_kwargs["max_completion_length"] = trl.max_completion_length
        grpo_args_kwargs["log_completions"] = trl.log_completions
        grpo_args_kwargs["num_completions_to_print"] = trl.num_completions_to_print

        if cfg.context_parallel_size > 1:
            grpo_args_kwargs["context_parallel_size"] = cfg.context_parallel_size

        if trl.importance_sampling_level is not None:
            grpo_args_kwargs["importance_sampling_level"] = (
                trl.importance_sampling_level
            )

        if trl.reward_weights:
            grpo_args_kwargs["reward_weights"] = trl.reward_weights

        if trl.scale_rewards is not None:
            grpo_args_kwargs["scale_rewards"] = trl.scale_rewards

        if trl.loss_type is not None:
            grpo_args_kwargs["loss_type"] = trl.loss_type
        if trl.mask_truncated_completions is not None:
            grpo_args_kwargs["mask_truncated_completions"] = (
                trl.mask_truncated_completions
            )

        if trl.temperature is not None:
            grpo_args_kwargs["temperature"] = trl.temperature
        if trl.top_p is not None:
            grpo_args_kwargs["top_p"] = trl.top_p
        if trl.top_k is not None:
            grpo_args_kwargs["top_k"] = trl.top_k
        if trl.min_p is not None:
            grpo_args_kwargs["min_p"] = trl.min_p
        if trl.repetition_penalty is not None:
            grpo_args_kwargs["repetition_penalty"] = trl.repetition_penalty

        if trl.num_iterations is not None:
            grpo_args_kwargs["num_iterations"] = trl.num_iterations
        if trl.epsilon is not None:
            grpo_args_kwargs["epsilon"] = trl.epsilon
        if trl.epsilon_high is not None:
            grpo_args_kwargs["epsilon_high"] = trl.epsilon_high

        if trl.use_liger_loss is not None:
            grpo_args_kwargs["use_liger_kernel"] = trl.use_liger_loss

        if trl.multi_objective_aggregation is not None:
            grpo_args_kwargs["multi_objective_aggregation"] = (
                trl.multi_objective_aggregation
            )

        # Async GRPO fields
        if getattr(trl, "use_data_producer", None) is not None:
            grpo_args_kwargs["use_data_producer"] = trl.use_data_producer
        if getattr(trl, "async_prefetch", None) is not None:
            grpo_args_kwargs["async_prefetch"] = trl.async_prefetch
        if getattr(trl, "prefetch_depth", None) is not None:
            grpo_args_kwargs["prefetch_depth"] = trl.prefetch_depth
        if getattr(trl, "vllm_sync_interval", None) is not None:
            grpo_args_kwargs["vllm_sync_interval"] = trl.vllm_sync_interval
        if getattr(trl, "streaming_partial_batch", None) is not None:
            grpo_args_kwargs["streaming_partial_batch"] = trl.streaming_partial_batch
        if getattr(trl, "streaming_min_groups", None) is not None:
            grpo_args_kwargs["streaming_min_groups"] = trl.streaming_min_groups
        if getattr(trl, "vllm_importance_sampling_correction", None) is not None:
            grpo_args_kwargs["vllm_importance_sampling_correction"] = (
                trl.vllm_importance_sampling_correction
            )
        if getattr(trl, "vllm_importance_sampling_mode", None) is not None:
            grpo_args_kwargs["vllm_importance_sampling_mode"] = (
                trl.vllm_importance_sampling_mode
            )
        if getattr(trl, "vllm_importance_sampling_cap", None) is not None:
            grpo_args_kwargs["vllm_importance_sampling_cap"] = (
                trl.vllm_importance_sampling_cap
            )
        if getattr(trl, "off_policy_mask_threshold", None) is not None:
            grpo_args_kwargs["off_policy_mask_threshold"] = (
                trl.off_policy_mask_threshold
            )
        if getattr(trl, "use_bias_correction_kl", None) is not None:
            grpo_args_kwargs["use_bias_correction_kl"] = trl.use_bias_correction_kl

        # Fast Async GRPO fields
        if getattr(trl, "reward_num_workers", None) is not None:
            grpo_args_kwargs["reward_num_workers"] = trl.reward_num_workers
        if getattr(trl, "replay_buffer_size", None) is not None:
            grpo_args_kwargs["replay_buffer_size"] = trl.replay_buffer_size
        if getattr(trl, "replay_recompute_logps", None) is not None:
            grpo_args_kwargs["replay_recompute_logps"] = trl.replay_recompute_logps
        if getattr(trl, "reroll_start_fraction", None) is not None:
            grpo_args_kwargs["reroll_start_fraction"] = trl.reroll_start_fraction
        if getattr(trl, "reroll_max_groups", None) is not None:
            grpo_args_kwargs["reroll_max_groups"] = trl.reroll_max_groups
        if getattr(trl, "skip_zero_advantage_batches", None) is not None:
            grpo_args_kwargs["skip_zero_advantage_batches"] = (
                trl.skip_zero_advantage_batches
            )
        if getattr(trl, "vllm_lora_sync", None) is not None:
            grpo_args_kwargs["vllm_lora_sync"] = trl.vllm_lora_sync

        return grpo_args_kwargs

    @classmethod
    def set_trainer_args(cls, cfg: DictDefault) -> list[Any]:
        trainer_args = []
        if cfg.trl and cfg.trl.reward_funcs:
            reward_funcs = []
            for reward_func_fqn in cfg.trl.reward_funcs:
                reward_funcs.append(cls.get_reward_func(reward_func_fqn))
            trainer_args.append(reward_funcs)

        return trainer_args

    @classmethod
    def set_trainer_kwargs(cls, cfg: DictDefault) -> dict[str, Any]:
        trainer_kwargs = {}
        if cfg.trl and cfg.trl.reward_processing_classes:
            trainer_kwargs["reward_processing_classes"] = (
                cfg.trl.reward_processing_classes
            )
        if cfg.trl and cfg.trl.rollout_func:
            trainer_kwargs["rollout_func"] = cls.get_rollout_func(cfg.trl.rollout_func)

        return trainer_kwargs

    @classmethod
    def get_collator(cls, *args, **kwargs):
        # No data collation is needed in GRPO, handled by trl's trainer __init__
        return None

    @classmethod
    def get_blocklist_args_kwargs(cls) -> list[str]:
        return [
            "dataset_num_proc",
            "max_length",
            "include_tokens_per_second",
            "max_prompt_length",
        ]

    @classmethod
    def get_reward_func(cls, reward_func_fqn: str) -> RewardFunc:
        """
        Returns the reward function from the given fully qualified name, or the path to the reward function model.

        Args:
            reward_func_fqn (str): Fully qualified name of the reward function (e.g. r1_grpo.gsm8k_transform),
                or a HF hub path to the reward model.

        Returns:
            RewardFunc: A callable that accepts prompts and completions and returns rewards,
                or a path to a reward model.

        Raises:
            ValueError: If the reward function does not accept at least two arguments.
        """
        try:
            # use importlib to dynamically load the reward function from the module
            reward_func_module_name = reward_func_fqn.split(".")[-1]
            reward_func_module = importlib.import_module(
                ".".join(reward_func_fqn.split(".")[:-1])
            )
            reward_func = getattr(reward_func_module, reward_func_module_name)
            if not len(inspect.signature(reward_func).parameters) >= 2:
                raise ValueError(
                    "Reward function must accept at least two arguments: prompts: list and completions: list"
                )
            return reward_func
        except ModuleNotFoundError as exc:
            # the user has passed a string (ideally indicating the path of a reward model)
            # check if it's a local dir path and not empty dir to a reward model
            pretrained_log_msg = f"Reward function {reward_func_fqn} is a pre-trained model path - if this is unexpected, please check the reward function path."
            if os.path.isdir(reward_func_fqn) and os.listdir(reward_func_fqn):
                LOG.info(pretrained_log_msg)
                return reward_func_fqn
            try:
                snapshot_download(reward_func_fqn, repo_type="model")
                LOG.info(pretrained_log_msg)
                return reward_func_fqn
            except HTTPError:
                raise ValueError(
                    f"Reward function {reward_func_fqn} not found."
                ) from exc

    @classmethod
    def get_rollout_func(cls, rollout_func_fqn: str):
        """
        Returns the rollout function from the given fully qualified name.

        Args:
            rollout_func_fqn (str): Fully qualified name of the rollout function
                                    (e.g. my_module.my_rollout_func)

        Returns:
            Callable rollout function
        """
        try:
            rollout_func_module_name = rollout_func_fqn.split(".")[-1]
            rollout_func_module = importlib.import_module(
                ".".join(rollout_func_fqn.split(".")[:-1])
            )
            rollout_func = getattr(rollout_func_module, rollout_func_module_name)

            if not callable(rollout_func):
                raise ValueError(
                    f"Rollout function {rollout_func_fqn} must be callable"
                )

            return rollout_func

        except ModuleNotFoundError as exc:
            raise ValueError(f"Rollout function {rollout_func_fqn} not found.") from exc


================================================
FILE: src/axolotl/core/trainers/grpo/args.py
================================================
"""
Axolotl Specific Training Args
"""

from dataclasses import dataclass

from trl import GRPOConfig

from axolotl.core.trainers.grpo.fast_async_trainer import FastAsyncGRPOConfig
from axolotl.core.training_args import AxolotlTrainingMixins


@dataclass
class AxolotlGRPOConfig(AxolotlTrainingMixins, GRPOConfig):
    """Axolotl GRPO Config for GRPO training"""

    context_parallel_size: int | None = None


@dataclass
class AxolotlAsyncGRPOConfig(AxolotlTrainingMixins, FastAsyncGRPOConfig):
    """Axolotl Async GRPO Config — adds async prefetch, streaming scoring, and IS correction."""

    context_parallel_size: int | None = None


================================================
FILE: src/axolotl/core/trainers/grpo/async_trainer.py
================================================
"""
Async GRPO training with streaming scoring and IS correction.

Works on stock TRL v0.29.0 and transformers v5.3.0 — no custom branches needed.

Features:
  - Async prefetch: background thread generates completions via vLLM while the main
    thread trains on the previous rollout.
  - Deferred scoring: rewards, advantages, and policy logprobs computed on the main
    thread (thread-safe with GPU forward passes).
  - Streaming group scoring: scores prompt groups incrementally so that reward
    computation overlaps with the next group's logprob computation.
  - Importance sampling (IS) correction: corrects for stale vLLM weights.
  - Off-Policy Sequence Mask (OPSM): drops sequences with high KL + negative advantage.
  - Configurable vLLM weight sync interval.

Classes exported:
  - AsyncGRPOConfig: GRPOConfig extended with async/streaming/IS fields
  - AsyncGRPOTrainer: GRPOTrainer with async prefetch and IS correction
  - ProducerConfig, DataProducer, BaseDataProducer, AsyncDataProducer: data producer protocol
"""

import atexit
import concurrent.futures
import logging
import queue
import threading
from abc import ABC, abstractmethod
from collections import deque
from contextlib import nullcontext
from dataclasses import dataclass, field
from typing import Any

import torch
from torch.utils.data import DataLoader, Dataset
from trl.extras.profiling import profiling_decorator
from trl.trainer import GRPOConfig, GRPOTrainer
from trl.trainer.utils import (
    RepeatSampler,
    entropy_from_logits,
    nanmax,
    nanmin,
    nanstd,
    pad,
    selective_log_softmax,
    shuffle_sequence_dict,
    split_pixel_values_by_grid,
    split_tensor_dict,
    unsplit_pixel_values_by_grid,
)

try:
    from trl.data_utils import (
        apply_chat_template,
        is_conversational,
        prepare_multimodal_messages,
    )
except ImportError:
    from trl.chat_template_utils import apply_chat_template
    from trl.data_utils import is_conversational, prepare_multimodal_messages

try:
    from trl.models.utils import disable_gradient_checkpointing
except ImportError:
    from contextlib import contextmanager

    @contextmanager
    def disable_gradient_checkpointing(model, kwargs):
        yield


try:
    from accelerate.utils import gather_object
except ImportError:
    gather_object = None

try:
    from peft import PeftModel
    from trl.trainer.utils import use_adapter
except ImportError:
    PeftModel = None
    use_adapter = nullcontext

try:
    from liger_kernel.ops.grpo_loss import (
        fused_selective_log_softmax as _fused_selective_log_softmax,
    )
except ImportError:
    _fused_selective_log_softmax = None


# ---------------------------------------------------------------------------
# Config
# ---------------------------------------------------------------------------


@dataclass
class AsyncGRPOConfig(GRPOConfig):
    """GRPOConfig extended with async prefetch, streaming scoring, and IS correction fields.

    Fields already present in stock GRPOConfig (e.g. ``importance_sampling_level``,
    ``multi_objective_aggregation``) are listed here for safety: if the stock version
    does not define them, the defaults below ensure everything works.
    """

    # --- Data producer ---
    use_data_producer: bool = field(
        default=False,
        metadata={
            "help": "Use the GRPODataProducer protocol for online data generation."
        },
    )

    # --- Async data production ---
    async_prefetch: bool = field(
        default=False,
        metadata={
            "help": "Generate rollouts in a background thread while training on the previous rollout."
        },
    )
    prefetch_depth: int = field(
        default=1,
        metadata={"help": "Number of rollouts to prefetch ahead of training."},
    )
    vllm_sync_interval: int = field(
        default=1,
        metadata={
            "help": "Sync model weights to vLLM every N optimizer steps (async mode only)."
        },
    )

    # --- Streaming scoring ---
    streaming_partial_batch: bool = field(
        default=False,
        metadata={
            "help": "Score prompt groups incrementally instead of the full batch at once."
        },
    )
    streaming_min_groups: int = field(
        default=1,
        metadata={"help": "Minimum prompt groups to score per streaming chunk."},
    )

    # --- vLLM importance sampling correction ---
    vllm_importance_sampling_correction: bool = field(
        default=True,
        metadata={
            "help": "Apply IS correction for distribution mismatch between vLLM and training model."
        },
    )
    vllm_importance_sampling_mode: str = field(
        default="token_truncate",
        metadata={
            "help": "IS mode: token_truncate, token_mask, sequence_truncate, or sequence_mask."
        },
    )
    vllm_importance_sampling_cap: float = field(
        default=3.0,
        metadata={"help": "Cap C for IS ratio clipping/masking."},
    )

    # --- Off-policy sequence mask (OPSM) ---
    off_policy_mask_threshold: float | None = field(
        default=None,
        metadata={"help": "KL threshold for OPSM (DeepSeek-V3.2). None = disabled."},
    )

    # --- Bias-corrected KL ---
    use_bias_correction_kl: bool = field(
        default=False,
        metadata={"help": "Apply IS correction to KL divergence term."},
    )


# ---------------------------------------------------------------------------
# Data Producer Protocol (standalone — no transformers branch needed)
# ---------------------------------------------------------------------------

logger = logging.getLogger(__name__)
_dp_logger = logging.getLogger(__name__ + ".data_producer")


@dataclass
class ProducerConfig:
    """Configuration for a :class:`DataProducer`.

    Args:
        mini_epochs: Number of training passes over each produced dataset.
        max_rollouts: Maximum number of produce-then-train rounds (None = unlimited).
        steps_per_generation: Optimisation steps per produced dataset before regenerating.
        num_iterations: Number of times to reuse each generation across optimisation steps.
        async_prefetch: Produce the next dataset in a background thread.
        prefetch_depth: How many rollouts to queue ahead when async.
        sync_warmup_rollouts: Initial on-policy rollouts before switching to async.
        eval_during_produce: Switch model to eval() during produce().
        empty_cache_before_produce: torch.cuda.empty_cache() before produce().
        empty_cache_after_produce: torch.cuda.empty_cache() after produce().
    """

    mini_epochs: int = 1
    max_rollouts: int | None = None
    steps_per_generation: int | None = None
    num_iterations: int = 1
    async_prefetch: bool = False
    prefetch_depth: int = 1
    sync_warmup_rollouts: int = 0
    eval_during_produce: bool = True
    empty_cache_before_produce: bool = False
    empty_cache_after_produce: bool = False

    def __post_init__(self):
        if self.mini_epochs < 1:
            raise ValueError(f"mini_epochs must be >= 1, got {self.mini_epochs}")
        if self.max_rollouts is not None and self.max_rollouts < 1:
            raise ValueError(
                f"max_rollouts must be >= 1 or None, got {self.max_rollouts}"
            )
        if self.num_iterations < 1:
            raise ValueError(f"num_iterations must be >= 1, got {self.num_iterations}")
        if self.steps_per_generation is not None and self.steps_per_generation < 1:
            raise ValueError(
                f"steps_per_generation must be >= 1 or None, got {self.steps_per_generation}"
            )
        if self.prefetch_depth < 1:
            raise ValueError(f"prefetch_depth must be >= 1, got {self.prefetch_depth}")
        if self.sync_warmup_rollouts < 0:
            raise ValueError(
                f"sync_warmup_rollouts must be >= 0, got {self.sync_warmup_rollouts}"
            )


class DataProducer(ABC):
    """Abstract base class for online data producers.

    Subclass this and implement :meth:`produce` to supply fresh training data
    each rollout round.
    """

    config: ProducerConfig

    @abstractmethod
    def produce(
        self,
        model: Any,
        global_step: int,
        *,
        processing_class: Any = None,
        accelerator: Any = None,
        args: Any = None,
        **kwargs,
    ) -> Dataset:
        """Generate a fresh training dataset."""
        ...


class BaseDataProducer(DataProducer):
    """Convenience base class with a default :class:`ProducerConfig` and lifecycle hooks."""

    def __init__(self, config: ProducerConfig | None = None):
        self.config = config or ProducerConfig()

    def on_rollout_begin(self, global_step: int) -> None:
        """Called before each produce() invocation."""

    def on_rollout_end(self, dataset: Dataset, global_step: int) -> None:
        """Called after each produce() invocation with the produced dataset."""


class AsyncDataProducer:
    """Wraps a synchronous :class:`DataProducer` for background-thread data generation.

    While the Trainer trains on the current rollout, this wrapper produces upcoming
    datasets in a background thread.

    FSDP compatibility: Background threads must NOT call cross-rank collectives
    (gather_object, broadcast_object_list, FSDP all-gather) because the main thread
    may be doing FSDP forward/backward concurrently, causing deadlocks. When
    ``num_processes > 1``, only rank 0 runs BG generation; results are broadcast
    to other ranks on the main thread when ``produce()`` is next called.
    """

    def __init__(
        self, inner: DataProducer, background_produce_kwargs: dict | None = None
    ):
        self._inner = inner
        self._depth = inner.config.prefetch_depth
        self._warmup_remaining = inner.config.sync_warmup_rollouts
        self._background_kwargs = background_produce_kwargs or {}
        self._executor = concurrent.futures.ThreadPoolExecutor(
            max_workers=1, thread_name_prefix="async-producer"
        )
        self._queue: deque[concurrent.futures.Future] = deque()
        self._initialized = False
        # Lock held by the background thread during vLLM generation.
        # The main thread acquires this lock for weight sync to ensure
        # merge_adapter/unmerge_adapter don't overlap with generation.
        self._generate_lock = threading.Lock()
        # Detected at first produce() call
        self._num_processes: int | None = None
        self._is_main: bool | None = None

    @property
    def config(self) -> ProducerConfig:
        return self._inner.config

    def produce(self, model: Any, global_step: int, **kwargs) -> Dataset:
        """Return the next dataset, blocking if the prefetch hasn't finished."""
        # Detect multi-process on first call
        if self._num_processes is None:
            accelerator = kwargs.get("accelerator")
            if accelerator is not None:
                self._num_processes = accelerator.num_processes
                self._is_main = accelerator.is_main_process
            else:
                self._num_processes = 1
                self._is_main = True

        # During warmup, produce synchronously (on-policy)
        if self._warmup_remaining > 0:
            self._warmup_remaining -= 1
            _dp_logger.info(
                f"AsyncDataProducer: sync warmup rollout (remaining={self._warmup_remaining})"
            )
            return self._inner.produce(model, global_step, **kwargs)

        if not self._initialized:
            dataset = self._inner.produce(model, global_step, **kwargs)
            bg_kwargs = {**kwargs, **self._background_kwargs}
            # With FSDP (multi-process), only submit BG tasks on rank 0.
            # Non-rank-0 processes will receive data via broadcast.
            if self._num_processes > 1:
                bg_kwargs["_rank0_only"] = True
            for i in range(1, self._depth + 1):
                self._queue.append(
                    self._executor.submit(
                        self._locked_produce, model, global_step + i, **bg_kwargs
                    )
                )
            self._initialized = True
            return dataset

        # Get the pre-generated dataset from the BG thread
        dataset = self._queue.popleft().result()

        # With FSDP: BG thread only ran on rank 0. Broadcast to all ranks.
        if self._num_processes > 1:
            dataset = self._broadcast_dataset(dataset)

        bg_kwargs = {**kwargs, **self._background_kwargs}
        if self._num_processes > 1:
            bg_kwargs["_rank0_only"] = True
        next_step = global_step + self._depth
        self._queue.append(
            self._executor.submit(self._locked_produce, model, next_step, **bg_kwargs)
        )
        return dataset

    def _broadcast_dataset(self, dataset) -> Dataset:
        """Broadcast a prefetched dataset from rank 0 to all ranks (main thread).

        Rank 0 has a full RolloutDataset from BG generation; other ranks have None.
        After broadcast, tensors are moved to each rank's local device.
        """
        import torch.distributed as dist

        if not dist.is_initialized():
            return dataset

        # Rank 0 sends _data dict; others receive it
        obj_list = [dataset._data if self._is_main else None]
        dist.broadcast_object_list(obj_list, src=0)

        data: dict[str, Any] = obj_list[0]  # type: ignore[assignment]

        # Move tensors to local device (broadcast_object_list deserializes to CPU)
        accelerator = self._inner._trainer.accelerator  # type: ignore[attr-defined]
        device = accelerator.device
        for key, val in data.items():
            if isinstance(val, torch.Tensor) and val.device != device:
                data[key] = val.to(device)

        if not self._is_main:
            from axolotl.core.trainers.grpo.async_trainer import RolloutDataset

            dataset = RolloutDataset(data)
        else:
            # Rank 0 already has the dataset, but update _data with device-moved tensors
            dataset._data = data
        return dataset

    def _locked_produce(self, model: Any, global_step: int, **kwargs) -> Dataset:
        """Run produce while holding the generate lock."""
        with self._generate_lock:
            return self._inner.produce(model, global_step, **kwargs)

    def on_rollout_begin(self, global_step: int) -> None:
        if hasattr(self._inner, "on_rollout_begin"):
            self._inner.on_rollout_begin(global_step)

    def on_rollout_end(self, dataset: Dataset, global_step: int) -> None:
        if hasattr(self._inner, "on_rollout_end"):
            self._inner.on_rollout_end(dataset, global_step)

    def shutdown(self) -> None:
        """Shut down the background thread pool and cancel pending futures."""
        for future in self._queue:
            future.cancel()
        self._queue.clear()
        self._executor.shutdown(wait=False)


class DataProducerCallback:
    """Marker class: if a DataProducer also inherits from this, the Trainer will
    automatically register it as a callback."""

    pass


# ---------------------------------------------------------------------------
# RolloutDataset + GRPODataProducer
# ---------------------------------------------------------------------------


class RolloutDataset(Dataset):
    """A Dataset wrapping the output dict from _generate_and_score_completions.

    Per-sample tensors are sliced by index; shared metadata is passed through.
    """

    _ALWAYS_SHARED = frozenset(
        {"num_items_in_batch", "_pending_policy_logps", "_rank0_only"}
    )

    def __init__(self, data: dict[str, Any]):
        self._data = data
        self._shared_keys: set[str] = set()
        self._sample_keys: set[str] = set()

        for key, val in data.items():
            if key in self._ALWAYS_SHARED:
                self._shared_keys.add(key)
            elif not isinstance(val, torch.Tensor):
                self._shared_keys.add(key)
            elif val.dim() == 0:
                self._shared_keys.add(key)
            else:
                self._sample_keys.add(key)

        self._num_samples = 0
        for key in self._sample_keys:
            n = data[key].size(0)
            if self._num_samples == 0:
                self._num_samples = n
            elif n != self._num_samples:
                raise ValueError(
                    f"Inconsistent sample count: key '{key}' has {n}, expected {self._num_samples}"
                )
        if self._num_samples == 0:
            raise ValueError("No per-sample tensors found in rollout data")

    def __len__(self) -> int:
        return self._num_samples

    def __getitem__(self, idx: int) -> dict[str, Any]:
        item: dict[str, Any] = {}
        for key in self._sample_keys:
            item[key] = self._data[key][idx]
        for key in self._shared_keys:
            item[key] = self._data[key]
        return item


def make_rollout_collator(shared_keys: set[str]):
    """Return a collator that stacks per-sample tensors and passes shared keys through."""

    def _collate(batch: list[dict[str, Any]]) -> dict[str, Any]:
        result: dict[str, Any] = {}
        for key in batch[0]:
            if key in shared_keys:
                result[key] = batch[0][key]
            else:
                values = [item[key] for item in batch]
                if isinstance(values[0], torch.Tensor):
                    result[key] = torch.stack(values)
                else:
                    result[key] = values
        return result

    return _collate


class GRPODataProducer(BaseDataProducer):
    """Produces GRPO training rollouts using the trainer's generation pipeline.

    Created before Trainer.__init__ completes; the trainer reference is injected
    later via set_trainer().
    """

    def __init__(
        self,
        config: ProducerConfig,
        prompt_dataset,
        *,
        num_generations: int,
        generation_batch_size: int,
        train_batch_size: int,
        steps_per_generation: int,
        shuffle_dataset: bool,
        seed: int,
    ):
        super().__init__(config)
        self._dataset = prompt_dataset
        self._num_generations = num_generations
        self._generation_batch_size = generation_batch_size
        self._train_batch_size = train_batch_size
        self._steps_per_generation = steps_per_generation
        self._shuffle_dataset = shuffle_dataset
        self._seed = seed
        self._trainer: Any = None
        self._prompt_dl: Any = None
        self._prompt_iter: Any = None

    def set_trainer(self, trainer) -> None:
        """Inject the live trainer reference and create the prompt DataLoader."""
        self._trainer = trainer
        self._init_prompt_dataloader()

    def _init_prompt_dataloader(self) -> None:
        from functools import partial

        from transformers.trainer_utils import seed_worker

        trainer = self._trainer
        sampler = RepeatSampler(
            data_source=self._dataset,
            mini_repeat_count=self._num_generations,
            batch_size=self._generation_batch_size // self._num_generations,
            repeat_count=1,
            shuffle=self._shuffle_dataset,
            seed=self._seed,
        )

        # Use identity collator (same as stock GRPOTrainer)
        def _identity(x):
            return x

        dl = DataLoader(
            self._dataset,
            batch_size=self._train_batch_size * self._steps_per_generation,
            sampler=sampler,
            collate_fn=_identity,
            num_workers=trainer.args.dataloader_num_workers,
            pin_memory=trainer.args.dataloader_pin_memory,
            persistent_workers=trainer.args.dataloader_persistent_workers,
            worker_init_fn=partial(
                seed_worker,
                num_workers=trainer.args.dataloader_num_workers,
                rank=trainer.args.process_index,
            ),
        )
        self._prompt_dl = trainer.accelerator.prepare(dl)

        # Don't let accelerator track this dataloader
        acc_dls = trainer.accelerator._dataloaders
        if self._prompt_dl in acc_dls:
            acc_dls.remove(self._prompt_dl)

        self._prompt_iter = iter(self._prompt_dl)

    def produce(
        self,
        model: Any,
        global_step: int,
        *,
        skip_policy_logps: bool = False,
        processing_class: Any = None,
        accelerator: Any = None,
        args: Any = None,
        _rank0_only: bool = False,
        **kwargs,
    ) -> RolloutDataset | None:
        """Generate a fresh GRPO training rollout."""
        is_main = self._trainer.accelerator.is_main_process

        # FSDP rank0-only mode: non-rank-0 returns None (broadcast fills it later)
        if _rank0_only and not is_main:
            return None

        try:
            inputs = next(self._prompt_iter)
        except StopIteration:
            self._prompt_iter = iter(self._prompt_dl)
            inputs = next(self._prompt_iter)

        if skip_policy_logps:
            # Async path: use _generate_only (generation without scoring) which
            # works on stock TRL (no skip_policy_logps parameter needed).
            output = self._trainer._generate_only(inputs, rank0_only=_rank0_only)
        else:
            # Sync path: full generation + scoring
            output = self._trainer._generate_and_score_completions(inputs)

            # Strip non-sequence metadata before shuffling
            metadata = {}
            for key in list(output.keys()):
                val = output[key]
                if not isinstance(val, (torch.Tensor, list)):
                    metadata[key] = output.pop(key)
                elif isinstance(val, torch.Tensor) and val.dim() == 0:
                    metadata[key] = output.pop(key)

            output = shuffle_sequence_dict(output)
            output.update(metadata)

        return RolloutDataset(output)


# ---------------------------------------------------------------------------
# Trainer
# ---------------------------------------------------------------------------


class AsyncGRPOTrainer(GRPOTrainer):
    """GRPOTrainer with async prefetch, streaming scoring, and IS correction.

    Drop-in replacement: pass ``AsyncGRPOConfig`` as ``args`` and use this trainer
    instead of ``GRPOTrainer``.
    """

    def __init__(self, *args, **kwargs):
        # When using native LoRA sync, skip the NCCL communicator init in VLLMGeneration.
        # The communicator is not needed because weight sync happens via filesystem + HTTP,
        # and it fails when vLLM and a trainer rank share the same CUDA device.
        training_args = kwargs.get("args") or (args[1] if len(args) > 1 else None)
        if training_args is not None and getattr(
            training_args, "vllm_lora_sync", False
        ):
            from trl.generation.vllm_generation import VLLMGeneration

            _orig_init_vllm = VLLMGeneration._init_vllm

            def _init_vllm_no_communicator(self_vllm):
                """Init vLLM client without NCCL communicator (LoRA sync uses filesystem)."""
                if self_vllm.mode == "server" and self_vllm.accelerator.is_main_process:
                    from trl.generation.vllm_client import VLLMClient

                    if self_vllm.server_base_url is not None:
                        base_url = self_vllm.server_base_url
                    else:
                        base_url = (
                            f"http://{self_vllm.server_host}:{self_vllm.server_port}"
                        )
                    self_vllm.vllm_client = VLLMClient(
                        base_url=base_url,
                        group_port=self_vllm.group_port,
                        connection_timeout=self_vllm.server_timeout,
                    )
                    # Deliberately skip init_communicator — no NCCL needed
                elif self_vllm.mode != "server":
                    _orig_init_vllm(self_vllm)

            VLLMGeneration._init_vllm = _init_vllm_no_communicator

        super().__init__(*args, **kwargs)

        # FP8 models: zero out the pad token embedding so that padding
        # positions have zero hidden states throughout the network.
        # FP8 linear layers produce NaN on non-zero inputs at masked
        # positions (the Triton fp8 matmul kernel can't handle the
        # extreme values that accumulate at unattended positions).
        self._zero_pad_embedding_for_fp8()

        # Ensure custom attributes exist (stock GRPOTrainer.__init__ may not set them).
        for attr, cfg_key, default in [
            (
                "vllm_importance_sampling_correction",
                "vllm_importance_sampling_correction",
                True,
            ),
            (
                "vllm_importance_sampling_mode",
                "vllm_importance_sampling_mode",
                "token_truncate",
            ),
            ("vllm_importance_sampling_cap", "vllm_importance_sampling_cap", 3.0),
            ("off_policy_mask_threshold", "off_policy_mask_threshold", None),
        ]:
            if not hasattr(self, attr):
                setattr(self, attr, getattr(self.args, cfg_key, default))

        # Async state
        self._async_queue: queue.Queue | None = None
        self._executor: concurrent.futures.ThreadPoolExecutor | None = None
        self._prompt_iter = None
        self._last_synced_step = -1
        self._buffered_inputs: list | None = None  # override stock attr

        # Data producer (the proper architecture for async generation)
        self.data_producer = None
        if getattr(self.args, "use_data_producer", False):
            self.data_producer = self._create_data_producer(
                kwargs["args"], kwargs["train_dataset"]
            )

        if self.args.async_prefetch and self.data_producer is None:
            # Legacy path: direct _prepare_inputs override without data producer
            self._setup_async()

    def _create_data_producer(self, args, train_dataset):
        """Create and return the GRPODataProducer (possibly wrapped in AsyncDataProducer)."""
        producer_config = ProducerConfig(
            mini_epochs=args.num_iterations,
            max_rollouts=None,
            eval_during_produce=False,
            empty_cache_before_produce=True,
            empty_cache_after_produce=True,
            async_prefetch=args.async_prefetch,
            prefetch_depth=args.prefetch_depth,
        )
        data_producer = GRPODataProducer(
            config=producer_config,
            prompt_dataset=train_dataset,
            num_generations=self.num_generations,
            generation_batch_size=args.generation_batch_size,
            train_batch_size=args.per_device_train_batch_size,
            steps_per_generation=args.steps_per_generation,
            shuffle_dataset=getattr(self, "shuffle_dataset", True),
            seed=args.seed,
        )
        data_producer.set_trainer(self)

        if args.async_prefetch:
            data_producer = AsyncDataProducer(
                data_producer,
                background_produce_kwargs={"skip_policy_logps": True},
            )
        return data_producer

    # ------------------------------------------------------------------
    # Async setup / teardown
    # ------------------------------------------------------------------

    def _setup_async(self):
        """Create background thread pool, prompt iterator, and pre-fill the async queue."""
        gen_batch_size = getattr(
            self.args,
            "generation_batch_size",
            self._train_batch_size * self.args.gradient_accumulation_steps,
        )
        # RepeatSampler groups prompts with num_generations repetitions each.
        # DataLoader batches the yielded indices into generation-sized batches.
        sampler = RepeatSampler(
            data_source=self.train_dataset,
            mini_repeat_count=self.num_generations,
            batch_size=gen_batch_size // self.num_generations,
            repeat_count=10_000,  # effectively infinite
            shuffle=True,
            seed=self.args.seed,
        )
        self._prompt_dataloader = DataLoader(
            self.train_dataset,
            batch_size=gen_batch_size,
            sampler=sampler,
            collate_fn=self.data_collator,
            num_workers=0,
        )
        self._prompt_iter = iter(self._prompt_dataloader)
        self._async_queue = queue.Queue(maxsize=self.args.prefetch_depth)
        self._executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)

        # Pre-submit generations to fill the queue
        for _ in range(self.args.prefetch_depth):
            self._submit_generation()

        atexit.register(self._shutdown_async)

    def _shutdown_async(self):
        if self._executor is not None:
            self._executor.shutdown(wait=False, cancel_futures=True)
            self._executor = None

    def _submit_generation(self):
        """Submit the next background generation job."""
        batch = next(self._prompt_iter)
        future = self._executor.submit(self._generate_only, batch)
        self._async_queue.put(future)

    # ------------------------------------------------------------------
    # Weight sync
    # ------------------------------------------------------------------

    def _sync_peft_weights_no_merge(self):
        """Thread-safe weight sync: compute merged LoRA weights without in-place modification.

        Required for FP8 models where merge_adapter() fails (addmm not implemented
        for Float8), and also safe for concurrent use since it never modifies base
        weights in-place.
        """
        model = self.vllm_generation.model
        accelerator = self.vllm_generation.accelerator
        vllm_client = self.vllm_generation.vllm_client
        fix_name = self.vllm_generation._fix_param_name_to_vllm

        if not (self.vllm_generation.mode == "server" and accelerator.is_main_process):
            return

        # Build lookup: module_path -> (A, B, scaling) for all active LoRA layers
        lora_info = {}
        for mod_name, module in model.base_model.model.named_modules():
            if not hasattr(module, "lora_A") or not hasattr(module, "active_adapters"):
                continue
            active = module.active_adapters[0]
            if active not in module.lora_A:
                continue
            lora_info[mod_name] = (
                module.lora_A[active].weight.data,
                module.lora_B[active].weight.data,
                module.scaling[active],
            )

        # Build lookup for FP8 scale_inv parameters (needed for dequantization)
        scale_inv_lookup = {}
        for pname, pparam in model.named_parameters():
            if "weight_scale_inv" in pname:
                # Map weight name -> scale_inv tensor
                weight_name = pname.replace(".weight_scale_inv", ".weight")
                scale_inv_lookup[weight_name] = pparam.data

        # Iterate all parameters, computing merged weights for LoRA layers.
        # Skip LoRA-specific params and FP8 scale params (scales will be
        # recomputed by vLLM when it receives the merged bf16 weight).
        params_to_sync = []
        for name, param in model.named_parameters():
            vllm_name = name.removeprefix("base_model.model.").replace(
                ".base_layer", ""
            )
            if model.prefix in vllm_name:
                continue
            if "original_module" in vllm_name:
                continue
            # Skip FP8 quantization scale parameters - they are recomputed
            # on the vLLM side when we update the weight itself
            if "weight_scale_inv" in vllm_name or "input_scale" in vllm_name:
                continue
            vllm_name = fix_name(vllm_name, extra_prefixes=["modules_to_save.default."])

            data = param.data
            compute_dtype = torch.bfloat16

            if vllm_name.endswith(".weight"):
                # Dequantize FP8 weights before merging
                if data.dtype == torch.float8_e4m3fn and name in scale_inv_lookup:
                    scale_inv = scale_inv_lookup[name]
                    # Block dequantization: weight * scale_inv (with broadcasting)
                    fp8_bf16 = data.to(compute_dtype)
                    if scale_inv.dim() == 2 and fp8_bf16.dim() == 2:
                        # Block-quantized: scale_inv shape (rows/block, cols/block)
                        sr, sc = scale_inv.shape
                        br = fp8_bf16.shape[0] // sr  # block height
                        bc = fp8_bf16.shape[1] // sc  # block width
                        # Reshape → multiply by block scale → reshape back
                        data = (
                            fp8_bf16.reshape(sr, br, sc, bc)
                            * scale_inv[:, None, :, None].to(compute_dtype)
                        ).reshape(fp8_bf16.shape)
                    elif scale_inv.dim() <= 1:
                        # Per-tensor or per-channel scale
                        data = fp8_bf16 * scale_inv.to(compute_dtype)
                    else:
                        data = fp8_bf16
                elif data.dtype == torch.float8_e4m3fn:
                    # FP8 but no scale found - just cast (lossy)
                    data = data.to(compute_dtype)

                mod_path = vllm_name[: -len(".weight")]
                if mod_path in lora_info:
                    A, B, s = lora_info[mod_path]
                    merged = data.to(compute_dtype) + s * (
                        B.to(compute_dtype) @ A.to(compute_dtype)
                    )
                    data = merged

            params_to_sync.append((vllm_name, data))

        # Batch sync all params in one HTTP+NCCL call (vs individual calls)
        if params_to_sync:
            vllm_client.batch_update_named_params(params_to_sync)

        # Reset prefix cache after weight update
        vllm_client.reset_prefix_cache()

    def _sync_lora_adapter(self):
        """Sync LoRA adapter to vLLM via filesystem (native LoRA mode).

        Saves the PEFT adapter to a temp directory and POSTs the path to vLLM's
        /set_lora_adapter/ endpoint. vLLM loads the adapter natively using Punica
        kernels, avoiding the need to merge weights and NCCL-broadcast the full model.

        Syncs only the LoRA adapter weights via filesystem instead of the full merged model via NCCL.

        FSDP/DeepSpeed: All ranks must participate in the state_dict gather.
        accelerator.get_state_dict() handles this (FSDP uses FullStateDictConfig
        with rank0_only=True). Only rank 0 gets the full dict, writes files, and
        does the HTTP POST.
        """
        import os
        import tempfile

        accelerator = self.vllm_generation.accelerator
        model = self.vllm_generation.model

        if self.vllm_generation.mode != "server":
            return

        is_main = accelerator.is_main_process

        # Increment adapter version (all ranks, kept in sync)
        if not hasattr(self, "_lora_sync_version"):
            self._lora_sync_version = 0
            if is_main:
                self._lora_sync_dir = tempfile.mkdtemp(prefix="lora_sync_")
            else:
                self._lora_sync_dir = None
            # Broadcast sync dir from rank 0 to all ranks
            if accelerator.num_processes > 1:
                import torch.distributed as dist

                if dist.is_initialized():
                    obj_list = [self._lora_sync_dir]
                    dist.broadcast_object_list(obj_list, src=0)
                    self._lora_sync_dir = obj_list[0]
        self._lora_sync_version += 1

        adapter_path = os.path.join(self._lora_sync_dir, f"v{self._lora_sync_version}")

        # Gather state dict from all ranks (FSDP/DeepSpeed gather, rank0_only)
        # All ranks must participate even though only rank 0 gets the result.
        # Use self.model_wrapped (the DeepSpeed/FSDP engine) for get_state_dict,
        # since it has the necessary hooks (e.g. zero_gather_16bit_weights_on_model_save).
        # self.vllm_generation.model is the unwrapped PEFT model which lacks these.
        wrapped_model = getattr(self, "model_wrapped", model)
        state_dict = accelerator.get_state_dict(wrapped_model)

        if is_main:
            # Unwrap to access PEFT's save_pretrained
            unwrapped = accelerator.unwrap_model(model)
            unwrapped.save_pretrained(adapter_path, state_dict=state_dict)

            import requests

            vllm_client = self.vllm_generation.vllm_client
            url = f"{vllm_client.base_url}/set_lora_adapter/"
            response = requests.post(
                url,
                json={
                    "lora_name": "active_lora",
                    "lora_int_id": self._lora_sync_version,
                    "lora_path": adapter_path,
                },
                timeout=30,
            )
            if response.status_code != 200:
                logger.warning(
                    "Failed to set LoRA adapter: %s %s",
                    response.status_code,
                    response.text,
                )
                return

            # Reset prefix cache after adapter update
            vllm_client.reset_prefix_cache()

            # Clean up old adapter versions (keep only current)
            if self._lora_sync_version > 1:
                old_path = os.path.join(
                    self._lora_sync_dir, f"v{self._lora_sync_version - 1}"
                )
                if os.path.exists(old_path):
                    import shutil

                    shutil.rmtree(old_path, ignore_errors=True)

            logger.info(
                "Synced LoRA adapter v%d to vLLM (%s)",
                self._lora_sync_version,
                adapter_path,
            )

        # Barrier to ensure all ranks complete before resuming forward passes.
        # Without this, rank 1 may start a forward pass (triggering FSDP unshard)
        # while rank 0 is still doing save_pretrained, causing FSDP all-gather deadlock.
        if accelerator.num_processes > 1:
            import torch.distributed as dist

            if dist.is_initialized():
                dist.barrier()

    def _maybe_sync_vllm_weights(self):
        """Sync model weights to vLLM if the interval has elapsed.

        Dispatches to one of three strategies:
        - vllm_lora_sync: saves adapter to filesystem, vLLM loads natively
        - PEFT no-merge: computes merged weights as new tensors, NCCL broadcast
        - Non-PEFT: stock sync_weights via merge_adapter + NCCL
        """
        if not (self.use_vllm and self.args.async_prefetch):
            return
        step = self.state.global_step
        interval = self.args.vllm_sync_interval
        if step != self._last_synced_step and step % interval == 0:
            if getattr(self.args, "vllm_lora_sync", False):
                if step == 0:
                    logger.info("Skipping LoRA sync at step 0 (no training yet)")
                    self._last_synced_step = step
                    return
                # Native LoRA sync: save adapter to filesystem, vLLM loads it directly
                self._sync_lora_adapter()
            else:
                from accelerate.utils import is_peft_model

                use_no_merge = is_peft_model(self.vllm_generation.model)

                if use_no_merge:
                    # No-merge sync: computes merged weights as new tensors
                    # (doesn't modify base weights in-place), so it's safe to
                    # run concurrently with BG generation — no lock needed.
                    self._sync_peft_weights_no_merge()
                else:
                    # Non-PEFT: use stock sync (acquires lock to avoid overlap)
                    if self.data_producer is not None and hasattr(
                        self.data_producer, "_generate_lock"
                    ):
                        with self.data_producer._generate_lock:
                            self.vllm_generation.sync_weights()
                    elif self._async_queue is not None:
                        pending = list(self._async_queue.queue)
                        for f in pending:
                            if isinstance(f, concurrent.futures.Future):
                                f.result()
                        self.vllm_generation.sync_weights()
                    else:
                        self.vllm_generation.sync_weights()
            self._last_synced_step = step

    def _zero_pad_embedding_for_fp8(self):
        """Zero out the pad token embedding for FP8 models.

        FP8 linear layers produce NaN when processing positions with
        attention_mask=0 (the hidden states at those positions have
        unconstrained values that overflow FP8 range during
        quantization). By setting the pad token embedding to zeros,
        padding positions start with zero hidden states and stay zero
        through masked attention, preventing NaN from FP8 matmul.
        """
        model = self.accelerator.unwrap_model(self.model)
        # Check if model has FP8 weights
        has_fp8 = any(
            p.dtype == torch.float8_e4m3fn
            for p in model.parameters()
            if not p.requires_grad
        )
        if not has_fp8:
            return

        # Find the embedding layer
        if hasattr(model, "model") and hasattr(model.model, "embed_tokens"):
            embed = model.model.embed_tokens
        elif hasattr(model, "base_model") and hasattr(model.base_model, "model"):
            m = model.base_model.model
            if hasattr(m, "model") and hasattr(m.model, "embed_tokens"):
                embed = m.model.embed_tokens
            else:
                return
        else:
            return

        pad_id = self.processing_class.pad_token_id
        if pad_id is not None and pad_id < embed.weight.shape[0]:
            with torch.no_grad():
                embed.weight.data[pad_id].zero_()
            import logging

            logging.getLogger("async_grpo").info(
                f"Zeroed pad token embedding (id={pad_id}) for FP8 NaN prevention"
            )

    # ------------------------------------------------------------------
    # Background-thread generation (no scoring)
    # ------------------------------------------------------------------

    def _generate_single_turn(self, prompts, **kwargs):
        """Override to prevent weight sync from background thread and to use
        no-merge sync for PEFT models (FP8 models can't merge_adapter)."""
        is_bg = threading.current_thread() is not threading.main_thread()
        saved_step = None

        if is_bg and self.use_vllm:
            # Trick: match _last_loaded_step so the stock sync check is a no-op
            saved_step = getattr(self, "_last_loaded_step", None)
            self._last_loaded_step = self.state.global_step

        # Permanently replace vllm_generation.sync_weights with our custom
        # sync to avoid merge_adapter (fails on FP8 / races with training).
        # For LoRA sync mode, make it a no-op here since _maybe_sync_vllm_weights
        # handles the sync with proper interval tracking.
        if not getattr(self, "_patched_sync_weights", False):
            if self.use_vllm and hasattr(self, "vllm_generation"):
                if getattr(self.args, "vllm_lora_sync", False):
                    # No-op: LoRA sync is driven by _maybe_sync_vllm_weights
                    self.vllm_generation.sync_weights = lambda: None
                    self._patched_sync_weights = True
                else:
                    from accelerate.utils import is_peft_model

                    if is_peft_model(self.vllm_generation.model):

                        def _no_merge_sync():
                            self._sync_peft_weights_no_merge()

                        self.vllm_generation.sync_weights = _no_merge_sync
                        self._patched_sync_weights = True

        try:
            return super()._generate_single_turn(prompts, **kwargs)
        finally:
            if saved_step is not None:
                self._last_loaded_step = saved_step

    def _generate_rank0_only(self, prompts):
        """Generate using vLLM directly on rank 0 without cross-rank collectives.

        Called from BG thread in FSDP mode. Bypasses ``gather_object`` /
        ``broadcast_object_list`` since the main thread may be running FSDP
        collectives concurrently.

        Returns the same tuple as ``_generate``.
        """
        import copy

        prompts = copy.deepcopy(prompts)

        # Duplicate prompts for num_generations (same as TRL's gather+unique pattern)
        num_generations = self.num_generations
        unique_prompts = prompts[::num_generations]

        # Build sampling params
        vg = self.vllm_generation
        sampling_params = {
            "n": num_generations,
            "repetition_penalty": vg.repetition_penalty,
            "temperature": vg.temperature,
            "top_p": vg.top_p,
            "top_k": vg.top_k,
            "min_p": 0.0 if vg.min_p is None else vg.min_p,
            "max_tokens": vg.max_completion_length,
            "logprobs": vg.logprobs,
            "structured_outputs_regex": vg.structured_outputs_regex,
            "generation_kwargs": vg.generation_kwargs,
        }

        # Call vLLM directly (no collectives)
        from trl.data_utils import is_conversational

        if is_conversational({"prompt": unique_prompts[0]}):
            output = vg.vllm_client.chat(
                messages=unique_prompts,
                **sampling_params,
                chat_template_kwargs=vg.chat_template_kwargs,
                tools=vg.tools,
                chat_template=vg.chat_template,
            )
        else:
            output = vg.vllm_client.generate(prompts=unique_prompts, **sampling_params)

        # vLLM returns 1 prompt_ids per unique prompt, but num_generations completion_ids.
        # Duplicate prompt_ids to match completions (one per generation).
        raw_prompt_ids = output["prompt_ids"]
        prompt_ids = [pid for pid in raw_prompt_ids for _ in range(num_generations)]
        completion_ids = output["completion_ids"]
        logprobs_raw = output["logprobs"]
        extra_fields = {
            k: v
            for k, v in output.items()
            if k
            not in {"prompt_ids", "completion_ids", "logprobs", "logprob_token_ids"}
        }

        # Extract top-1 logprob per token
        logprobs = [[lp[0] for lp in seq] for seq in logprobs_raw]

        # Decode completions
        if is_conversational({"prompt": prompts[0]}):
            contents = self.processing_class.batch_decode(
                completion_ids, skip_special_tokens=True
            )
            completions = [[{"role": "assistant", "content": c}] for c in contents]
        else:
            completions = self.processing_class.batch_decode(
                completion_ids, skip_special_tokens=True
            )

        tool_mask = extra_fields.pop("env_mask", None)

        # Compute total completion tokens locally (no gather)
        total_completion_tokens = sum(len(ids) for ids in completion_ids)

        return (
            prompt_ids,
            completion_ids,
            tool_mask,
            completions,
            total_completion_tokens,
            logprobs,
            extra_fields,
        )

    def _generate_only(self, inputs, rank0_only=False):
        """Generate completions without scoring.  Runs on background thread.

        Mirrors the first half of ``_generate_and_score_completions`` (prompt
        extraction → vLLM generation → tensor padding) and returns a deferred
        output dict for main-thread scoring.

        When ``rank0_only=True`` (FSDP mode), bypasses ``gather_object`` /
        ``broadcast_object_list`` collectives and calls vLLM directly on rank 0.
        Results are broadcast to other ranks on the main thread later.

        Args:
            inputs: list of dicts (one per sample), as yielded by the DataLoader
                    with ``identity`` collate_fn.
        """
        device = self.accelerator.device

        prompts = [x["prompt"] for x in inputs]

        # --- Handle images (multimodal) ---
        if "images" in inputs[0]:
            images = [ex.get("images") for ex in inputs]
        elif "image" in inputs[0]:
            images = [
                [ex.get("image")] if ex.get("image") is not None else None
                for ex in inputs
            ]
        else:
            images = None
        if images is not None and all(img == [] for img in images):
            images = None

        if images is not None:
            if not is_conversational(inputs[0]):
                raise ValueError("Multimodal training requires conversational prompts.")
            prompts = [
                prepare_multimodal_messages(p, il)
                for p, il in zip(prompts, images, strict=True)
            ]

        # --- Generate completions ---
        if rank0_only:
            # FSDP mode: call vLLM directly without cross-rank collectives
            (
                prompt_ids_list,
                completion_ids_list,
                tool_mask_list,
                completions,
                num_items_in_batch,
                sampling_per_token_logps_list,
                extra_fields,
            ) = self._generate_rank0_only(prompts)
        else:
            (
                prompt_ids_list,
                completion_ids_list,
                tool_mask_list,
                completions,
                num_items_in_batch,
                sampling_per_token_logps_list,
                extra_fields,
            ) = self._generate(prompts)
            # _generate gathers prompts from all ranks internally. Gather inputs
            # to match the full-batch output size.
            if self.accelerator.num_processes > 1:
                from accelerate.utils import gather_object

                inputs = gather_object(inputs)
                prompts = [x["prompt"] for x in inputs]

        # --- Pad to tensors ---
        prompt_ids = [torch.tensor(ids, device=device) for ids in prompt_ids_list]
        prompt_mask = [torch.ones_like(ids, dtype=torch.long) for ids in prompt_ids]
        prompt_ids = pad(
            prompt_ids, padding_value=self.pad_token_id, padding_side="left"
        )
        prompt_mask = pad(prompt_mask, padding_value=0, padding_side="left")

        completion_ids = [
            torch.tensor(ids, device=device) for ids in completion_ids_list
        ]
        completion_mask = [
            torch.ones_like(ids, dtype=torch.long) for ids in completion_ids
        ]
        completion_ids = pad(
            completion_ids, padding_value=self.pad_token_id, padding_side="right"
        )
        completion_mask = pad(completion_mask, padding_value=0, padding_side="right")

        if sampling_per_token_logps_list is not None:
            sampling_logps = [
                torch.tensor(lp, device=device) for lp in sampling_per_token_logps_list
            ]
            sampling_per_token_logps = pad(
                sampling_logps, padding_value=0.0, padding_side="right"
            )
        else:
            sampling_per_token_logps = None

        if tool_mask_list is not None:
            tool_mask = [torch.tensor(m, device=device) for m in tool_mask_list]
            tool_mask = pad(tool_mask, padding_value=1, padding_side="right")
        else:
            tool_mask = None

        # --- Mask truncated completions ---
        if self.mask_truncated_completions:
            eos_and_pad = [self.eos_token_id, self.pad_token_id]
            is_trunc = torch.tensor(
                [ids[-1] not in eos_and_pad for ids in completion_ids_list],
                device=device,
            )
            completion_mask = completion_mask * (~is_trunc).unsqueeze(1).int()
            if tool_mask is not None:
                tool_mask = tool_mask * (~is_trunc).unsqueeze(1).int()

        # --- Multimodal forward kwargs ---
        num_images = [len(il) for il in images] if images is not None else None
        if images is not None:
            prompts_text = [
                apply_chat_template(
                    {"prompt": p},
                    self.processing_class,
                    tools=self.tools,
                    **self.chat_template_kwargs,
                )["prompt"]
                for p in prompts
            ]
            prompt_inputs = self.processing_class(
                images=images, text=prompts_text, padding=True, return_tensors="pt"
            )
            forward_kwargs = {
                k: v.to(device) if isinstance(v, torch.Tensor) else v
                for k, v in prompt_inputs.items()
                if k not in ("input_ids", "attention_mask")
            }
        else:
            forward_kwargs = {}

        # Extend token_type_ids / mm_token_type_ids for completion tokens
        for ttid_key in ("token_type_ids", "mm_token_type_ids"):
            if ttid_key in forward_kwargs:
                tt = forward_kwargs[ttid_key]
                forward_kwargs[ttid_key] = torch.cat(
                    [tt, tt.new_zeros(completion_ids.shape)], dim=1
                )

        # Merge extra_fields from rollout_func into inputs
        if extra_fields:
            for i, inp in enumerate(inputs):
                for key, values in extra_fields.items():
                    if isinstance(values, list) and i < len(values):
                        inp[key] = values[i]
                    elif not isinstance(values, list):
                        inp[key] = values

        # No explicit CUDA sync needed here — both threads share the
        # default stream, so operations are naturally ordered.

        # --- Construct deferred output ---
        output = {
            "prompt_ids": prompt_ids,
            "prompt_mask": prompt_mask,
            "completion_ids": completion_ids,
            "completion_mask": completion_mask,
            "num_items_in_batch": num_items_in_batch,
            "advantages": torch.zeros(completion_ids.size(0), device=device),
            # Sentinels for deferred scoring
            "_pending_policy_logps": True,
            "_deferred_inputs": inputs,
            "_deferred_prompts": prompts,
            "_deferred_completions": completions,
            "_deferred_completion_ids_list": completion_ids_list,
            "_rank0_only": rank0_only,
        }
        if sampling_per_token_logps is not None:
            output["sampling_per_token_logps"] = sampling_per_token_logps
        if tool_mask is not None:
            output["tool_mask"] = tool_mask
        if images is not None:
            output["num_images"] = num_images
        for k in (
            "pixel_values",
            "image_grid_thw",
            "pixel_attention_mask",
            "image_sizes",
            "token_type_ids",
            "mm_token_type_ids",
        ):
            if k in forward_kwargs:
                output[k] = forward_kwargs[k]
        return output

    # ------------------------------------------------------------------
    # Hooks (overridden by subclasses like FastAsyncGRPOTrainer)
    # ------------------------------------------------------------------

    def _compute_rewards_for_batch(
        self, inputs, prompts, completions, completion_ids_list
    ):
        """Compute rewards for a batch. Override for parallel workers, caching, etc."""
        return self._calculate_rewards(
            inputs, prompts, completions, completion_ids_list
        )

    def _launch_reward_workers(self, inputs, prompts, completions, completion_ids_list):
        """Launch reward computation in background. Override for parallel dispatch.

        Default: no-op (rewards computed synchronously in _collect_reward_workers).
        """
        self._pending_reward_args = (inputs, prompts, completions, completion_ids_list)

    def _collect_reward_workers(
        self, inputs, prompts, completions, completion_ids_list
    ):
        """Collect reward results. Override to collect from parallel workers.

        Default: compute rewards synchronously now.
        """
        args = getattr(self, "_pending_reward_args", None)
        if args is not None:
            self._pending_reward_args = None
            return self._compute_rewards_for_batch(*args)
        return self._compute_rewards_for_batch(
            inputs, prompts, completions, completion_ids_list
        )

    def _post_advantage_hook(
        self,
        data: dict,
        rewards_per_func,
        advantages,
        inputs: list,
        num_generations: int,
        mode: str,
        s_start: int | None = None,
        s_end: int | None = None,
        is_last_chunk: bool = True,
    ) -> None:
        """Called after advantages are computed. Override for replay buffer, re-roll, etc."""

    # ------------------------------------------------------------------
    # Main-thread scoring
    # ------------------------------------------------------------------

    @torch.no_grad()
    def _compute_deferred_scores(self, rollout: dict) -> dict:
        """Compute rewards, advantages, policy logprobs, and IS ratio on the main thread.

        Takes the deferred output from ``_generate_only`` and produces a fully
        scored dict ready for ``split_tensor_dict`` → micro-batches.
        """
        device = self.accelerator.device
        batch_size = self.args.per_device_train_batch_size
        num_generations = self.num_generations
        mode = "train"

        # --- Extract deferred data ---
        data = rollout
        inputs = data.pop("_deferred_inputs")
        prompts = data.pop("_deferred_prompts")
        completions = data.pop("_deferred_completions")
        completion_ids_list = data.pop("_deferred_completion_ids_list")
        rank0_only = data.pop("_rank0_only", False)
        del data["_pending_policy_logps"]

        prompt_ids = data["prompt_ids"]
        completion_ids = data["completion_ids"]
        prompt_mask = data["prompt_mask"]
        completion_mask = data["completion_mask"]
        prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
        logits_to_keep = completion_ids.size(1)

        # Multimodal forward kwargs
        forward_kwargs = {}
        for key in (
            "pixel_values",
            "image_grid_thw",
            "pixel_attention_mask",
            "image_sizes",
            "token_type_ids",
            "mm_token_type_ids",
        ):
            if key in data:
                forward_kwargs[key] = data[key]
        num_images = data.get("num_images")

        # --- Launch rewards in parallel with logprobs ---
        self._launch_reward_workers(inputs, prompts, completions, completion_ids_list)

        # --- Policy logprobs ---
        logprob_batch_size = min(batch_size * 4, len(prompt_ids))
        with disable_gradient_checkpointing(
            self.model, self.args.gradient_checkpointing_kwargs
        ):
            generate_every = self.args.steps_per_generation * self.num_iterations
            if self.args.gradient_accumulation_steps % generate_every != 0 or (
                self.use_vllm
                and getattr(self, "vllm_importance_sampling_correction", False)
            ):
                old_per_token_logps, _ = self._get_per_token_logps_and_entropies(
                    self.model,
                    prompt_completion_ids,
                    attention_mask,
                    logits_to_keep,
                    logprob_batch_size,
                    num_images=num_images,
                    **forward_kwargs,
                )
                data["old_per_token_logps"] = old_per_token_logps
            else:
                old_per_token_logps = None

            # Reference model logprobs
            if self.beta != 0.0:
                if self.ref_model is not None:
                    ref_logps, _ = self._get_per_token_logps_and_entropies(
                        self.ref_model,
                        prompt_completion_ids,
                        attention_mask,
                        logits_to_keep,
                        batch_size,
                        num_images=num_images,
                        **forward_kwargs,
                    )
                else:
                    unwrapped = self.accelerator.unwrap_model(self.model)
                    adapter_name = (
                        "ref"
                        if hasattr(unwrapped, "peft_config")
                        and "ref" in unwrapped.peft_config
                        else None
                    )
                    with use_adapter(unwrapped, adapter_name=adapter_name):
                        ref_logps, _ = self._get_per_token_logps_and_entropies(
                            self.model,
                            prompt_completion_ids,
                            attention_mask,
                            logits_to_keep,
                            batch_size,
                            num_images=num_images,
                            **forward_kwargs,
                        )
                data["ref_per_token_logps"] = ref_logps

        # --- IS ratio ---
        if (
            self.use_vllm
            and getattr(self, "vllm_importance_sampling_correction", False)
            and old_per_token_logps is not None
            and "sampling_per_token_logps" in data
        ):
            sampling_logps = data["sampling_per_token_logps"]
            is_mask = (
                completion_mask
                if "tool_mask" not in data
                else completion_mask * data["tool_mask"]
            )
            per_token_logps_diff = (old_per_token_logps - sampling_logps) * is_mask

            is_mode = getattr(self, "vllm_importance_sampling_mode", "token_truncate")
            is_cap = getattr(self, "vllm_importance_sampling_cap", 3.0)
            sequence_level_is = is_mode in ("sequence_mask", "sequence_truncate")
            if sequence_level_is:
                logps_diff = per_token_logps_diff.sum(dim=-1, keepdim=True)
            else:
                logps_diff = per_token_logps_diff

            is_ratio = torch.exp(logps_diff)
            if is_mode in ("sequence_truncate", "token_truncate"):
                is_ratio = torch.clamp(is_ratio, max=is_cap)
            elif is_mode in ("sequence_mask", "token_mask"):
                is_ratio = is_ratio.masked_fill(is_ratio > is_cap, value=0.0)
            data["importance_sampling_ratio"] = is_ratio

        # --- Collect rewards (launched before logprobs, should be done) ---
        rewards_per_func = self._collect_reward_workers(
            inputs, prompts, completions, completion_ids_list
        )
        # In rank0_only mode, all ranks compute the same rewards on identical data.
        # _calculate_rewards / _collect_reward_workers always `gather()` across ranks,
        # which duplicates the rows (N_local * num_processes).  De-duplicate so that
        # rewards_per_func matches the data dict (which has N_local rows).
        if rank0_only and rewards_per_func.size(0) > len(prompts):
            rewards_per_func = rewards_per_func[: len(prompts)]

        # --- Advantages ---
        if self.multi_objective_aggregation == "sum_then_normalize":
            rewards = (
                rewards_per_func * self.reward_weights.to(device).unsqueeze(0)
            ).nansum(dim=1)
            mean_grouped = (
                rewards.view(-1, num_generations)
                .mean(dim=1)
                .repeat_interleave(num_generations)
            )
            if self.scale_rewards in ("group", "none"):
                if num_generations > 1:
                    std_rewards = (
                        rewards.view(-1, num_generations)
                        .std(dim=1)
                        .repeat_interleave(num_generations)
                    )
                else:
                    std_rewards = torch.zeros_like(rewards)
            elif self.scale_rewards == "batch":
                std_rewards = (
                    rewards.std().expand_as(rewards)
                    if rewards.numel() > 1
                    else torch.zeros_like(rewards)
                )
            else:
                raise ValueError(f"Invalid scale_rewards: {self.scale_rewards}")
            advantages = rewards - mean_grouped
            if self.scale_rewards != "none":
                advantages = advantages / (std_rewards + 1e-4)
            is_std_zero = torch.isclose(std_rewards, torch.zeros_like(std_rewards))

        elif self.multi_objective_aggregation == "normalize_then_sum":
            grouped = rewards_per_func.view(-1, num_generations, len(self.reward_funcs))
            mean_k = torch.nanmean(grouped, dim=1, keepdim=True)
            std_k = (
                nanstd(grouped, dim=1, keepdim=True)
                if num_generations > 1
                else torch.zeros_like(mean_k)
            )
            reward_k = (grouped - mean_k) / (std_k + 1e-4)
            reward_k = reward_k.view(-1, len(self.reward_funcs))
            rewards = (reward_k * self.reward_weights.to(device).unsqueeze(0)).nansum(
                dim=1
            )
            std_rewards = (
                rewards.std().expand_as(rewards)
                if rewards.numel() > 1
                else torch.zeros_like(rewards)
            )
            advantages = (rewards - rewards.mean()) / (std_rewards + 1e-4)
            is_std_zero = torch.isclose(std_rewards, torch.zeros_like(std_rewards))
        else:
            raise ValueError(
                f"Invalid multi_objective_aggregation: {self.multi_objective_aggregation}"
            )

        # Slice for local process
        # In rank0_only mode, all ranks already have identical data from broadcast,
        # so no slicing needed. Otherwise, each rank takes its portion.
        if rank0_only:
            process_slice = slice(0, len(prompts))
        else:
            process_slice = slice(
                self.accelerator.process_index * len(prompts),
                (self.accelerator.process_index + 1) * len(prompts),
            )
        all_advantages = advantages.clone()
        advantages = advantages[process_slice]
        data["advantages"] = advantages

        # --- Post-advantage hook (for replay buffer, re-roll, etc.) ---
        self._post_advantage_hook(
            data,
            rewards_per_func,
            advantages,
            inputs,
            num_generations,
            mode,
        )

        # --- Metrics ---
        for i, name in enumerate(self.reward_func_names):
            self._metrics[mode][f"rewards/{name}/mean"].append(
                torch.nanmean(rewards_per_func[:, i]).item()
            )
            self._metrics[mode][f"rewards/{name}/std"].append(
                nanstd(rewards_per_func[:, i]).item()
            )
        agg_rewards = rewards_per_func.nansum(dim=1)
        self._metrics[mode]["reward"].append(agg_rewards.mean().item())
        self._metrics[mode]["reward_std"].append(agg_rewards.std().item())
        self._metrics[mode]["frac_reward_zero_std"].append(
            is_std_zero.float().mean().item()
        )

        # Token counting
        total_prompt = self.accelerator.gather(prompt_mask.sum()).sum()
        total_completion = self.accelerator.gather(completion_mask.sum()).sum()
        self.state.num_input_tokens_seen += (total_prompt + total_completion).item()
        self._metrics[mode]["num_tokens"] = [self.state.num_input_tokens_seen]

        # Completion length metrics
        comp_lengths = completion_mask.sum(dim=1)
        agg_lengths = self.accelerator.gather(comp_lengths)
        self._metrics[mode]["completions/mean_length"].append(
            agg_lengths.float().mean().item()
        )
        self._metrics[mode]["completions/min_length"].append(
            agg_lengths.float().min().item()
        )
        self._metrics[mode]["completions/max_length"].append(
            agg_lengths.float().max().item()
        )

        eos_and_pad = [self.eos_token_id, self.pad_token_id]
        is_trunc = torch.tensor(
            [ids[-1].item() not in eos_and_pad for ids in completion_ids], device=device
        )
        agg_trunc = self.accelerator.gather(is_trunc)
        self._metrics[mode]["completions/clipped_ratio"].append(
            agg_trunc.float().mean().item()
        )
        term_lengths = agg_lengths[~agg_trunc]
        if len(term_lengths) == 0:
            term_lengths = torch.zeros(1, device=device)
        self._metrics[mode]["completions/mean_terminated_length"].append(
            term_lengths.float().mean().item()
        )
        self._metrics[mode]["completions/min_terminated_length"].append(
            term_lengths.float().min().item()
        )
        self._metrics[mode]["completions/max_terminated_length"].append(
            term_lengths.float().max().item()
        )

        # IS metrics
        if "importance_sampling_ratio" in data and "sampling_per_token_logps" in data:
            old_lp = data["old_per_token_logps"]
            samp_lp = data["sampling_per_token_logps"]
            mask = completion_mask.bool()
            delta = torch.abs(old_lp - samp_lp)
            delta_m = delta[mask]
            md = (
                torch.mean(delta_m)
                if delta_m.numel() > 0
                else torch.tensor(0.0, device=device)
            )
            xd = (
                torch.max(delta_m)
                if delta_m.numel() > 0
                else torch.tensor(0.0, device=device)
            )
            self._metrics[mode]["sampling/sampling_logp_difference/mean"].append(
                self.accelerator.gather(md).mean().item()
            )
            self._metrics[mode]["sampling/sampling_logp_difference/max"].append(
                self.accelerator.gather(xd).max().item()
            )
            isr = data["importance_sampling_ratio"]
            is_mode = getattr(self, "vllm_importance_sampling_mode", "token_truncate")
            if is_mode in ("sequence_mask", "sequence_truncate"):
                flat_isr = isr.flatten()
            else:
                flat_isr = isr[mask]
            if flat_isr.numel() > 0:
                self._metrics[mode]["sampling/importance_sampling_ratio/min"].append(
                    nanmin(self.accelerator.gather(torch.min(flat_isr))).item()
                )
                self._metrics[mode]["sampling/importance_sampling_ratio/mean"].append(
                    self.accelerator.gather(torch.mean(flat_isr)).nanmean().item()
                )
                self._metrics[mode]["sampling/importance_sampling_ratio/max"].append(
                    nanmax(self.accelerator.gather(torch.max(flat_isr))).item()
                )

        # Log prompt/completion texts
        prompts_text = self.processing_class.batch_decode(
            prompt_ids, skip_special_tokens=True
        )
        completions_text = self.processing_class.batch_decode(
            completion_ids, skip_special_tokens=True
        )
        if gather_object is not None:
            self._logs["prompt"].extend(gather_object(prompts_text))
            self._logs["completion"].extend(gather_object(completions_text))
        for i, name in enumerate(self.reward_func_names):
            self._logs["rewards"][name].extend(rewards_per_func[:, i].tolist())
        self._logs["advantages"].extend(all_advantages.tolist())

        # Remove deferred keys
        for k in list(data.keys()):
            if k.startswith("_deferred") or k == "_pending_policy_logps":
                data.pop(k, None)

        return data

    @torch.no_grad()
    def _compute_streaming_group_scores(
        self,
        data,
        s_start,
        s_end,
        inputs,
        prompts,
        completions,
        completion_ids_list,
        is_last_chunk,
        rank0_only=False,
    ):
        """Score a chunk of prompt groups: rewards, policy logprobs, advantages.

        Called during streaming scoring to incrementally score groups.
        Writes results directly into ``data`` at positions ``s_start:s_end``.
        """
        device = self.accelerator.device
        batch_size = self.args.per_device_train_batch_size
        num_generations = self.num_generations
        mode = "train"
        chunk_size = s_end - s_start

        # --- Policy logprobs for this chunk ---
        chunk_prompt_ids = data["prompt_ids"][s_start:s_end]
        chunk_completion_ids = data["completion_ids"][s_start:s_end]
        chunk_prompt_mask = data["prompt_mask"][s_start:s_end]
        chunk_completion_mask = data["completion_mask"][s_start:s_end]
        prompt_completion_ids = torch.cat(
            [chunk_prompt_ids, chunk_completion_ids], dim=1
        )
        attention_mask = torch.cat([chunk_prompt_mask, chunk_completion_mask], dim=1)
        logits_to_keep = chunk_completion_ids.size(1)

        # Slice multimodal forward kwargs for this chunk
        forward_kwargs = {}
        for key in (
            "pixel_values",
            "image_grid_thw",
            "pixel_attention_mask",
            "image_sizes",
            "token_type_ids",
            "mm_token_type_ids",
        ):
            if key in data:
                val = data[key]
                if (
                    isinstance(val, torch.Tensor)
                    and val.dim() > 0
                    and val.size(0) == len(data["prompt_ids"])
                ):
                    forward_kwargs[key] = val[s_start:s_end]
                else:
                    forward_kwargs[key] = val
        num_images = data.get("num_images")
        if (
            num_images is not None
            and hasattr(num_images, "__getitem__")
            and len(num_images) == len(data["prompt_ids"])
        ):
            num_images = num_images[s_start:s_end]

        # --- Launch rewards in parallel with logprobs ---
        self._launch_reward_workers(inputs, prompts, completions, completion_ids_list)

        # --- Policy logprobs for this chunk (GPU, overlaps with BG rewards) ---
        logprob_batch_size = min(batch_size * 2, chunk_size)
        with disable_gradient_checkpointing(
            self.model, self.args.gradient_checkpointing_kwargs
        ):
            generate_every = self.args.steps_per_generation * self.num_iterations
            if self.args.gradient_accumulation_steps % generate_every != 0 or (
                self.use_vllm
                and getattr(self, "vllm_importance_sampling_correction", False)
            ):
                old_logps, _ = self._get_per_token_logps_and_entropies(
                    self.model,
                    prompt_completion_ids,
                    attention_mask,
                    logits_to_keep,
                    logprob_batch_size,
                    num_images=num_images,
                    **forward_kwargs,
                )
                if "old_per_token_logps" not in data:
                    total = len(data["prompt_ids"])
                    data["old_per_token_logps"] = torch.zeros(
                        total, old_logps.size(1), device=device, dtype=old_logps.dtype
                    )
                data["old_per_token_logps"][s_start:s_end] = old_logps

                # Compute IS ratio for this chunk
                if "sampling_per_token_logps" in data:
                    samp_chunk = data["sampling_per_token_logps"][s_start:s_end]
                    is_mask = (
                        chunk_completion_mask
                        if "tool_mask" not in data
                        else (chunk_completion_mask * data["tool_mask"][s_start:s_end])
                    )
                    diff = (old_logps - samp_chunk) * is_mask
                    is_mode = getattr(
                        self, "vllm_importance_sampling_mode", "token_truncate"
                    )
                    is_cap = getattr(self, "vllm_importance_sampling_cap", 3.0)
                    seq_is = is_mode in ("sequence_mask", "sequence_truncate")
                    logps_diff = diff.sum(dim=-1, keepdim=True) if seq_is else diff
                    is_ratio = torch.exp(logps_diff)
                    if is_mode in ("sequence_truncate", "token_truncate"):
                        is_ratio = torch.clamp(is_ratio, max=is_cap)
                    elif is_mode in ("sequence_mask", "token_mask"):
                        is_ratio = is_ratio.masked_fill(is_ratio > is_cap, value=0.0)
                    if "importance_sampling_ratio" not in data:
                        total = len(data["prompt_ids"])
                        shape = (total, 1) if seq_is else (total, is_ratio.size(1))
                        data["importance_sampling_ratio"] = torch.ones(
                            *shape, device=device, dtype=is_ratio.dtype
                        )
                    data["importance_sampling_ratio"][s_start:s_end] = is_ratio

            # Reference logprobs
            if self.beta != 0.0:
                if self.ref_model is not None:
                    ref_logps, _ = self._get_per_token_logps_and_entropies(
                        self.ref_model,
                        prompt_completion_ids,
                        attention_mask,
                        logits_to_keep,
                        batch_size,
                        num_images=num_images,
                        **forward_kwargs,
                    )
                else:
                    unwrapped = self.accelerator.unwrap_model(self.model)
                    adapter_name = (
                        "ref"
                        if hasattr(unwrapped, "peft_config")
                        and "ref" in unwrapped.peft_config
                        else None
                    )
                    with use_adapter(unwrapped, adapter_name=adapter_name):
                        ref_logps, _ = self._get_per_token_logps_and_entropies(
                            self.model,
                            prompt_completion_ids,
                            attention_mask,
                            logits_to_keep,
                            batch_size,
                            num_images=num_images,
                            **forward_kwargs,
                        )
                if "ref_per_token_logps" not in data:
                    total = len(data["prompt_ids"])
                    data["ref_per_token_logps"] = torch.zeros(
                        total, ref_logps.size(1), device=device, dtype=ref_logps.dtype
                    )
                data["ref_per_token_logps"][s_start:s_end] = ref_logps

        # --- Collect rewards (should already be done, ran in parallel with logprobs) ---
        rewards_per_func = self._collect_reward_workers(
            inputs, prompts, completions, completion_ids_list
        )
        # De-duplicate gathered rewards when all ranks computed the same data.
        # _calculate_rewards always gather()s, which duplicates rows in rank0_only mode.
        if rewards_per_func.size(0) > chunk_size:
            rewards_per_func = rewards_per_func[:chunk_size]

        # --- Advantages (group-level normalization) ---
        if self.multi_objective_aggregation == "sum_then_normalize":
            rewards = (
                rewards_per_func * self.reward_weights.to(device).unsqueeze(0)
            ).nansum(dim=1)
            mean_g = (
                rewards.view(-1, num_generations)
                .mean(dim=1)
                .repeat_interleave(num_generations)
            )
            if num_generations > 1:
                std_r = (
                    rewards.view(-1, num_generations)
                    .std(dim=1)
                    .repeat_interleave(num_generations)
                )
            else:
                std_r = torch.zeros_like(rewards)
            advantages = rewards - mean_g
            if self.scale_rewards != "none":
                advantages = advantages / (std_r + 1e-4)
            is_std_zero = torch.isclose(std_r, torch.zeros_like(std_r))

        elif self.multi_objective_aggregation == "normalize_then_sum":
            grouped = rewards_per_func.view(-1, num_generations, len(self.reward_funcs))
            mean_k = torch.nanmean(grouped, dim=1, keepdim=True)
            std_k = (
                nanstd(grouped, dim=1, keepdim=True)
                if num_generations > 1
                else torch.zeros_like(mean_k)
            )
            reward_k = ((grouped - mean_k) / (std_k + 1e-4)).view(
                -1, len(self.reward_funcs)
            )
            rewards = (reward_k * self.reward_weights.to(device).unsqueeze(0)).nansum(
                dim=1
            )
            std_r = (
                rewards.view(-1, num_generations)
                .std(dim=1)
                .repeat_interleave(num_generations)
            )
            mean_r = (
                rewards.view(-1, num_generations)
                .mean(dim=1)
                .repeat_interleave(num_generations)
            )
            advantages = (rewards - mean_r) / (std_r + 1e-4)
            is_std_zero = torch.isclose(std_r, torch.zeros_like(std_r))
        else:
            raise ValueError(
                f"Invalid multi_objective_aggregation: {self.multi_objective_aggregation}"
            )

        if rank0_only:
            process_slice = slice(0, len(prompts))
        else:
            process_slice = slice(
                self.accelerator.process_index * len(prompts),
                (self.accelerator.process_index + 1) * len(prompts),
            )
        advantages = advantages[process_slice]

        if "advantages" not in data or not isinstance(data["advantages"], torch.Tensor):
            data["advantages"] = torch.zeros(len(data["prompt_ids"]), device=device)
        data["advantages"][s_start:s_end] = advantages

        # --- Post-advantage hook (for replay buffer, re-roll, etc.) ---
        self._post_advantage_hook(
            data,
            rewards_per_func,
            advantages,
            inputs,
            num_generations,
            mode,
            s_start=s_start,
            s_end=s_end,
            is_last_chunk=is_last_chunk,
        )

        # --- Chunk metrics ---
        for i, name in enumerate(self.reward_func_names):
            self._metrics[mode][f"rewards/{name}/mean"].append(
                torch.nanmean(rewards_per_func[:, i]).item()
            )
            self._metrics[mode][f"rewards/{name}/std"].append(
                nanstd(rewards_per_func[:, i]).item()
            )
        agg_rewards = rewards_per_func.nansum(dim=1)
        self._metrics[mode]["reward"].append(agg_rewards.mean().item())
        self._metrics[mode]["reward_std"].append(agg_rewards.std().item())
        self._metrics[mode]["frac_reward_zero_std"].append(
            is_std_zero.float().mean().item()
        )

        # --- Full-batch metrics on last chunk ---
        if is_last_chunk:
            all_prompt_mask = data["prompt_mask"]
            all_completion_mask = data["completion_mask"]
            all_completion_ids = data["completion_ids"]
            total_p = self.accelerator.gather(all_prompt_mask.sum()).sum()
            total_c = self.accelerator.gather(all_completion_mask.sum()).sum()
            self.state.num_input_tokens_seen += (total_p + total_c).item()
            self._metrics[mode]["num_tokens"] = [self.state.num_input_tokens_seen]

            comp_lengths = all_completion_mask.sum(dim=1)
            agg_lengths = self.accelerator.gather(comp_lengths)
            self._metrics[mode]["completions/mean_length"].append(
                agg_lengths.float().mean().item()
            )
            self._metrics[mode]["completions/min_length"].append(
                agg_lengths.float().min().item()
            )
            self._metrics[mode]["completions/max_length"].append(
                agg_lengths.float().max().item()
            )

            eos_and_pad = [self.eos_token_id, self.pad_token_id]
            is_trunc = torch.tensor(
                [ids[-1].item() not in eos_and_pad for ids in all_completion_ids],
                device=device,
            )
            agg_trunc = self.accelerator.gather(is_trunc)
            self._metrics[mode]["completions/clipped_ratio"].append(
                agg_trunc.float().mean().item()
            )
            term = agg_lengths[~agg_trunc]
            if len(term) == 0:
                term = torch.zeros(1, device=device)
            self._metrics[mode]["completions/mean_terminated_length"].append(
                term.float().mean().item()
            )
            self._metrics[mode]["completions/min_terminated_length"].append(
                term.float().min().item()
            )
            self._metrics[mode]["completions/max_terminated_length"].append(
                term.float().max().item()
            )

            # IS metrics
            if (
                self.use_vllm
                and getattr(self, "vllm_importance_sampling_correction", False)
                and "sampling_per_token_logps" in data
                and "old_per_token_logps" in data
            ):
                old_lp = data["old_per_token_logps"]
                samp_lp = data["sampling_per_token_logps"]
                mask = all_completion_mask.bool()
                delta = torch.abs(old_lp - samp_lp)[mask]
                md = (
                    torch.mean(delta)
                    if delta.numel() > 0
                    else torch.tensor(0.0, device=device)
                )
                xd = (
                    torch.max(delta)
                    if delta.numel() > 0
                    else torch.tensor(0.0, device=device)
                )
                self._metrics[mode]["sampling/sampling_logp_difference/mean"].append(
                    self.accelerator.gather(md).mean().item()
                )
                self._metrics[mode]["sampling/sampling_logp_difference/max"].append(
                    self.accelerator.gather(xd).max().item()
                )
                is_mode = getattr(
                    self, "vllm_importance_sampling_mode", "token_truncate"
                )
                isr = data["importance_sampling_ratio"]
                flat = (
                    isr.flatten()
                    if is_mode in ("sequence_mask", "sequence_truncate")
                    else isr[mask]
                )
                if flat.numel() > 0:
                    self._metrics[mode][
                        "sampling/importance_sampling_ratio/min"
                    ].append(nanmin(self.accelerator.gather(torch.min(flat))).item())
                    self._metrics[mode][
                        "sampling/importance_sampling_ratio/mean"
                    ].append(self.accelerator.gather(torch.mean(flat)).nanmean().item())
                    self._metrics[mode][
                        "sampling/importance_sampling_ratio/max"
                    ].append(nanmax(self.accelerator.gather(torch.max(flat))).item())

    def _score_streaming(self, rollout: dict) -> list[dict]:
        """Score a rollout using streaming group scoring.  Returns list of micro-batches."""
        data = rollout
        num_gen = self.num_generations
        n_groups = len(data["prompt_ids"]) // num_gen
        batch_size = self.args.per_device_train_batch_size
        min_groups = max(1, self.args.streaming_min_groups)

        # Extract deferred data
        inputs = data.pop("_deferred_inputs")
        prompts = data.pop("_deferred_prompts")
        completions = data.pop("_deferred_completions")
        completion_ids_list = data.pop("_deferred_completion_ids_list")
        rank0_only = data.pop("_rank0_only", False)
        del data["_pending_policy_logps"]

        all_micro_batches = []
        shared_keys = {"num_items_in_batch"}

        for chunk_start_g in range(0, n_groups, min_groups):
            chunk_end_g = min(chunk_start_g + min_groups, n_groups)
            s_start = chunk_start_g * num_gen
            s_end = chunk_end_g * num_gen

            self._compute_streaming_group_scores(
                data=data,
                s_start=s_start,
                s_end=s_end,
                inputs=inputs[s_start:s_end],
                prompts=prompts[s_start:s_end],
                completions=completions[s_start:s_end],
                completion_ids_list=completion_ids_list[s_start:s_end],
                is_last_chunk=(chunk_end_g == n_groups),
                rank0_only=rank0_only,
            )

            # Yield micro-batches from this scored chunk
            chunk_size = s_end - s_start
            perm = torch.randperm(chunk_size)
            for mb_off in range(0, chunk_size, batch_size):
                mb_idx = perm[mb_off : mb_off + batch_size]
                abs_idx = mb_idx + s_start
                mb = {}
                for key in data:
                    if key.startswith("_"):
                        continue
                    val = data[key]
                    if key in shared_keys:
                        mb[key] = val
                    elif isinstance(val, torch.Tensor) and val.dim() > 0:
                        mb[key] = val[abs_idx]
                    else:
                        mb[key] = val
                all_micro_batches.append(mb)

        # Repeat for num_iterations
        return all_micro_batches * self.num_iterations

    # ------------------------------------------------------------------
    # _prepare_inputs override
    # ------------------------------------------------------------------

    def _prepare_inputs(self, generation_batch):
        """Override to support data producer and async prefetch paths."""
        mode = "train" if self.model.training else "eval"

        # --- Data producer path ---
        if mode == "train" and self.data_producer is not None:
            return self._prepare_inputs_data_producer(generation_batch)

        # --- Legacy async prefetch path (no data producer) ---
        if mode == "train" and self.args.async_prefetch:
            return self._prepare_inputs_legacy_async(generation_batch)

        # --- Stock path ---
        return super()._prepare_inputs(generation_batch)

    def _prepare_inputs_data_producer(self, generation_batch):
        """Data producer path: produce rollout, score deferred logps, split into micro-batches."""
        # Return from buffer if available
        if self._buffered_inputs:
            return self._buffered_inputs.pop(0)

        # Produce a new rollout
        self._maybe_sync_vllm_weights()

        rollout_dataset = self.data_producer.produce(
            self.model,
            self.state.global_step,
            processing_class=self.processing_class,
            accelerator=self.accelerator,
            args=self.args,
        )

        # Convert RolloutDataset back to a dict for scoring/splitting
        rollout = rollout_dataset._data

        # If async (skip_policy_logps=True), score deferred logps on main thread
        if rollout.get("_pending_policy_logps"):
            if self.args.streaming_partial_batch:
                micro_batches = self._score_streaming(rollout)
            else:
                scored = self._compute_deferred_scores(rollout)
                scored = split_pixel_values_by_grid(scored)
                scored = shuffle_sequence_dict(scored)
                batches = split_tensor_dict(scored, self.args.steps_per_generation)
                micro_batches = [unsplit_pixel_values_by_grid(b) for b in batches]
                micro_batches = micro_batches * self.num_iterations
        else:
            # Sync path: data is already fully scored
            rollout = split_pixel_values_by_grid(rollout)
            batches = split_tensor_dict(rollout, self.args.steps_per_generation)
            micro_batches = [unsplit_pixel_values_by_grid(b) for b in batches]
            micro_batches = micro_batches * self.num_iterations

        self._buffered_inputs = micro_batches[1:]
        return micro_batches[0]

    def _prepare_inputs_legacy_async(self, generation_batch):
        """Legacy async path: direct queue-based prefetch without data producer."""
        # Return from buffer if available
        if self._buffered_inputs:
            return self._buffered_inputs.pop(0)

        # Need a new rollout
        self._maybe_sync_vllm_weights()
        future = self._async_queue.get()
        rollout = future.result()
        self._submit_generation()

        if self.args.streaming_partial_batch:
            micro_batches = self._score_streaming(rollout)
        else:
            scored = self._compute_deferred_scores(rollout)
            scored = split_pixel_values_by_grid(scored)
            scored = shuffle_sequence_dict(scored)
            batches = split_tensor_dict(scored, self.args.steps_per_generation)
            micro_batches = [unsplit_pixel_values_by_grid(b) for b in batches]
            micro_batches = micro_batches * self.num_iterations

        self._buffered_inputs = micro_batches[1:]

        # Release cached CUDA memory from scoring
        # before training allocations begin, reducing peak reserved memory.
        torch.cuda.empty_cache()

        return micro_batches[0]

    @profiling_decorator
    def _get_per_token_logps_and_entropies(
        self,
        model,
        input_ids,
        attention_mask,
        logits_to_keep,
        batch_size=None,
        compute_entropy=False,
        pixel_values=None,
        image_grid_thw=None,
        num_images=None,
        pixel_attention_mask=None,
        image_sizes=None,
        token_type_ids=None,
        mm_token_type_ids=None,
    ) -> tuple[Any, torch.Tensor | None]:
        """Compute log-probs and (optionally) entropies for each token.

        When running under no_grad (scoring path), bypasses accelerate's
        ConvertOutputsToFp32 wrapper to avoid a fp32 copy of the
        logits tensor.
        """
        # Bypass accelerate's ConvertOutputsToFp32 wrapper which converts the
        # entire (B, L, V) logits tensor from bf16 to fp32 — unnecessary and
        # extremely wasteful for large vocabularies.
        # Skip unwrapping for FSDP — parameters are only valid inside FSDP's
        # forward context; unwrapping exposes flattened/sharded tensors.
        if not self.is_fsdp_enabled:
            model = self.accelerator.unwrap_model(model, keep_fp32_wrapper=False)
        autocast_ctx = torch.autocast(
            device_type=input_ids.device.type, dtype=torch.bfloat16
        )

        # Use Liger's Triton kernel in scoring path (no grad): fuses
        # temperature + log_softmax + gather into a single kernel pass.
        use_fused = (
            self.use_liger_kernel
            and _fused_selective_log_softmax is not None
            and not torch.is_grad_enabled()
        )

        batch_size = batch_size or input_ids.size(0)
        all_logps = []
        all_entropies = []
        with autocast_ctx:
            for start in range(0, input_ids.size(0), batch_size):
                input_ids_batch = input_ids[start : start + batch_size]
                attention_mask_batch = attention_mask[start : start + batch_size]

                # Build model inputs
                model_inputs = {
                    "input_ids": input_ids_batch,
                    "attention_mask": attention_mask_batch,
                }
                if image_grid_thw is not None and pixel_values is not None:
                    rows_per_image = image_grid_thw.prod(dim=-1)
                    rows_per_sample = torch.split(rows_per_image, num_images)
                    rows_per_sample = torch.stack([s.sum() for s in rows_per_sample])
                    cum_rows = torch.cat(
                        [
                            torch.tensor([0], device=rows_per_sample.device),
                            rows_per_sample.cumsum(0),
                        ]
                    )
                    row_start, row_end = (
                        cum_rows[start].item(),
                        cum_rows[start + batch_size].item(),
                    )
                    model_inputs["pixel_values"] = pixel_values[row_start:row_end]
                    cum_imgs = torch.tensor([0] + num_images).cumsum(0)
                    img_start, img_end = cum_imgs[start], cum_imgs[start + batch_size]
                    model_inputs["image_grid_thw"] = image_grid_thw[img_start:img_end]
                elif pixel_values is not None:
                    model_inputs["pixel_values"] = pixel_values[
                        start : start + batch_size
                    ]
                if pixel_attention_mask is not None:
                    model_inputs["pixel_attention_mask"] = pixel_attention_mask[
                        start : start + batch_size
                    ]
                if image_sizes is not None:
                    model_inputs["image_sizes"] = image_sizes[
                        start : start + batch_size
                    ]
                if token_type_ids is not None:
                    model_inputs["token_type_ids"] = token_type_ids[
                        start : start + batch_size
                    ]
                if mm_token_type_ids is not None:
                    model_inputs["mm_token_type_ids"] = mm_token_type_ids[
                        start : start + batch_size
                    ]

                if "logits_to_keep" in self.model_kwarg_keys:
                    model_inputs["logits_to_keep"] = logits_to_keep + 1

                model_inputs["use_cache"] = False

                logits = model(**model_inputs).logits
                completion_ids = input_ids_batch[:, -logits_to_keep:]
                # FP8 models produce NaN logits at positions where
                # attention_mask=0 (padding). Replace NaN with 0 so
                # log_softmax yields uniform distribution for those positions.
                # The completion_mask ensures these don't affect the loss.
                logits = torch.nan_to_num(logits, nan=0.0)

                if use_fused:
                    logits = logits[:, -(logits_to_keep + 1) :, :]
                    if not logits.is_contiguous():
                        logits = logits.contiguous()
                    logps = _fused_selective_log_softmax(
                        logits, completion_ids, self.temperature
                    )
                    all_logps.append(logps)
                else:
                    logits = logits[:, :-1, :]
                    logits = logits[:, -logits_to_keep:, :]
                    logits.div_(self.temperature)
                    logps = selective_log_softmax(logits, completion_ids)
                    all_logps.append(logps)

                    if compute_entropy:
                        with torch.no_grad():
                            entropies = entropy_from_logits(logits)
                        all_entropies.append(entropies)

        logps = torch.cat(all_logps, dim=0)
        entropies = torch.cat(all_entropies, dim=0) if compute_entropy else None
        return logps, entropies

    # ------------------------------------------------------------------
    # Loss override (adds IS ratio + OPSM)
    # ------------------------------------------------------------------

    @staticmethod
    def get_off_policy_mask(
        advantages,
        per_token_logps,
        sampling_per_token_logps,
        mask,
        off_policy_threshold,
    ):
        """OPSM from DeepSeek-V3.2: drop sequences with negative advantage + high KL."""
        kl_div = sampling_per_token_logps - per_token_logps.detach()
        seq_kl = (kl_div * mask).sum(dim=1, keepdim=True) / mask.sum(
            dim=1, keepdim=True
        ).clamp(min=1.0)
        is_pos_adv = advantages >= 0
        is_low_kl = seq_kl <= off_policy_threshold
        return (is_pos_adv | is_low_kl).to(dtype=mask.dtype)

    def _compute_loss(self, model, inputs):
        """Override to add IS ratio correction and off-policy sequence masking."""
        prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
        completion_ids, completion_mask = (
            inputs["completion_ids"],
            inputs["completion_mask"],
        )
        input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
        logits_to_keep = completion_ids.size(1)
        mask = (
            completion_mask
            if "tool_mask" not in inputs
            else completion_mask * inputs["tool_mask"]
        )

        per_token_logps, entropies = self._get_per_token_logps_and_entropies(
            model,
            input_ids,
            attention_mask,
            logits_to_keep,
            compute_entropy=True,
            pixel_values=inputs.get("pixel_values"),
            image_grid_thw=inputs.get("image_grid_thw"),
            num_images=inputs.get("num_images"),
            pixel_attention_mask=inputs.get("pixel_attention_mask"),
            image_sizes=inputs.get("image_sizes"),
            token_type_ids=inputs.get("token_type_ids"),
            mm_token_type_ids=inputs.get("mm_token_type_ids"),
        )
        if self.top_entropy_quantile < 1.0:
            entropy_mask = self.get_high_entropy_mask(
                entropies, mask, 1 - self.top_entropy_quantile
            )
        else:
            entropy_mask = None

        advantages = inputs["advantages"]
        if advantages.dim() == 1:
            advantages = advantages.unsqueeze(1)

        old_per_token_logps = inputs.get("old_per_token_logps")
        old_per_token_logps = (
            per_token_logps.detach()
            if old_per_token_logps is None
            else old_per_token_logps
        )

        # --- OPSM (off-policy sequence mask) ---
        off_policy_mask = None
        if getattr(self, "off_policy_mask_threshold", None) is not None:
            sampling_per_token_logps = inputs.get(
                "sampling_per_token_logps", old_per_token_logps
            )
            off_policy_mask = self.get_off_policy_mask(
                advantages=advantages,
                per_token_logps=per_token_logps,
                sampling_per_token_logps=sampling_per_token_logps,
                mask=mask,
                off_policy_threshold=self.off_policy_mask_threshold,
            )

        # --- Importance weights ---
        log_ratio = per_token_logps - old_per_token_logps
        is_level = getattr(
            self,
            "importance_sampling_level",
            getattr(self.args, "importance_sampling_level", "token"),
        )
        if is_level == "token":
            log_importance_weights = log_ratio
        elif is_level == "sequence":
            log_importance_weights = (log_ratio * mask).sum(-1) / mask.sum(-1).clamp(
                min=1.0
            )
            log_importance_weights = log_importance_weights.unsqueeze(-1)
        else:
            raise ValueError(f"Unknown importance sampling level: {is_level}")

        coef_1 = torch.exp(log_importance_weights)

        # --- KL divergence ---
        if self.beta != 0.0:
            ref_per_token_logps = inputs["ref_per_token_logps"]
            per_token_kl = (
                torch.exp(ref_per_token_logps - per_token_logps)
                - (ref_per_token_logps - per_token_logps)
                - 1
            )
            if getattr(self.args, "use_bias_correction_kl", False):
                per_token_kl = per_token_kl * coef_1

        # --- Per-token loss ---
        if self.loss_type == "cispo":
            clamped = torch.clamp(coef_1, max=self.epsilon_high).detach()
            per_token_loss = -clamped * advantages * per_token_logps
        elif self.loss_type in ("grpo", "bnpo", "dr_grpo", "dapo", "luspo"):
            coef_2 = torch.clamp(coef_1, 1 - self.epsilon_low, 1 + self.epsilon_high)
            if self.args.delta is not None:
                coef_1_c = torch.clamp(coef_1, max=self.args.delta)
            else:
                coef_1_c = coef_1
            per_token_loss = -torch.min(coef_1_c * advantages, coef_2 * advantages)
        elif self.loss_type == "sapo":
            temps = torch.where(
                advantages > 0,
                self.args.sapo_temperature_pos,
                self.args.sapo_temperature_neg,
            )
            soft = torch.sigmoid(temps * (coef_1 - 1)) * 4 / temps
            per_token_loss = -soft * advantages
        else:
            raise ValueError(f"Unknown loss type: {self.loss_type}")

        # --- Apply masks ---
        if off_policy_mask is not None:
            per_token_loss = per_token_loss * off_policy_mask
        if entropy_mask is not None:
            per_token_loss = per_token_loss * entropy_mask

        # --- IS ratio correction (vLLM distribution mismatch) ---
        if (
            self.use_vllm
            and getattr(self, "vllm_importance_sampling_correction", False)
            and "importance_sampling_ratio" in inputs
        ):
            per_token_loss = per_token_loss * inputs["importance_sampling_ratio"]

        if self.beta != 0.0:
            per_token_loss = per_token_loss + self.beta * per_token_kl

        # --- Aggregate loss ---
        mode = "train" if self.model.training else "eval"
        normalizer = (
            self.current_gradient_accumulation_steps if mode == "train" else 1.0
        )

        if self.loss_type in ("grpo", "sapo"):
            loss = (
                (per_token_loss * mask).sum(-1) / mask.sum(-1).clamp(min=1.0)
            ).mean() / normalizer
        elif self.loss_type == "bnpo":
            loss = (
                (per_token_loss * mask).sum() / mask.sum().clamp(min=1.0) / normalizer
            )
        elif self.loss_type == "dr_grpo":
            loss = (
                (per_token_loss * mask).sum()
                / (per_token_loss.size(0) * self.max_completion_length)
                / normalizer
            )
        elif self.loss_type in ("cispo", "dapo"):
            norm = inputs["num_items_in_batch"] / self.accelerator.num_processes
            loss = (per_token_loss * mask).sum() / norm
        elif self.loss_type == "luspo":
            loss = (per_token_loss * mask.sum(1, keepdim=True)).mean() / normalizer
        else:
            raise ValueError(f"Unknown loss type: {self.loss_type}")

        # --- Metrics ---
        completion_token_count = mask.sum().clamp(min=1.0)

        def masked_batch_mean(x):
            return (
                x.mean()
                if x.shape[1] == 1
                else (x * mask).sum() / completion_token_count
            )

        if self.beta != 0.0:
            mean_kl = masked_batch_mean(per_token_kl)
            self._metrics[mode]["kl"].append(
                self.accelerator.gather(mean_kl).nanmean().item()
            )

        mean_entropy = masked_batch_mean(entropies)
        self._metrics[mode]["entropy"].append(
            self.accelerator.gather(mean_entropy).nanmean().item()
        )

        if self.loss_type in ("grpo", "bnpo", "dr_grpo", "dapo", "luspo"):
            is_low = (coef_1 < 1 - self.epsilon_low) & (advantages < 0)
            is_high = (coef_1 > 1 + self.epsilon_high) & (advantages > 0)
            is_region = is_low | is_high
            low_clip = masked_batch_mean(is_low.float())
            high_clip = masked_batch_mean(is_high.float())
            clip_ratio = masked_batch_mean(is_region.float())
            g_low = self.accelerator.gather(low_clip)
            self._metrics[mode]["clip_ratio/low_mean"].append(g_low.nanmean().item())
            self._metrics[mode]["clip_ratio/low_min"].append(nanmin(g_low).item())
            g_high = self.accelerator.gather(high_clip)
            self._metrics[mode]["clip_ratio/high_mean"].append(g_high.nanmean().item())
            self._metrics[mode]["clip_ratio/high_max"].append(nanmax(g_high).item())
            g_clip = self.accelerator.gather(clip_ratio)
            self._metrics[mode]["clip_ratio/region_mean"].append(
                g_clip.nanmean().item()
            )
        elif self.loss_type == "cispo":
            is_cispo = (coef_1 > self.epsilon_high) & (advantages > 0)
            cr = masked_batch_mean(is_cispo.float())
            self._metrics[mode]["cispo_clip_ratio"].append(
                self.accelerator.gather(cr).nanmean().item()
            )

        return loss


================================================
FILE: src/axolotl/core/trainers/grpo/fast_async_trainer.py
================================================
# Copyright 2020-2026 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Experimental GRPO extensions: parallel reward workers, replay buffer,
deferred re-roll, and zero-advantage skipping.

These features are built as subclasses of GRPOTrainer and GRPODataProducer,
using the hook system (_compute_rewards_for_batch, _post_advantage_hook,
_pre_produce_hook) defined in the base classes.
"""

from __future__ import annotations

import asyncio
import logging
import threading
from dataclasses import dataclass, field

import torch
from torch import nn
from trl import GRPOTrainer

from axolotl.core.trainers.grpo.async_trainer import (
    AsyncGRPOConfig,
    AsyncGRPOTrainer,
    GRPODataProducer,
)
from axolotl.core.trainers.grpo.replay_buffer import ReplayBuffer

logger = logging.getLogger(__name__)


# ---------------------------------------------------------------------------
# Extended config
# ---------------------------------------------------------------------------


@dataclass
class FastAsyncGRPOConfig(AsyncGRPOConfig):
    """GRPOConfig with additional experimental parameters."""

    reward_num_workers: int = field(
        default=1,
        metadata={
            "help": "Number of persistent subprocess workers for parallel reward computation. Each worker has its "
            "own main thread so signal.alarm() (used by math_verify) works correctly. Work is sharded across "
            "workers by prompt groups. Only used with use_data_producer=True and non-nn.Module reward functions."
        },
    )
    replay_buffer_size: int = field(
        default=0,
        metadata={
            "help": "[Experimental, disabled by default] Size of the replay buffer for storing high-signal rollout "
            "groups. When > 0, groups with reward variance are cached and used to replace zero-signal groups "
            "(where all rewards are identical). Set to 0 to disable. Only used with use_data_producer=True."
        },
    )
    replay_recompute_logps: bool = field(
        default=True,
        metadata={
            "help": "When True (default), recompute old_per_token_logps for replayed groups using the current "
            "training model. This fixes the importance sampling mismatch that occurs when replaying stale data. "
            "Only relevant when replay_buffer_size > 0."
        },
    )
    reroll_start_fraction: float = field(
        default=0.5,
        metadata={
            "help": "Fraction of total training steps after which deferred re-rolling begins. Zero-signal prompts "
            "(where all rewards in a group are identical) are buffered and re-injected into later batches when the "
            "model is more likely to solve them. Set to 1.0 to disable. Only used with use_data_producer=True."
        },
    )
    reroll_max_groups: int = field(
        default=1,
        metadata={
            "help": "Maximum number of prompt groups to replace with re-roll candidates per batch. Higher values "
            "increase data utilization but reduce prompt diversity. Only used with use_data_producer=True."
        },
    )
    skip_zero_advantage_batches: bool = field(
        default=True,
        metadata={
            "help": "When True, skip gradient computation for micro-batches where all advantages are zero (no learning "
            "signal). This avoids the forward/backward pass entirely when no learning signal is present. The step is "
            "logged with skipped_zero_adv_batches=1 for monitoring."
        },
    )
    vllm_lora_sync: bool = field(
        default=False,
        metadata={
            "help": "When True, sync LoRA adapter weights to vLLM via filesystem instead of merging into base model "
            "and NCCL-broadcasting all parameters. vLLM loads the adapter natively using Punica kernels. "
            "Requires vllm_serve_lora serve module (auto-selected when this is True). "
            "Syncs only LoRA adapter weights (much smaller) vs full merged model. Legacy merge behavior is used when False."
        },
    )


# ---------------------------------------------------------------------------
# Extended data producer with re-roll injection
# ---------------------------------------------------------------------------


class RerollDataProducer(GRPODataProducer):
    """GRPODataProducer that injects re-roll candidates into prompt batches.

    Reads from the trainer's ``_reroll_buffer`` (populated by
    ``GRPOExperimentalTrainer._post_advantage_hook``) and replaces the
    last N prompt groups with previously-failed prompts.
    """

    def _pre_produce_hook(self, inputs: list, global_step: int) -> list:
        trainer = self._trainer
        reroll_buf = getattr(trainer, "_reroll_buffer", None)
        reroll_lock = getattr(trainer, "_reroll_lock", None)
        if reroll_buf is None or reroll_lock is None:
            return inputs

        max_steps = getattr(trainer.args, "max_steps", -1)
        start_frac = getattr(trainer.args, "reroll_start_fraction", 1.0)
        max_groups = getattr(trainer.args, "reroll_max_groups", 1)
        reroll_start_step = (
            max(1, int(max_steps * start_frac)) if max_steps > 0 else float("inf")
        )

        if global_step < reroll_start_step:
            return inputs

        with reroll_lock:
            n_to_take = min(max_groups, len(reroll_buf))
            reroll_prompts = [reroll_buf.pop(0) for _ in range(n_to_take)]

        if reroll_prompts:
            num_gen = self._num_generations
            n_groups = len(inputs) // num_gen
            for i, reroll_prompt in enumerate(reroll_prompts):
                group_idx = n_groups - 1 - i
                if group_idx < 0:
                    break
                start = group_idx * num_gen
                for j in range(num_gen):
                    inputs[start + j] = reroll_prompt
            logger.info(
                f"[REROLL] Step {global_step}: replaced {len(reroll_prompts)}/{n_groups} prompt groups "
                f"with deferred re-roll candidates ({len(reroll_buf)} remaining)"
            )

        return inputs


# ---------------------------------------------------------------------------
# Persistent reward subprocess pool
# ---------------------------------------------------------------------------


def _persistent_reward_worker(conn):
    """Long-lived reward worker. Receives work items, returns results."""
    while True:
        try:
            msg = conn.recv()
        except EOFError:
            break
        if msg is None:  # Shutdown signal
            break
        (
            reward_funcs,
            prompts,
            completions,
            completion_ids_list,
            inputs,
            reward_func_names,
        ) = msg
        try:
            keys = [
                key
                for key in inputs[0]
                if key not in ["prompt", "completion", "completion_ids"]
            ]
            reward_kwargs = {key: [example[key] for example in inputs] for key in keys}
            results = []
            for reward_func, _reward_func_name in zip(
                reward_funcs, reward_func_names, strict=True
            ):
                output = reward_func(
                    prompts=prompts,
                    completions=completions,
                    completion_ids=completion_ids_list,
                    **reward_kwargs,
                )
                results.append(
                    [float(r) if r is not None else float("nan") for r in output]
                )
            conn.send(results)
        except Exception:
            conn.send(None)


# ---------------------------------------------------------------------------
# Extended trainer
# ---------------------------------------------------------------------------


class FastAsyncGRPOTrainer(AsyncGRPOTrainer):
    """GRPOTrainer with experimental extensions.

    Adds:
    - Parallel reward subprocess workers (``reward_num_workers``)
    - Replay buffer for high-signal group reuse (``replay_buffer_size``)
    - Deferred re-roll of failed prompts (``reroll_start_fraction``)
    - Zero-advantage micro-batch skipping
    """

    def __init__(self, *args, **kwargs):
        # These must be initialized before super().__init__() because
        # _create_data_producer (called during super().__init__) needs them.
        self._reroll_buffer: list = []
        self._reroll_lock = threading.Lock()

        # Temporarily suppress the base class's Liger + OPSM validation check,
        # since this subclass supports it via a custom compute_liger_loss override.
        grpo_args = kwargs.get("args")
        if grpo_args is None:
            for a in args:
                if hasattr(a, "off_policy_mask_threshold"):
                    grpo_args = a
                    break
        saved_threshold = None
        if grpo_args is not None and getattr(grpo_args, "use_liger_kernel", False):
            saved_threshold = grpo_args.off_policy_mask_threshold
            grpo_args.off_policy_mask_threshold = None

        super().__init__(*args, **kwargs)

        if saved_threshold is not None:
            grpo_args.off_policy_mask_threshold = saved_threshold
            self.off_policy_mask_threshold = saved_threshold

        # Replay buffer
        if getattr(self.args, "replay_buffer_size", 0) > 0:
            self._replay_buffer = ReplayBuffer(max_size=self.args.replay_buffer_size)
        else:
            self._replay_buffer = None
        self._replay_recompute_logps = getattr(
            self.args, "replay_recompute_logps", True
        )

        # Reward worker pool (lazy-initialized)
        self._reward_workers = None

    # -- Factory override: use RerollDataProducer ----------------------------

    def _create_data_producer(self, args, train_dataset):
        """Override to use RerollDataProducer for re-roll prompt injection."""
        from axolotl.core.trainers.grpo.async_trainer import (
            AsyncDataProducer,
            ProducerConfig,
        )

        producer_config = ProducerConfig(
            mini_epochs=args.num_iterations,
            max_rollouts=None,
            eval_during_produce=False,
            empty_cache_before_produce=True,
            empty_cache_after_produce=True,
            async_prefetch=args.async_prefetch,
            prefetch_depth=args.prefetch_depth,
        )
        data_producer = RerollDataProducer(
            config=producer_config,
            prompt_dataset=train_dataset,
            num_generations=self.num_generations,
            generation_batch_size=args.generation_batch_size,
            train_batch_size=args.per_device_train_batch_size,
            steps_per_generation=args.steps_per_generation,
            shuffle_dataset=self.shuffle_dataset,
            seed=args.seed,
        )
        data_producer.set_trainer(self)
        if args.async_prefetch:
            data_producer = AsyncDataProducer(
                data_producer,
                background_produce_kwargs={"skip_policy_logps": True},
            )
        return data_producer

    # -- Reward worker pool --------------------------------------------------

    def _get_reward_workers(self):
        """Return a list of persistent reward worker subprocesses (lazy-initialized)."""
        import multiprocessing as _mp

        num_workers = getattr(self.args, "reward_num_workers", 1)
        if num_workers < 1:
            num_workers = 1

        if self._reward_workers is not None:
            alive = all(proc.is_alive() for conn, proc in self._reward_workers)
            if alive and len(self._reward_workers) == num_workers:
                return self._reward_workers
            self._shutdown_reward_workers()

        workers = []
        for _ in range(num_workers):
            parent_conn, child_conn = _mp.Pipe()
            proc = _mp.Process(
                target=_persistent_reward_worker, args=(child_conn,), daemon=True
            )
            proc.start()
            child_conn.close()
            workers.append((parent_conn, proc))

        self._reward_workers = workers
        return workers

    def _shutdown_reward_workers(self):
        """Shut down all persistent reward workers."""
        if self._reward_workers is None:
            return
        for conn, proc in self._reward_workers:
            try:
                conn.send(None)
                proc.join(timeout=5)
            except Exception:
                pass
            try:
                conn.close()
            except Exception:
                pass
        self._reward_workers = None

    # -- Hook overrides ------------------------------------------------------

    def _compute_rewards_for_batch(
        self, inputs, prompts, completions, completion_ids_list
    ):
        """Dispatch rewards to parallel subprocess workers (synchronous wrapper)."""
        self._launch_reward_workers(inputs, prompts, completions, completion_ids_list)
        return self._collect_reward_workers(
            inputs, prompts, completions, completion_ids_list
        )

    def _launch_reward_workers(self, inputs, prompts, completions, completion_ids_list):
        """Send reward work to subprocess workers (non-blocking).

        Results are collected later by _collect_reward_workers, allowing GPU
        logprob computation to overlap with CPU reward computation.
        """
        reward_can_bg = all(
            callable(rf)
            and not isinstance(rf, nn.Module)
            and not asyncio.iscoroutinefunction(rf)
            for rf in self.reward_funcs
        )
        num_workers = getattr(self.args, "reward_num_workers", 1)

        if not reward_can_bg or num_workers <= 1:
            # Can't parallelize — store args for sync fallback in collect
            self._reward_workers_used = None
            self._pending_reward_args = (
                inputs,
                prompts,
                completions,
                completion_ids_list,
            )
            return

        workers = self._get_reward_workers()
        num_generations = self.num_generations
        num_prompts = len(prompts)
        num_groups = num_prompts // num_generations

        # Shard by prompt groups across workers
        groups_per_worker = max(1, (num_groups + len(workers) - 1) // len(workers))
        workers_used = []
        for w_idx, (conn, _proc) in enumerate(workers):
            g_start = w_idx * groups_per_worker
            g_end = min((w_idx + 1) * groups_per_worker, num_groups)
            if g_start >= num_groups:
                break
            s_start = g_start * num_generations
            s_end = g_end * num_generations
            conn.send(
                (
                    self.reward_funcs,
                    prompts[s_start:s_end],
                    completions[s_start:s_end],
                    completion_ids_list[s_start:s_end],
                    inputs[s_start:s_end],
                    self.reward_func_names,
                )
            )
            workers_used.append(conn)

        self._reward_workers_used = workers_used
        self._pending_reward_args = (inputs, prompts, completions, completion_ids_list)

    def _collect_reward_workers(
        self, inputs, prompts, completions, completion_ids_list
    ):
        """Collect reward results from subprocess workers (blocks until done)."""
        from accelerate.utils import gather

        workers_used = getattr(self, "_reward_workers_used", None)
        args = getattr(self, "_pending_reward_args", None)
        self._reward_workers_used = None
        self._pending_reward_args = None

        if workers_used is None:
            # Sync fallback — compute on main thread
            if args is not None:
                return self._calculate_rewards(*args)
            return self._calculate_rewards(
                inputs, prompts, completions, completion_ids_list
            )

        device = self.accelerator.device
        num_prompts = len(args[1]) if args else len(prompts)

        # Collect results from workers
        all_worker_results = []
        any_failed = False
        for conn in workers_used:
            result = conn.recv()
            if result is None:
                any_failed = True
                # Drain remaining workers to prevent stale results in pipes
                for remaining_conn in workers_used:
                    if remaining_conn is not conn:
                        try:
                            remaining_conn.recv()
                        except Exception:
                            pass
                break
            all_worker_results.append(result)

        if not any_failed:
            rewards_per_func = torch.zeros(
                num_prompts, len(self.reward_funcs), device=device
            )
            offset = 0
            for worker_result in all_worker_results:
                chunk_size = len(worker_result[0])
                for i, result in enumerate(worker_result):
                    rewards_per_func[offset : offset + chunk_size, i] = torch.tensor(
                        result, dtype=torch.float32, device=device
                    )
                offset += chunk_size
            return gather(rewards_per_func)

        # Fallback to main thread on failure
        if args is not None:
            return self._calculate_rewards(*args)
        return self._calculate_rewards(
            inputs, prompts, completions, completion_ids_list
        )

    def _post_advantage_hook(
        self,
        data: dict,
        rewards_per_func,
        advantages,
        inputs: list,
        num_generations: int,
        mode: str,
        s_start: int | None = None,
        s_end: int | None = None,
        is_last_chunk: bool = True,
    ) -> None:
        """Replay buffer store/replace + re-roll buffering."""
        from trl.models.utils import disable_gradient_checkpointing

        # -- Replay buffer: store high-signal groups --
        if self._replay_buffer is not None:
            local_grouped = rewards_per_func.view(
                -1, num_generations, len(self.reward_funcs)
            )
            per_group_std = local_grouped.std(dim=1)
            has_signal = (per_group_std > 0).any(dim=1)
            offset = s_start or 0

            if has_signal.any():
                grouped_adv = advantages.view(-1, num_generations)
                replay_scores = grouped_adv.abs().sum(dim=1) * per_group_std.sum(dim=1)
                for group_idx in has_signal.nonzero(as_tuple=True)[0]:
                    gi = group_idx.item()
                    start = offset + gi * num_generations
                    end = start + num_generations
                    group_data = {}
                    for key in data:
                        val = data[key]
                        if (
                            isinstance(val, torch.Tensor)
                            and val.dim() > 0
                            and val.size(0) >= end
                        ):
                            group_data[key] = val[start:end].clone()
                    self._replay_buffer.add(replay_scores[gi].item(), group_data)

            # Replace zero-signal groups with high-signal replay buffer entries
            # Only in non-streaming path (s_start is None) — streaming scores
            # groups incrementally, so replacement + logprob recompute would be
            # too expensive per chunk.
            n_replaced = 0
            if s_start is None:
                no_signal = ~has_signal
                replaced_ranges = []
                if no_signal.any() and len(self._replay_buffer) > 0:
                    for group_idx in no_signal.nonzero(as_tuple=True)[0]:
                        sampled = self._replay_buffer.sample(1)
                        if sampled is None:
                            break
                        sampled_group = sampled[0]
                        gi = group_idx.item()
                        start = offset + gi * num_generations
                        end = start + num_generations
                        for key, val in sampled_group.items():
                            if key in data and isinstance(data[key], torch.Tensor):
                                src = val.to(data[key].device)
                                tgt_seq_len = (
                                    data[key].size(1) if data[key].dim() > 1 else None
                                )
                                if start >= data[key].size(0) or end > data[key].size(
                                    0
                                ):
                                    continue
                                if tgt_seq_len is not None:
                                    if src.size(1) <= tgt_seq_len:
                                        data[key][start:end] = 0
                                        data[key][start:end, : src.size(1)] = src
                                    else:
                                        data[key][start:end] = src[:, :tgt_seq_len]
                                else:
                                    data[key][start:end] = src
                        replaced_ranges.append((start, end))
                        n_replaced += 1

                # Recompute old_per_token_logps for replayed groups
                if (
                    n_replaced > 0
                    and self._replay_recompute_logps
                    and "old_per_token_logps" in data
                ):
                    with (
                        torch.no_grad(),
                        disable_gradient_checkpointing(
                            self.model, self.args.gradient_checkpointing_kwargs
                        ),
                    ):
                        for r_start, r_end in replaced_ranges:
                            r_ids = torch.cat(
                                [
                                    data["prompt_ids"][r_start:r_end],
                                    data["completion_ids"][r_start:r_end],
                                ],
                                dim=1,
                            )
                            r_mask = torch.cat(
                                [
                                    data["prompt_mask"][r_start:r_end],
                                    data["completion_mask"][r_start:r_end],
                                ],
                                dim=1,
                            )
                            r_logits_to_keep = data["completion_ids"].size(1)
                            r_fwd_kwargs = {}
                            for fk in (
                                "pixel_values",
                                "image_grid_thw",
                                "pixel_attention_mask",
                                "image_sizes",
                                "token_type_ids",
                                "mm_token_type_ids",
                            ):
                                if fk in data:
                                    r_fwd_kwargs[fk] = data[fk]
                            r_logps, _ = self._get_per_token_logps_and_entropies(
                                self.model,
                                r_ids,
                                r_mask,
                                r_logits_to_keep,
                                r_end - r_start,
                                **r_fwd_kwargs,
                            )
                            data["old_per_token_logps"][r_start:r_end] = r_logps

                if n_replaced > 0:
                    self._metrics[mode]["replay_buffer_replacements"].append(
                        float(n_replaced)
                    )

            if is_last_chunk:
                self._metrics[mode]["replay_buffer_size"].append(
                    float(len(self._replay_buffer))
                )

        # -- Re-roll buffer: store failed prompts --
        if getattr(self.args, "reroll_start_fraction", 1.0) < 1.0:
            grouped_rewards = rewards_per_func.view(
                -1, num_generations, len(self.reward_funcs)
            )
            per_group_std = grouped_rewards.std(dim=1)
            per_group_mean = grouped_rewards.mean(dim=1)
            zero_signal = (per_group_std == 0).all(dim=1)
            all_failed = (per_group_mean.abs() < 1e-6).all(dim=1)
            should_reroll = zero_signal & all_failed
            _n_buffered = 0
            with self._reroll_lock:
                for group_idx in should_reroll.nonzero(as_tuple=True)[0]:
                    idx = group_idx.item() * num_generations
                    if idx >= len(inputs):
                        continue
                    prompt_input = inputs[idx]
                    self._reroll_buffer.append(prompt_input)
                    _n_buffered += 1
            if _n_buffered > 0:
                self._metrics[mode]["reroll_buffered"].append(float(_n_buffered))
            if is_last_chunk:
                self._metrics[mode]["reroll_buffer_size"].append(
                    float(len(self._reroll_buffer))
                )

    # -- Zero-advantage skipping + Liger OPSM ---------------------------------

    def compute_liger_loss(self, unwrapped_model, inputs):
        """Liger loss with zero-adv skipping and off-policy sequence masking (OPSM).

        The base class Liger path doesn't support OPSM because the fused kernel
        doesn't expose per-token logprobs needed for the KL computation. This
        override computes them via chunked lm_head matmul (no grad, low memory)
        and applies the OPSM to the loss mask before calling the kernel.
        """
        if self.args.skip_zero_advantage_batches and torch.all(
            inputs["advantages"] == 0
        ):
            mode = "train" if self.model.training else "eval"
            self._metrics[mode]["skipped_zero_adv_batches"].append(1.0)
            return torch.tensor(
                0.0, device=inputs["advantages"].device, requires_grad=True
            )

        if self.off_policy_mask_threshold is None:
            return super().compute_liger_loss(unwrapped_model, inputs)

        # OPSM path: need per_token_logps for KL, which Liger kernel doesn't provide
        prompt_ids, prompt_mask = inputs["prompt_ids"], inputs["prompt_mask"]
        completion_ids, completion_mask = (
            inputs["completion_ids"],
            inputs["completion_mask"],
        )
        input_ids = torch.cat([prompt_ids, completion_ids], dim=1)
        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)
        logits_to_keep = completion_ids.size(1)

        last_hidden_state = self._get_last_hidden_state(
            unwrapped_model,
            input_ids,
            attention_mask,
            logits_to_keep,
            inputs.get("pixel_values"),
            inputs.get("image_grid_thw"),
            inputs.get("pixel_attention_mask"),
            inputs.get("image_sizes"),
        )

        loss_mask = (
            completion_mask
            if "tool_mask" not in inputs
            else completion_mask * inputs["tool_mask"]
        )

        # Compute per_token_logps via chunked lm_head matmul (no grad, low memory)
        lm_weight = unwrapped_model.lm_head.weight
        lm_bias = unwrapped_model.lm_head.bias
        with torch.no_grad():
            per_token_logps_chunks = []
            for i in range(last_hidden_state.size(0)):
                chunk_logits = torch.matmul(last_hidden_state[i : i + 1], lm_weight.t())
                if lm_bias is not None:
                    chunk_logits = chunk_logits + lm_bias
                chunk_lps = (
                    chunk_logits.float()
                    .log_softmax(-1)
                    .gather(-1, completion_ids[i : i + 1].unsqueeze(-1))
                    .squeeze(-1)
                )
                per_token_logps_chunks.append(chunk_lps)
                del chunk_logits
            per_token_logps = torch.cat(per_token_logps_chunks, dim=0)

        advantages = inputs["advantages"]
        if advantages.dim() == 1:
            advantages_2d = advantages.unsqueeze(1)
        else:
            advantages_2d = advantages

        sampling_per_token_logps = inputs.get("sampling_per_token_logps")
        if sampling_per_token_logps is None:
            sampling_per_token_logps = inputs.get("old_per_token_logps")
        if sampling_per_token_logps is None:
            sampling_per_token_logps = per_token_logps

        off_policy_mask = GRPOTrainer.get_off_policy_mask(
            advantages=advantages_2d,
            per_token_logps=per_token_logps,
            sampling_per_token_logps=sampling_per_token_logps,
            mask=loss_mask,
            off_policy_threshold=self.off_policy_mask_threshold,
        )
        loss_mask = loss_mask * off_policy_mask

        # Call the Liger fused kernel with OPSM-modified mask
        loss, metrics = self.liger_grpo_loss(
            _input=last_hidden_state,
            lin_weight=unwrapped_model.lm_head.weight,
            selected_token_ids=completion_ids,
            attention_mask=loss_mask,
            advantages=inputs["advantages"],
            bias=unwrapped_model.lm_head.bias,
            old_per_token_logps=inputs.get("old_per_token_logps"),
            ref_per_token_logps=inputs.get("ref_per_token_logps"),
            vllm_is_ratio=inputs.get("importance_sampling_ratio"),
        )

        mean_kl = metrics[0] if self.beta != 0.0 else None
        clip_ratio = metrics[-1]

        mode = "train" if self.model.training else "eval"
        if self.beta != 0.0:
            self._metrics[mode]["kl"].append(
                self.accelerator.gather(mean_kl).mean().item()
            )
        self._metrics[mode]["clip_ratio"].append(
            self.accelerator.gather(clip_ratio).mean().item()
        )
        normalizer = (
            self.current_gradient_accumulation_steps if mode == "train" else 1.0
        )
        return loss / normalizer

    def _compute_loss(self, model, inputs):
        if self.args.skip_zero_advantage_batches and torch.all(
            inputs["advantages"] == 0
        ):
            mode = "train" if self.model.training else "eval"
            self._metrics[mode]["skipped_zero_adv_batches"].append(1.0)
            # Create zero loss with grad_fn. DeepSpeed requires grad_fn != None.
            # With ZeRO-3, parameters are partitioned (shape=[0], requires_grad=False)
            # so we can't just do `(p * 0).sum()`. Instead, do a tiny forward pass
            # with a single token to create a proper computation graph.
            prompt_ids = inputs["prompt_ids"][:1, :1]  # (1, 1)
            attn = torch.ones_like(prompt_ids)
            with torch.amp.autocast(device_type="cuda", dtype=torch.bfloat16):
                out = model(input_ids=prompt_ids, attention_mask=attn)
            return out.logits.sum() * 0
        return super()._compute_loss(model, inputs)


================================================
FILE: src/axolotl/core/trainers/grpo/replay_buffer.py
================================================
"""Simple replay buffer for storing and sampling high-signal rollout groups."""

import heapq

import torch


class ReplayBuffer:
    """Min-heap replay buffer that keeps the highest-scoring rollout groups.
    Groups are scored by signal quality (advantage magnitude * reward variance).
    When sampling, groups are drawn proportional to their scores.
    """

    def __init__(self, max_size: int):
        self.max_size = max_size
        self._heap: list[tuple[float, int, dict]] = []  # min-heap of (score, id, data)
        self._counter = 0  # unique tiebreaker for heap

    def __len__(self):
        return len(self._heap)

    def add(self, score: float, data: dict):
        """Add a group to the buffer. If full, replaces lowest-scoring entry."""
        if self.max_size <= 0:
            return
        self._counter += 1
        if len(self._heap) < self.max_size:
            heapq.heappush(self._heap, (score, self._counter, data))
        elif score > self._heap[0][0]:
            heapq.heapreplace(self._heap, (score, self._counter, data))

    def sample(self, num_samples: int) -> list[dict] | None:
        """Sample groups weighted by their scores. Returns None if buffer is empty."""
        if self.max_size <= 0 or not self._heap:
            return None

        scores = torch.tensor([item[0] for item in self._heap], dtype=torch.float32)
        scores = scores.clamp(min=1e-8)  # avoid zero probabilities
        probs = scores / scores.sum()
        replacement = num_samples > len(self._heap)
        indices = torch.multinomial(
            probs, num_samples, replacement=replacement
        ).tolist()
        return [self._heap[i][2] for i in indices]


================================================
FILE: src/axolotl/core/trainers/grpo/sampler.py
================================================
"""Repeat random sampler (similar to the one implemented in
https://github.com/huggingface/trl/blob/main/trl/trainer/grpo_trainer.py) that adds
sequence parallelism functionality; i.e., duplicating data across ranks in the same
sequence parallel group.
"""

from typing import Iterator, Sized

import torch
from torch.utils.data import Sampler


class SequenceParallelRepeatRandomSampler(Sampler):
    """Sampler for GRPO training with sequence parallelism.

    This sampler ensures:
    - Ranks in the same sequence parallel (SP) group receive identical data.
    - Each index is repeated multiple times for sampling different completions.
    - Entire batches are repeated for reuse in multiple updates.
    - Data is properly distributed across SP groups.

    In the table below, the values represent dataset indices. Each SP group has
    `context_parallel_size = 2` GPUs working together on the same data. There are 2
    SP groups (SP0 and SP1), with `world_size = 4` total GPUs.

                                               Sequence Parallel Groups
                                        |       SP0        |       SP1        |
                                        |  GPU 0  |  GPU 1 |  GPU 2  |  GPU 3 |
                    global_step  step    <---> mini_repeat_count=3
                                            <----------> batch_size=2 per SP group
    grad_accum=2   ▲  ▲  0       0         [0 0 0  1 1 1]     [2 2 2  3 3 3]   <- SP groups get different data
                   ▼  |  0       1         [0 0 0  1 1 1]     [2 2 2  3 3 3]   <- Same data for each SP group GPU
                      |
                      |  1       2         [0 0 0  1 1 1]     [2 2 2  3 3 3]   <- Repeat same indices for iterations
    num_iterations=2  ▼  1       3         [0 0 0  1 1 1]     [2 2 2  3 3 3]   <- When using gradient accumulation

                         2       4         [4 4 4  5 5 5]     [6 6 6  7 7 7]   <- New batch of data indices
                         2       5         [4 4 4  5 5 5]     [6 6 6  7 7 7]
                                            ...

    Args:
        dataset: Dataset to sample from.
        mini_repeat_count: How many times to repeat each sample immediately.
        world_size: Total number of processes.
        rank: Rank of current process.
        batch_size: Number of samples per batch.
        repeat_count: How many times to repeat the full sampling process.
        context_parallel_size: Number of ranks in a sequence parallel group.
        shuffle: Whether to shuffle the dataset.
        seed: Random seed for shuffling.
        drop_last: Whether to drop the last incomplete batch.
    """

    def __init__(
        self,
        dataset: Sized,
        mini_repeat_count: int,
        world_size: int,
        rank: int,
        batch_size: int = 1,
        repeat_count: int = 1,
        context_parallel_size: int = 1,
        shuffle: bool = True,
        seed: int = 0,
        drop_last: bool = False,
    ):
        self.dataset = dataset
        self.mini_repeat_count = mini_repeat_count
        self.batch_size = batch_size
        self.repeat_count = repeat_count
        self.shuffle = shuffle
        self.seed = seed
        self.drop_last = drop_last
        self.epoch = 0

        self.world_size = world_size
        self.rank = rank

        # Sequence parallelism parameters
        self.context_parallel_size = context_parallel_size
        self.num_sp_groups = world_size // context_parallel_size
        self.sp_group_id = rank // context_parallel_size

        # Adjust dataset size for distributed sampling
        self.num_samples = len(self.dataset)
        self.total_size = self.num_samples

        # Calculate effective number of samples per SP group
        if (
            self.drop_last
            and self.total_size % (self.num_sp_groups * self.batch_size) != 0
        ):
            # Drop last incomplete batch if drop_last is True
            self.num_samples_per_sp_group = (
                self.total_size // self.batch_size // self.num_sp_groups
            ) * self.batch_size
        else:
            # Round up to include last batch if drop_last is False
            self.num_samples_per_sp_group = (
                (self.total_size + self.batch_size * self.num_sp_groups - 1)
                // (self.batch_size * self.num_sp_groups)
                * self.batch_size
            )

        if shuffle:
            self.generator = torch.Generator()
            self.generator.manual_seed(seed)

    def __iter__(self) -> Iterator[int]:
        """Creates iterator over dataset indices.

        Returns:
            Iterator that yields indices into the dataset.
        """
        # Deterministically shuffle based on epoch and seed
        if self.shuffle:
            indices = torch.randperm(
                self.num_samples, generator=self.generator
            ).tolist()
        else:
            indices = list(range(self.num_samples))

        # Add extra samples to make it evenly divisible by batch_size
        if len(indices) % self.batch_size != 0:
            padding = indices[: self.batch_size - len(indices) % self.batch_size]
            indices += padding

        # Subsample based on SP group ID
        # Each SP group gets distinct batches of data
        batch_indices = []
        for i in range(0, len(indices), self.batch_size * self.num_sp_groups):
            start_idx = i + self.sp_group_id * self.batch_size
            end_idx = min(start_idx + self.batch_size, len(indices))
            if start_idx < len(indices):
                for j in range(self.batch_size):
                    if start_idx + j < end_idx:
                        batch_indices.append(indices[start_idx + j])

        # Make sure batch_indices is exactly batch_size * num_batches_per_sp_group
        if self.drop_last:
            num_batches_per_sp_group = self.num_samples_per_sp_group // self.batch_size
            target_len = self.batch_size * num_batches_per_sp_group
            if len(batch_indices) > target_len:
                batch_indices = batch_indices[:target_len]

        # Apply the GRPO repeat pattern
        final_indices = []
        for _ in range(self.repeat_count):
            for idx in batch_indices:
                for _ in range(self.mini_repeat_count):
                    final_indices.append(idx)

        return iter(final_indices)

    def __len__(self) -> int:
        """Returns the total length of the iterable including repetitions.

        Returns:
            Total number of samples.
        """
        # Total length including all repetitions
        return (
            self.num_samples_per_sp_group * self.mini_repeat_count * self.repeat_count
        )

    def set_epoch(self, epoch: int) -> None:
        """Sets the epoch for this sampler.

        Args:
            epoch: Epoch number to use for shuffling.
        """
        self.epoch = epoch


================================================
FILE: src/axolotl/core/trainers/grpo/trainer.py
================================================
"""Axolotl GRPO trainers (with and without sequence parallelism handling)"""

import warnings
from functools import partial
from typing import Any

import datasets
import torch
import torch.distributed as dist
import torch.utils.data
from accelerate.utils import (
    broadcast_object_list,
    gather,
    gather_object,
    is_peft_available,
)
from datasets import Dataset, IterableDataset
from torch import nn
from torch.utils.data import (
    BatchSampler,
    DataLoader,
    Sampler,
)
from transformers import (
    PreTrainedModel,
    PreTrainedTokenizerBase,
    Trainer,
    TrainerCallback,
)
from transformers.trainer_utils import seed_worker
from trl import GRPOTrainer
from trl.data_utils import (
    apply_chat_template,
    is_conversational,
    maybe_apply_chat_template,
)
from trl.extras.profiling import profiling_context
from trl.models import unwrap_model_for_generation
from trl.trainer.grpo_config import GRPOConfig
from trl.trainer.grpo_trainer import RewardFunc, nanstd
from trl.trainer.utils import pad

from axolotl.core.trainers.grpo.fast_async_trainer import FastAsyncGRPOTrainer
from axolotl.core.trainers.grpo.sampler import SequenceParallelRepeatRandomSampler
from axolotl.core.trainers.mixins import (
    DistributedParallelMixin,
    RngLoaderMixin,
    SchedulerMixin,
)
from axolotl.core.trainers.mixins.optimizer import OptimizerInitMixin, OptimizerMixin
from axolotl.monkeypatch.ring_attn import get_ring_attn_group

if is_peft_available():
    from peft import PeftConfig


class AxolotlGRPOTrainer(
    RngLoaderMixin,
    SchedulerMixin,
    OptimizerMixin,
    OptimizerInitMixin,
    DistributedParallelMixin,
    GRPOTrainer,
):
    """Extend the base GRPOTrainer for axolotl helpers"""

    _tag_names = ["trl", "grpo", "axolotl"]


class AxolotlAsyncGRPOTrainer(
    RngLoaderMixin,
    SchedulerMixin,
    OptimizerMixin,
    OptimizerInitMixin,
    DistributedParallelMixin,
    FastAsyncGRPOTrainer,
):
    """Extend AsyncGRPOTrainer with axolotl helpers"""

    _tag_names = ["trl", "grpo", "async", "axolotl"]


class AxolotlGRPOSequenceParallelTrainer(AxolotlGRPOTrainer):
    """Extend the base GRPOTrainer for sequence parallelism handling"""

    def __init__(
        self,
        model: str | PreTrainedModel,
        reward_funcs: RewardFunc | list[RewardFunc],
        args: GRPOConfig | None = None,
        train_dataset: Dataset | IterableDataset | None = None,
        eval_dataset: (
            Dataset | IterableDataset | dict[str, Dataset | IterableDataset] | None
        ) = None,
        processing_class: PreTrainedTokenizerBase | None = None,
        reward_processing_classes: (
            PreTrainedTokenizerBase | list[PreTrainedTokenizerBase] | None
        ) = None,
        callbacks: list[TrainerCallback] | None = None,
        optimizers: tuple[
            torch.optim.Optimizer | None, torch.optim.lr_scheduler.LambdaLR | None
        ] = (None, None),
        peft_config: "PeftConfig | None" = None,
        optimizer_cls_and_kwargs: tuple[type, dict] | None = None,
    ):
        # First call the superclass constructor with all arguments
        super().__init__(
            model=model,
            reward_funcs=reward_funcs,
            args=args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            processing_class=processing_class,
            reward_processing_classes=reward_processing_classes,
            callbacks=callbacks,
            optimizers=optimizers,
            peft_config=peft_config,
            optimizer_cls_and_kwargs=optimizer_cls_and_kwargs,
        )

        # Get number of SP groups (number of processes divided by SP degree)
        num_processes = self.accelerator.num_processes
        num_sp_groups = num_processes // self.args.context_parallel_size

        # Calculate batch size per SP group (not per process)
        sp_group_batch_size = self.args.per_device_train_batch_size * num_sp_groups
        possible_values = [
            n_gen
            for n_gen in range(2, sp_group_batch_size + 1)
            if (sp_group_batch_size) % n_gen == 0
        ]

        if self.num_generations not in possible_values:
            raise ValueError(
                f"The batch size per SP group ({num_sp_groups} x "
                f"{self.args.per_device_train_batch_size}) must be evenly divisible by "
                f"the number of generations per prompt ({self.num_generations}). Given "
                "the current configuration, the valid values for the number of "
                f"generations are: {possible_values}."
            )

        if self.args.eval_strategy != "no":
            # If sequence parallelism is enabled, calculate batch size per SP group
            sp_group_eval_batch_size = args.per_device_eval_batch_size * num_sp_groups  # type: ignore[union-attr]
            possible_values = [
                n_gen
                for n_gen in range(2, sp_group_eval_batch_size + 1)
                if (sp_group_eval_batch_size) % n_gen == 0
            ]

            if self.num_generations not in possible_values:
                raise ValueError(
                    f"With sequence parallelism (degree {self.args.context_parallel_size}), "
                    f"the eval batch size per SP group ({num_sp_groups} x {self.args.per_device_eval_batch_size}) "
                    f"must be evenly divisible by the number of generations per prompt "
                    f"({self.num_generations}). Given the current eval batch size, "
                    f"the valid values for the number of generations are: {possible_values}."
                )

        self.sp_group = None
        self.rank = dist.get_rank()
        self.world_size = dist.get_world_size()
        self.local_rank = 0
        self.local_world_size = 1

    def train(self, *args, **kwargs):
        # Initialize the SP group
        self.sp_group = get_ring_attn_group()
        self.rank = dist.get_rank()
        self.world_size = dist.get_world_size()
        self.local_rank = dist.get_rank(group=self.sp_group)
        self.local_world_size = dist.get_world_size(group=self.sp_group)

        return super().train(*args, **kwargs)

    def _get_train_sampler(self) -> Sampler:
        effective_batch_size = (
            self.args.per_device_train_batch_size
            * self.world_size
            * self.args.gradient_accumulation_steps
        )

        return SequenceParallelRepeatRandomSampler(
            dataset=self.train_dataset,
            mini_repeat_count=self.num_generations,
            world_size=self.world_size,
            rank=self.rank,
            batch_size=effective_batch_size
            // self.num_generations
            // self.args.context_parallel_size,
            repeat_count=self.num_iterations * self.args.gradient_accumulation_steps,
            context_parallel_size=self.args.context_parallel_size,
            shuffle=True,
            seed=self.args.seed,
            drop_last=True,
        )

    def _create_dataloader_params(self, is_eval=False, custom_batch_size=None):
        """Create common dataloader parameters for train or eval."""
        batch_size = custom_batch_size or (
            self.args.eval_batch_size if is_eval else self._train_batch_size
        )

        params = {
            "batch_size": batch_size,
            "collate_fn": self.data_collator,
            "num_workers": self.args.dataloader_num_workers,
            "pin_memory": self.args.dataloader_pin_memory,
        }

        # Add persistent workers only for training
        if not is_eval and hasattr(self.args, "dataloader_persistent_workers"):
            params["persistent_workers"] = self.args.dataloader_persistent_workers

        # Add prefetch factor if specified
        if self.args.dataloader_prefetch_factor:
            params["prefetch_factor"] = self.args.dataloader_prefetch_factor

        return params

    def _prepare_dataloader(
        self, dataset, sampler, is_eval=False, custom_batch_size=None
    ):
        """Prepare a dataloader with the given dataset and sampler."""
        # Get base parameters
        dataloader_params = self._create_dataloader_params(is_eval, custom_batch_size)

        # Add sampler configuration
        if not isinstance(dataset, torch.utils.data.IterableDataset):
            if isinstance(sampler, BatchSampler):
                # batch_size and batch_sampler are mutually exclusive
                dataloader_params["batch_sampler"] = sampler
                del dataloader_params["batch_size"]
            else:
                dataloader_params["sampler"] = sampler
                dataloader_params["drop_last"] = self.args.dataloader_drop_last

            if not is_eval:
                dataloader_params["worker_init_fn"] = partial(
                    seed_worker,
                    num_workers=self.args.dataloader_num_workers,
                    rank=self.args.process_index,
                )

        # Create the dataloader
        dataloader = DataLoader(dataset, **dataloader_params)

        if self.args.sample_packing and (
            (not is_eval and not self.args.pretraining)
            or (is_eval and self.args.eval_sample_packing is not False)
        ):
            self.accelerator.even_batches = False

        # Return unprepared dataloader if using sequence parallelism
        # TODO(djsaunde): We might be able to use `accelerate`'s dataloader preparation
        # if we use `dispatch_batches` and `slice_fn_for_dispatch` properly (i.e.,
        # slice each batch along the sequence dimension).
        if self.args.context_parallel_size > 1:
            return dataloader

        # Otherwise prepare with accelerator
        return self.accelerator.prepare_data_loader(dataloader)

    def get_train_dataloader(self) -> DataLoader:
        """Get dataloader for training"""
        train_dataset = self.train_dataset

        data_collator = self.data_collator  # type: ignore

        # Handle dataset preprocessing
        if isinstance(train_dataset, datasets.Dataset):
            # Add debug print before any modifications
            if self.args.sample_packing and not self.args.pretraining:
                train_dataset = train_dataset.remove_columns(["length"])
            if not self.args.sample_packing or self.args.pretraining:
                train_dataset = self._remove_unused_columns(
                    train_dataset, description="training"
                )
        else:
            self.data_collator = self._get_collator_with_removed_columns(
                data_collator,
                description="training",
            )

        # Get sampler and create dataloader
        sampler = self._get_train_sampler()
        dataloader = self._prepare_dataloader(train_dataset, sampler, is_eval=False)

        return dataloader

    def _generate_and_score_completions(
        self, inputs: list[dict[str, torch.Tensor | Any]]
    ) -> dict[str, torch.Tensor | Any]:
        device = self.accelerator.device
        mode = "eval" if self.control.should_evaluate else "train"

        prompts = [x["prompt"] for x in inputs]
        prompts_text = [
            maybe_apply_chat_template(example, self.processing_class)["prompt"]
            for example in inputs
        ]
        prompt_inputs = self.processing_class(
            text=prompts_text,
            return_tensors="pt",
            padding=True,
            padding_side="left",
            add_special_tokens=False,
        )
        prompt_inputs = Trainer._prepare_inputs(self, prompt_inputs)
        prompt_ids, prompt_mask = (
            prompt_inputs["input_ids"],
            prompt_inputs["attention_mask"],
        )

        if self.max_prompt_length is not None:
            prompt_ids = prompt_ids[:, -self.max_prompt_length :]
            prompt_mask = prompt_mask[:, -self.max_prompt_length :]

        # Generate completions using either vLLM or regular generation
        if self.args.use_vllm:
            # First, have main process load weights if needed

            if self.state.global_step != self._last_loaded_step:  # type: ignore[has-type]
                self._move_model_to_vllm()

                self._last_loaded_step = self.state.global_step

            # Generate completions using vLLM: gather all prompts and use them in a single call in the main process
            all_prompts_text = gather_object(prompts_text)
            if self.accelerator.is_main_process:
                if self.args.context_parallel_size > 1:
                    # Calculate sequence parallel group information
                    world_size = self.accelerator.num_processes
                    context_parallel_size = self.args.context_parallel_size
                    num_sp_groups = world_size // context_parallel_size

                    # Since processes in the same SP group have the same prompts, we need to ensure
                    # we only take one copy of each prompt from each SP group
                    ordered_set_of_prompts = []
                    for sp_group_id in range(num_sp_groups):
                        # Get the first process from each SP group (typically the group leader)
                        group_leader_rank = sp_group_id * context_parallel_size

                        # Extract prompts from this SP group, accounting for num_generations duplicates
                        # We only need prompts from one rank in each SP group
                        group_prompts = all_prompts_text[
                            group_leader_rank * len(prompts_text) : (
                                group_leader_rank + 1
                            )
                            * len(prompts_text) : self.num_generations
                        ]

                        ordered_set_of_prompts.extend(group_prompts)
                else:
                    # Since 'prompts' contains 'num_generations' duplicates, we first take unique prompts, and generate
                    # num_generations outputs for each one. This is faster than generating outputs for each duplicate
                    # prompt individually.
                    ordered_set_of_prompts = all_prompts_text[
                        :: self.num_generations * self.args.context_parallel_size
                    ]

                with profiling_context(self, "vLLM.generate"):
                    completion_ids = self.vllm_client.generate(
                        prompts=ordered_set_of_prompts,
                        n=self.num_generations,
                        repetition_penalty=self.repetition_penalty,
                        temperature=self.temperature,
                        top_p=self.top_p,
                        top_k=-1 if self.top_k is None else self.top_k,
                        min_p=0.0 if self.min_p is None else self.min_p,
                        max_tokens=self.max_completion_length,
                        guided_decoding_regex=self.guided_decoding_regex,
                    )
            else:
                completion_ids = [None] * (
                    len(all_prompts_text) // self.args.context_parallel_size
                )

            # Broadcast the completions from the main process to all processes
            completion_ids = broadcast_object_list(completion_ids, from_process=0)

            # Determine the appropriate slice based on sequence parallelism
            if self.args.context_parallel_size > 1:
                # Calculate SP group ID (which group of ranks this rank belongs to)
                sp_group_id = self.accelerator.process_index // self.local_world_size

                # Calculate the start index for this SP group
                sp_group_start = sp_group_id * len(prompts) * self.local_world_size

                # All ranks in the same SP group get the same data slice
                process_slice = slice(
                    sp_group_start,
                    sp_group_start + len(prompts),
                )
                completion_ids = completion_ids[process_slice]
            else:
                # Original behavior for non-sequence parallel case
                process_slice = slice(
                    self.accelerator.process_index * len(prompts),
                    (self.accelerator.process_index + 1) * len(prompts),
                )
                completion_ids = completion_ids[process_slice]

            # Pad the completions, and concatenate them with the prompts
            completion_ids = [
                torch.tensor(ids, device=device) for ids in completion_ids
            ]
            completion_ids = pad(
                completion_ids, padding_value=self.processing_class.pad_token_id
            )
            prompt_completion_ids = torch.cat([prompt_ids, completion_ids], dim=1)
        else:
            # Regular generation path
            with unwrap_model_for_generation(
                self.model_wrapped,
                self.accelerator,
                gather_deepspeed3_params=self.args.ds3_gather_for_generation,
            ) as unwrapped_model:
                prompt_completion_ids = unwrapped_model.generate(
                    prompt_ids,
                    attention_mask=prompt_mask,
                    generation_config=self.generation_config,
                )

            # Compute prompt length and extract completion ids
            prompt_length = prompt_ids.size(1)
            prompt_ids = prompt_completion_ids[:, :prompt_length]
            completion_ids = prompt_completion_ids[:, prompt_length:]

        # Mask everything after the first EOS token
        is_eos = completion_ids == self.processing_class.eos_token_id
        eos_idx = torch.full(
            (is_eos.size(0),), is_eos.size(1), dtype=torch.long, device=device
        )
        eos_idx[is_eos.any(dim=1)] = is_eos.int().argmax(dim=1)[is_eos.any(dim=1)]
        sequence_indices = torch.arange(is_eos.size(1), device=device).expand(
            is_eos.size(0), -1
        )
        completion_mask = (sequence_indices <= eos_idx.unsqueeze(1)).int()

        # If mask_truncated_completions is enabled, zero out truncated completions in completion_mask
        if self.args.mask_truncated_completions:
            truncated_completions = ~is_eos.any(dim=1)
            completion_mask = (
                completion_mask * (~truncated_completions).unsqueeze(1).int()
            )

        # Concatenate prompt_mask with completion_mask for logit computation
        attention_mask = torch.cat([prompt_mask, completion_mask], dim=1)  # (B, P+C)

        logits_to_keep = completion_ids.size(
            1
        )  # we only need to compute the logits for the completion tokens
        batch_size = (
            self.args.per_device_train_batch_size
            if mode == "train"
            else self.args.per_device_eval_batch_size
        )

        with torch.no_grad():
            # When using num_iterations == 1, old_per_token_logps == per_token_logps, so we can skip it's
            # computation here, and use per_token_logps.detach() instead.
            if self.num_iterations > 1:
                old_per_token_logps = self._get_per_token_logps(
                    self.model,
                    prompt_completion_ids,
                    attention_mask,
                    logits_to_keep,
                    batch_size,
                )
            else:
                old_per_token_logps = None

            if self.beta == 0.0:
                ref_per_token_logps = None
            elif self.ref_model is not None:
                ref_per_token_logps = self._get_per_token_logps(
                    self.ref_model,
                    prompt_completion_ids,
                    attention_mask,
                    logits_to_keep,
                    batch_size,
                )
            else:
                with self.accelerator.unwrap_model(self.model).disable_adapter():
                    ref_per_token_logps = self._get_per_token_logps(
                        self.model,
                        prompt_completion_ids,
                        attention_mask,
                        logits_to_keep,
                        batch_size,
                    )

        # Decode the generated completions
        completions_text = self.processing_class.batch_decode(
            completion_ids, skip_special_tokens=True
        )
        if is_conversational(inputs[0]):
            completions = []
            for prompt, completion in zip(prompts, completions_text, strict=False):
                bootstrap = (
                    prompt.pop()["content"] if prompt[-1]["role"] == "assistant" else ""
                )
                completions.append(
                    [{"role": "assistant", "content": bootstrap + completion}]
                )
        else:
            completions = completions_text

        rewards_per_func = torch.zeros(
            len(prompts), len(self.reward_funcs), device=device
        )
        for i, (reward_func, reward_processing_class, reward_func_name) in enumerate(
            zip(
                self.reward_funcs,
                self.reward_processing_classes,
                self.reward_func_names,
                strict=False,
            )
        ):
            with profiling_context(self, reward_func_name):
                if isinstance(
                    reward_func, nn.Module
                ):  # Module instead of PretrainedModel for compat with compiled models
                    if is_conversational(inputs[0]):
                        messages = [
                            {"messages": p + c}
                            for p, c in zip(prompts, completions, strict=False)
                        ]
                        texts = [
                            apply_chat_template(x, reward_processing_class)["text"]
                            for x in messages
                        ]
                    else:
                        texts = [
                            p + c for p, c in zip(prompts, completions, strict=False)
                        ]
                    reward_inputs = reward_processing_class(
                        text=texts,
                        return_tensors="pt",
                        padding=True,
                        padding_side="right",
                        add_special_tokens=False,
                    )
                    reward_inputs = Trainer._prepare_inputs(self, reward_inputs)
                    with torch.inference_mode():
                        rewards_per_func[:, i] = reward_func(**reward_inputs).logits[
                            :, 0
                        ]  # Shape (B*G,)
                else:
                    # Repeat all input columns (but "prompt" and "completion") to match the number of generations
                    keys = [
                        key for key in inputs[0] if key not in ["prompt", "completion"]
                    ]
                    reward_kwargs = {
                        key: [example[key] for example in inputs] for key in keys
                    }
                    output_reward_func = reward_func(
                        prompts=prompts, completions=completions, **reward_kwargs
                    )
                    # Convert None values to NaN
                    output_reward_func = [
                        reward if reward is not None else torch.nan
                        for reward in output_reward_func
                    ]

                    rewards_per_func[:, i] = torch.tensor(
                        output_reward_func, dtype=torch.float32, device=device
                    )

        # If all reward functions return None for a given row, issue a detailed warning
        if torch.isnan(rewards_per_func).all(dim=1).any():
            nan_row_idx = (
                torch.isnan(rewards_per_func).all(dim=1).nonzero(as_tuple=True)[0][0]
            )
            row_reward_kwargs = {
                key: value[nan_row_idx] for key, value in reward_kwargs.items()
            }
            row_reward_kwargs["prompt"] = prompts[nan_row_idx]
            row_reward_kwargs["completion"] = completions[nan_row_idx]
            warnings.warn(
                f"All reward functions returned None for the following kwargs: {row_reward_kwargs}. "
                "Please ensure that at least one reward function returns a valid reward.",
                stacklevel=2,
            )

        # Gather the reward per function: this part is crucial, because the rewards are normalized per group and the
        # completions may be distributed across processes
        rewards_per_func = gather(rewards_per_func)

        # Apply weights to each reward function's output and sum
        rewards = (
            rewards_per_func * self.reward_weights.to(device).unsqueeze(0)
        ).nansum(dim=1)

        # Compute grouped-wise rewards
        mean_grouped_rewards = rewards.view(-1, self.num_generations).mean(dim=1)
        std_grouped_rewards = rewards.view(-1, self.num_generations).std(dim=1)

        # Normalize the rewards to compute the advantages
        mean_grouped_rewards = mean_grouped_rewards.repeat_interleave(
            self.num_generations, dim=0
        )
        std_grouped_rewards = std_grouped_rewards.repeat_interleave(
            self.num_generations, dim=0
        )
        advantages = rewards - mean_grouped_rewards
        if self.args.scale_rewards:
            advantages = advantages / (std_grouped_rewards + 1e-4)

        # Slice to keep only the local part of the data
        if self.args.context_parallel_size > 1:
            # Calculate SP group ID (which group of ranks this rank belongs to)
            sp_group_id = self.accelerator.process_index // self.local_world_size

            # Calculate the start index for this SP group
            sp_group_start = sp_group_id * len(prompts) * self.local_world_size

            # All ranks in the same SP group get the same data slice
            process_slice = slice(
                sp_group_start,
                sp_group_start + len(prompts),
            )
        else:
            # Original behavior for non-sequence parallel case
            process_slice = slice(
                self.accelerator.process_index * len(prompts),
                (self.accelerator.process_index + 1) * len(prompts),
            )
        advantages = advantages[process_slice]

        # Log the metrics
        if mode == "train":
            self._total_train_tokens += (
                self.accelerator.gather_for_metrics(attention_mask.sum()).sum().item()
            )
        self._metrics[mode]["num_tokens"] = [self._total_train_tokens]

        # log completion lengths, mean, min, max
        agg_completion_mask = self.accelerator.gather_for_metrics(
            completion_mask.sum(1)
        )
        self._metrics[mode]["completions/mean_length"].append(
            agg_completion_mask.float().mean().item()
        )
        self._metrics[mode]["completions/min_length"].append(
            agg_completion_mask.float().min().item()
        )
        self._metrics[mode]["completions/max_length"].append(
            agg_completion_mask.float().max().item()
        )

        # identify sequences that terminated with EOS and log their lengths
        agg_terminated_with_eos = self.accelerator.gather_for_metrics(is_eos.any(dim=1))
        term_completion_mask = agg_completion_mask[agg_terminated_with_eos]
        clipped_completions_ratio = 1 - len(term_completion_mask) / len(
            agg_completion_mask
        )
        self._metrics[mode]["completions/clipped_ratio"].append(
            clipped_completions_ratio
        )
        if len(term_completion_mask) == 0:
            # edge case where no completed sequences are found
            term_completion_mask = torch.zeros(1, device=device)
        self._metrics[mode]["completions/mean_terminated_length"].append(
            term_completion_mask.float().mean().item()
        )
        self._metrics[mode]["completions/min_terminated_length"].append(
            term_completion_mask.float().min().item()
        )
        self._metrics[mode]["completions/max_terminated_length"].append(
            term_completion_mask.float().max().item()
        )

        # Calculate mean reward per function, but only for samples where the function was applied (non-NaN values)
        for i, reward_func_name in enumerate(self.reward_func_names):
            mean_rewards = torch.nanmean(rewards_per_func[:, i]).item()
            self._metrics[mode][f"rewards/{reward_func_name}/mean"].append(mean_rewards)
            std_rewards = nanstd(rewards_per_func[:, i]).item()
            self._metrics[mode][f"rewards/{reward_func_name}/std"].append(std_rewards)
        self._metrics[mode]["reward"].append(mean_grouped_rewards.mean().item())
        self._metrics[mode]["reward_std"].append(std_grouped_rewards.mean().item())

        # Log prompt and completion texts
        self._textual_logs["prompt"].extend(gather_object(prompts_text))
        self._textual_logs["completion"].extend(gather_object(completions_text))
        for i, name in enumerate(self.reward_func_names):
            self._textual_logs["rewards"][name].extend(rewards_per_func[:, i].tolist())

        return {
            "prompt_ids": prompt_ids,
            "prompt_mask": prompt_mask,
            "completion_ids": completion_ids,
            "completion_mask": completion_mask,
            "advantages": advantages,
            "old_per_token_logps": old_per_token_logps,
            "ref_per_token_logps": ref_per_token_logps,
        }


================================================
FILE: src/axolotl/core/trainers/mamba.py
================================================
"""Module for mamba trainer"""

import torch

from axolotl.core.trainers.base import AxolotlTrainer


class AxolotlMambaTrainer(AxolotlTrainer):
    """Mamba specific trainer to handle loss calculation"""

    tag_names = ["axolotl", "mamba"]

    def compute_loss(
        self,
        model,
        inputs,
        return_outputs=False,
        num_items_in_batch=None,
    ):
        input_ids = inputs.pop("input_ids")
        lm_logits = model(input_ids).logits

        labels = input_ids.to(lm_logits.device)
        shift_logits = lm_logits[:, :-1, :].contiguous()
        labels = labels[:, 1:].contiguous()

        loss_fct = torch.nn.CrossEntropyLoss()
        lm_loss = loss_fct(
            shift_logits.view(-1, shift_logits.size(-1)), labels.view(-1)
        )

        return lm_loss


================================================
FILE: src/axolotl/core/trainers/mixins/__init__.py
================================================
"""Init for axolotl.core.trainers.mixins"""

# flake8: noqa

from .activation_checkpointing import ActivationOffloadingMixin
from .checkpoints import CheckpointSaveMixin
from .distributed_parallel import DistributedParallelMixin
from .optimizer import OptimizerMixin
from .packing import PackingMixin
from .rng_state_loader import RngLoaderMixin
from .scheduler import SchedulerMixin


================================================
FILE: src/axolotl/core/trainers/mixins/activation_checkpointing.py
================================================
"""
Trainer mixin for activation checkpointing w offloading
"""

import contextlib

from peft import PeftModel
from torch import nn
from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
    apply_activation_checkpointing,
)
from torch.distributed.fsdp.wrap import ModuleWrapPolicy
from transformers import GradientCheckpointingLayer, Trainer
from trl.models.activation_offloading import (
    NoOpManager,
    OffloadActivations,
    get_act_offloading_ctx_manager,
)

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


class ActivationOffloadingMixin(Trainer):
    """
    Trainer mixin class for activation checkpointing w offloading
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        if self.args.activation_offloading:
            if isinstance(self.model, PeftModel):
                self.activation_offload_context = get_lora_act_offloading_ctx_manager(
                    self.model, use_streams=True
                )
            else:
                self.activation_offload_context = get_act_offloading_ctx_manager(
                    self.model, use_streams=True
                )
        else:
            self.activation_offload_context = contextlib.nullcontext()

    def training_step(self, *args, **kwargs):
        with self.activation_offload_context:
            return super().training_step(*args, **kwargs)


def ac_wrap_hf_model(model: nn.Module, **kwargs):
    auto_wrap_policy = ModuleWrapPolicy(set((GradientCheckpointingLayer,)))
    apply_activation_checkpointing(model, auto_wrap_policy=auto_wrap_policy, **kwargs)


def get_lora_act_offloading_ctx_manager(
    model: nn.Module,
    use_pin_memory: bool = True,
    use_streams: bool = True,
    min_offload_size: int = 1024,
    max_fwd_stash_size: int = 5,
    warn_if_no_head: bool = True,
) -> OffloadActivations:
    """
    Returns the activation offloading context manager for the model. All but the last output Linear in every step will
    be offloaded.

    If activation offloading is enabled, we return the OffloadActivations context manager. If activation offloading is
    disabled, we return a NoOpManager context manager.

    Args:
        model (`nn.Module`):
            Model to wrap with the activation offloading context manager.
        use_pin_memory (`bool`, *optional*, defaults to `True`):
            Whether to offloaded Tensor will be placed in pinned memory on the CPU. Pinned memory allows the Tensor to
            be moved back onto GPU more quickly but is a limited resource.
        use_streams (`bool`, *optional*, defaults to `True`):
            Whether to use streams for performance optimization where the communications get overlapped with the
            computation. Requires a torch build after torch-2.5.0.
        min_offload_size (`int`, *optional*, defaults to `1024`):
            Minimum number of bytes a Tensor must be in order to qualify for offloading. If the tensor is too small, we
            do not want to waste bandwidth and resources moving it to CPU and back.
        max_fwd_stash_size (`int`, *optional*, defaults to `5`):
            Maximum size of the forward stash, or the maximum number of consecutive activations to keep alive during
            the forward pass. This number must be at least 1. Keeping alive more activations will potentially allow
            more overlap between the communication and compute streams at the cost of increasing memory usage. Keeping
            alive fewer activations will conserve memory, but may cause poor overlap between the streams, increasing
            runtime.
        warn_if_no_head (`bool`, *optional*, defaults to `True`):
            Whether to warn if no output head is detected. If set to `False`, no warning will be raised if no output
            head is detected.

    Returns:
        `contextlib.ContextDecorator`:
            Activation offloading context manager for the model.
    """

    activations_handling_ctx = OffloadActivations(
        use_pin_memory=use_pin_memory,
        use_streams=use_streams,
        min_offload_size=min_offload_size,
        max_fwd_stash_size=max_fwd_stash_size,
    )

    # Below is our hack to disable offloading the last output Linear in every
    # step, as the cost for offloading the activation and then soon after bringing
    # it back is expensive.
    output_head_detected = False
    noop_ctx = NoOpManager()

    # Try to get the actual model if it's wrapped
    unwrapped_model = model
    if hasattr(unwrapped_model, "module"):
        unwrapped_model = unwrapped_model.module
    # check for PEFT models
    if hasattr(unwrapped_model, "base_model") and hasattr(
        unwrapped_model, "peft_config"
    ):
        unwrapped_model = unwrapped_model.base_model

    # Check for different types of output heads
    if hasattr(unwrapped_model, "output"):
        if isinstance(unwrapped_model.output, nn.Module):
            unwrapped_model.output.register_forward_pre_hook(
                lambda *args: noop_ctx.__enter__()
            )
            unwrapped_model.output.register_forward_hook(
                lambda *args: noop_ctx.__exit__(), always_call=True
            )
            output_head_detected = True
        elif hasattr(unwrapped_model.output, "linear") and isinstance(
            unwrapped_model.output.linear, nn.Module
        ):
            unwrapped_model.output.linear.register_forward_pre_hook(
                lambda *args: noop_ctx.__enter__()
            )
            unwrapped_model.output.linear.register_forward_hook(
                lambda *args: noop_ctx.__exit__(), always_call=True
            )
            output_head_detected = True

    # Check for HuggingFace model output heads
    elif hasattr(unwrapped_model, "lm_head"):
        unwrapped_model.lm_head.register_forward_pre_hook(
            lambda *args: noop_ctx.__enter__()
        )
        unwrapped_model.lm_head.register_forward_hook(
            lambda *args: noop_ctx.__exit__(), always_call=True
        )
        output_head_detected = True

    # Check for decoder-based models
    elif hasattr(unwrapped_model, "decoder"):
        decoder = unwrapped_model.decoder
        if hasattr(decoder, "output"):
            decoder.output.register_forward_pre_hook(lambda *args: noop_ctx.__enter__())
            decoder.output.register_forward_hook(
                lambda *args: noop_ctx.__exit__(), always_call=True
            )
            output_head_detected = True
        # Some models have lm_head in the decoder
        elif hasattr(decoder, "lm_head"):
            decoder.lm_head.register_forward_pre_hook(
                lambda *args: noop_ctx.__enter__()
            )
            decoder.lm_head.register_forward_hook(
                lambda *args: noop_ctx.__exit__(), always_call=True
            )
            output_head_detected = True

    # Check for transformer models with final layer norm
    elif hasattr(unwrapped_model, "final_layer_norm") or hasattr(
        unwrapped_model, "ln_f"
    ):
        final_norm = (
            getattr(unwrapped_model, "final_layer_norm", None) or unwrapped_model.ln_f
        )
        final_norm.register_forward_pre_hook(lambda *args: noop_ctx.__enter__())
        final_norm.register_forward_hook(
            lambda *args: noop_ctx.__exit__(), always_call=True
        )
        output_head_detected = True

    # Check for models with head module
    elif hasattr(unwrapped_model, "head") and isinstance(
        unwrapped_model.head, nn.Module
    ):
        unwrapped_model.head.register_forward_pre_hook(
            lambda *args: noop_ctx.__enter__()
        )
        unwrapped_model.head.register_forward_hook(
            lambda *args: noop_ctx.__exit__(), always_call=True
        )
        output_head_detected = True

    if not output_head_detected and warn_if_no_head:
        LOG.warning(
            "During activation offloading, no output head was detected. If your model has an output head, it will be "
            "offloaded. This usually greatly slows training, given the large vocabulary size. To change this "
            "behavior, set your output head as model.output and make it an nn.Module. You can disable this warning by "
            "passing `warn_if_no_head=False`."
        )

    for name, module in unwrapped_model.named_modules():
        # Disable offloading for any Liger modules
        if "liger" in name.lower():
            module.register_forward_pre_hook(lambda *args: noop_ctx.__enter__())
            module.register_forward_hook(
                lambda *args: noop_ctx.__exit__(), always_call=True
            )
        # disable offloading for any submodules to fix LoRA training
        if name.endswith("._checkpoint_wrapped_module"):
            for _, sub_module in module.named_modules():
                sub_module.register_forward_pre_hook(lambda *args: noop_ctx.__enter__())
                sub_module.register_forward_hook(
                    lambda *args: noop_ctx.__exit__(), always_call=True
                )

    return activations_handling_ctx


================================================
FILE: src/axolotl/core/trainers/mixins/checkpoints.py
================================================
"""Custom handling to not fail training if fsdp optimizer is not savable"""

from transformers import Trainer

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


class CheckpointSaveMixin(Trainer):
    """Mixin to handle saving the optimizer and scheduler if they are not savable."""

    def _save_optimizer_and_scheduler(self, output_dir):
        try:
            super()._save_optimizer_and_scheduler(output_dir)
        except (NotImplementedError, KeyError) as exc:
            # TODO: fix fsdp2 optimizer saving
            LOG.warning_once(
                f"Trainer does not support saving optimizer and scheduler:  {exc}\n"
                "Optimizer and scheduler states were not saved - resuming from checkpoints "
                "for this training run will not be possible.",
            )


================================================
FILE: src/axolotl/core/trainers/mixins/distributed_parallel.py
================================================
"""
Mixin for correctly saving fsdp
"""

from accelerate import PartialState
from transformers import Trainer


class DistributedParallelMixin(Trainer):
    """
    Mixin for correctly saving fsdp
    """

    def _save(self, output_dir: str | None = None, state_dict=None):
        if (
            state_dict is None
            and self.accelerator.parallelism_config
            and self.accelerator.parallelism_config.dp_shard_enabled
        ):
            state_dict = self.accelerator.get_state_dict(self.model)
        super()._save(output_dir, state_dict=state_dict)

    def create_accelerator_and_postprocess(self):
        super().create_accelerator_and_postprocess()
        if (
            self.accelerator.distributed_type == "FSDP"
            and self.accelerator.state.fsdp_plugin is None
        ):
            # handle Context Parallelism without FSDP
            self.accelerator.state.distributed_type = "MULTI_GPU"
            self.accelerator.state._shared_state["distributed_type"] = "MULTI_GPU"
            PartialState().distributed_type = "MULTI_GPU"


================================================
FILE: src/axolotl/core/trainers/mixins/optimizer.py
================================================
"""Module for Axolotl trainer optimizer mixin"""

from peft.optimizers import create_loraplus_optimizer
from torch import nn
from transformers.trainer import Trainer
from transformers.utils import is_sagemaker_mp_enabled

from axolotl.integrations.base import BaseOptimizerFactory
from axolotl.utils.logging import get_logger

if is_sagemaker_mp_enabled():
    import smdistributed.modelparallel.torch as smp

LOG = get_logger(__name__)


class OptimizerMixin(Trainer):
    """Mixin class for shared handling of building custom optimizers"""

    args = None  # type: "AxolotlTrainingArguments"  # type: ignore[name-defined]

    def create_optimizer_grouped_parameters(
        self, opt_model, optimizer_kwargs
    ) -> list[dict]:
        decay_parameters = self.get_decay_parameter_names(opt_model)
        params: dict = {
            "to_weight_decay": {},  # LayerNorm and bias
            "embeddings": {},  # lm_head, embed_tokens,
            "no_weight_decay": {},
        }
        lr_groups_lookup = {}
        lr_groups_learning_rates = {}
        if self.args.lr_groups:
            for lr_group in self.args.lr_groups:
                group_name = lr_group["name"]
                group_modules = lr_group["modules"]
                for module in group_modules:
                    lr_groups_lookup[module] = group_name
                lr_groups_learning_rates[group_name] = lr_group["lr"]
                params[f"to_weight_decay_{group_name}"] = {}

        for name, param in opt_model.named_parameters():
            if not param.requires_grad:
                continue
            if name.endswith("modules_to_save.default.weight") or any(
                embed_name in name for embed_name in ["embed_tokens", "lm_head"]
            ):
                params["embeddings"][name] = param
            elif name in decay_parameters:
                lr_group_modules = [
                    group_modules
                    for group_modules in lr_groups_lookup
                    if group_modules in name
                ]
                if lr_groups_lookup and any(lr_group_modules):
                    lr_group_module = lr_group_modules[0]
                    group_name = lr_groups_lookup[lr_group_module]
                    params[f"to_weight_decay_{group_name}"][name] = param
                else:
                    params["to_weight_decay"][name] = param
            else:
                params["no_weight_decay"][name] = param
        optimizer_grouped_parameters = []
        if params["to_weight_decay"]:
            optimizer_grouped_parameters.append(
                {
                    "params": list(params["to_weight_decay"].values()),
                    "weight_decay": self.args.weight_decay,
                    "lr": optimizer_kwargs["lr"],
                }
            )
        if params["embeddings"]:
            lr = optimizer_kwargs["lr"]
            if self.args.embedding_lr_scale:
                lr *= self.args.embedding_lr_scale
            elif self.args.embedding_lr:
                lr = self.args.embedding_lr
            optimizer_grouped_parameters.append(
                {
                    "params": list(params["embeddings"].values()),
                    "weight_decay": 0.0,
                    "lr": lr,
                }
            )
        if params["no_weight_decay"]:
            optimizer_grouped_parameters.append(
                {
                    "params": list(params["no_weight_decay"].values()),
                    "weight_decay": 0.0,
                    "lr": optimizer_kwargs["lr"],
                }
            )
        for group_name, group_lr in lr_groups_learning_rates.items():
            if params[f"to_weight_decay_{group_name}"]:
                optimizer_grouped_parameters.append(
                    {
                        "params": list(
                            params[f"to_weight_decay_{group_name}"].values()
                        ),
                        "weight_decay": self.args.weight_decay,
                        "lr": group_lr,
                    }
                )

        return optimizer_grouped_parameters

    def create_optimizer(self, model=None):
        if (
            self.args.loraplus_lr_ratio is None
            and self.args.embedding_lr_scale is None
            and self.args.embedding_lr is None
            and self.args.lr_groups is None
            and self.optimizer_cls_and_kwargs is None
        ):
            return super().create_optimizer(model=model)

        opt_model = self.model if model is None else model

        if (
            not self.optimizer
            and self.optimizer_cls_and_kwargs is not None
            and issubclass(self.optimizer_cls_and_kwargs[0], BaseOptimizerFactory)
        ):
            optimizer_factory_cls, optimizer_kwargs = self.optimizer_cls_and_kwargs
            self.optimizer = optimizer_factory_cls()(
                opt_model, self.args, **optimizer_kwargs
            )

        if not self.optimizer:
            if self.optimizer_cls_and_kwargs is not None:
                optimizer_cls, optimizer_kwargs = self.optimizer_cls_and_kwargs
            else:
                optimizer_cls, optimizer_kwargs = self.get_optimizer_cls_and_kwargs(
                    self.args, opt_model
                )

            optimizer_grouped_parameters = self.create_optimizer_grouped_parameters(
                opt_model, optimizer_kwargs
            )

            if self.args.loraplus_lr_ratio is not None:
                loraplus_lr_ratio = getattr(self.args, "loraplus_lr_ratio", None)
                loraplus_lr_embedding = getattr(
                    self.args, "loraplus_lr_embedding", 1e-6
                )
                self.optimizer = create_loraplus_optimizer(
                    opt_model,
                    optimizer_cls,
                    loraplus_lr_ratio=loraplus_lr_ratio,
                    loraplus_lr_embedding=loraplus_lr_embedding,
                    **optimizer_kwargs,
                )
            else:
                # Overwrite `params` in case it's created by `get_optimizer_cls_and_kwargs`
                # e.g. for GaLore optimizer.
                if "params" in optimizer_kwargs:
                    optimizer_grouped_parameters = optimizer_kwargs.pop("params")

                # Overwrite `model` in case it's created by `get_optimizer_cls_and_kwargs`
                # e.g. for LOMO optimizer.
                if "model" in optimizer_kwargs:
                    optimizer_grouped_parameters = optimizer_kwargs.pop("model")

                # For layer-wise dummy optimizers we overwrite optimizer_grouped_parameters with `optimizer_dict`
                # to avoid arguments conflicts.
                if "optimizer_dict" in optimizer_kwargs:
                    optimizer_grouped_parameters = optimizer_kwargs.pop(
                        "optimizer_dict"
                    )

                self.optimizer = optimizer_cls(
                    optimizer_grouped_parameters, **optimizer_kwargs
                )

            if optimizer_cls.__name__ == "Adam8bit":
                import bitsandbytes

                manager = bitsandbytes.optim.GlobalOptimManager.get_instance()

                skipped = 0
                for module in opt_model.modules():
                    if isinstance(module, nn.Embedding):
                        skipped += sum(
                            {
                                p.data_ptr(): p.numel() for p in module.parameters()
                            }.values()
                        )
                        LOG.info(f"skipped {module}: {skipped / 2**20}M params")
                        manager.register_module_override(
                            module, "weight", {"optim_bits": 32}
                        )
                        LOG.debug(f"bitsandbytes: will optimize {module} in fp32")
                LOG.info(f"skipped: {skipped / 2**20}M params")

        if is_sagemaker_mp_enabled():
            self.optimizer = smp.DistributedOptimizer(self.optimizer)

        return self.optimizer


class OptimizerInitMixin:
    """
    Mixin to handle common optimizer initialization logic for Trainers (mostly TRL) that do not
    accept optimizer_cls_and_kwargs as kwarg in constructor.
    """

    def __init__(self, *args, **kwargs):
        optimizer_cls_and_kwargs = kwargs.pop("optimizer_cls_and_kwargs", None)
        super().__init__(*args, **kwargs)
        if (
            optimizer_cls_and_kwargs
            and self.optimizer_cls_and_kwargs is None
            and self.optimizer is None
        ):
            self.optimizer_cls_and_kwargs = optimizer_cls_and_kwargs


================================================
FILE: src/axolotl/core/trainers/mixins/packing.py
================================================
"""Trainer mixin to support packing"""

from transformers import Trainer


class PackingMixin(Trainer):
    """
    Trainer mixin to support packing
    """

    def _set_signature_columns_if_needed(self):
        super()._set_signature_columns_if_needed()
        if (
            self._signature_columns
            and self.args.sample_packing
            and self.args.sample_packing_drop_attention_mask
        ):
            set_sig_columns = set(self._signature_columns)
            set_sig_columns.remove("attention_mask")
            self._signature_columns = list(set_sig_columns)


================================================
FILE: src/axolotl/core/trainers/mixins/rng_state_loader.py
================================================
"""
Temporary fix/override for bug in resume from checkpoint

See https://github.com/huggingface/transformers/pull/37162

TODO: Remove when upstream added PR to release
"""

import os
import random

import numpy as np
import torch
from transformers import Trainer, is_torch_npu_available
from transformers.trainer import safe_globals
from transformers.trainer_pt_utils import set_rng_state_for_device
from transformers.training_args import ParallelMode

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


class RngLoaderMixin(Trainer):
    """
    mixin for method override to load RNG states from a checkpoint
    """

    def _load_rng_state(self, checkpoint):
        # Load RNG states from `checkpoint`
        if checkpoint is None:
            return

        if self.args.world_size > 1:
            process_index = self.args.process_index
            rng_file = os.path.join(checkpoint, f"rng_state_{process_index}.pth")
            if not os.path.isfile(rng_file):
                LOG.info(
                    f"Didn't find an RNG file for process {process_index}, if you are resuming a training that "
                    "wasn't launched in a distributed fashion, reproducibility is not guaranteed."
                )
                return
        else:
            rng_file = os.path.join(checkpoint, "rng_state.pth")
            if not os.path.isfile(rng_file):
                LOG.info(
                    "Didn't find an RNG file, if you are resuming a training that was launched in a distributed "
                    "fashion, reproducibility is not guaranteed."
                )
                return

        # Use safe_globals to ensure numpy RNG states can be deserialized safely under PyTorch 2.6+,
        # which requires allowlisted classes when loading with weights_only=True.
        with safe_globals():
            checkpoint_rng_state = torch.load(rng_file)  # nosec B614
        random.setstate(checkpoint_rng_state["python"])
        np.random.set_state(checkpoint_rng_state["numpy"])
        torch.random.set_rng_state(checkpoint_rng_state["cpu"])

        is_distributed = self.args.parallel_mode == ParallelMode.DISTRIBUTED
        if torch.cuda.is_available():
            set_rng_state_for_device(
                "CUDA", torch.cuda, checkpoint_rng_state, is_distributed
            )
        if is_torch_npu_available():
            set_rng_state_for_device(
                "NPU", torch.npu, checkpoint_rng_state, is_distributed
            )


================================================
FILE: src/axolotl/core/trainers/mixins/scheduler.py
================================================
"""Module for Axolotl trainer scheduler mixin"""

import torch
from torch.optim.lr_scheduler import LRScheduler, OneCycleLR
from transformers.trainer import Trainer

from axolotl.integrations.base import PluginManager
from axolotl.utils.logging import get_logger
from axolotl.utils.schedulers import (
    JaggedLRRestartScheduler,
    RexLR,
    get_cosine_schedule_with_min_lr,
    get_cosine_schedule_with_quadratic_warmup,
    get_cosine_schedule_with_warmup_decay_constant,
)

LOG = get_logger(__name__)


class SchedulerMixin(Trainer):
    """
    Mixin class for scheduler setup in CausalTrainer.
    """

    args = None  # type: "AxolotlTrainingArguments"  # type: ignore[name-defined]

    def create_scheduler(
        self, num_training_steps: int, optimizer: None | torch.optim.Optimizer = None
    ) -> LRScheduler:
        """
        Set up the scheduler. The optimizer of the trainer must have been set up either before this method is called or
        passed as an argument.

        Args:
            num_training_steps (int): The number of training steps to do.
            optimizer (torch.optim.Optimizer): The training optimizer
        """
        use_cosine_quadratic = (
            self.args.lr_scheduler_type == "cosine"
            and self.args.lr_quadratic_warmup is True
        )

        use_cosine_min_lr = (
            self.args.lr_scheduler_type == "cosine"
            and self.args.cosine_min_lr_ratio is not None
        )

        if optimizer is None:
            if self.optimizer is None:
                raise ValueError(
                    "Optimizer must be set before calling create_scheduler or passed as an argument."
                )
            optimizer = self.optimizer

        # fmt: off
        if self.lr_scheduler is None:  # type: ignore
            # fmt: on
            plugin_manager = PluginManager.get_instance()
            lr_scheduler: LRScheduler | None = plugin_manager.create_lr_scheduler(
                trainer=self,
                optimizer=optimizer,
                num_training_steps=num_training_steps
            )
            if lr_scheduler is not None:
                LOG.info(f"Using plugin-created lr_scheduler: {lr_scheduler}")
                self.lr_scheduler = lr_scheduler
            elif self.args.alternate_lr_scheduler_type == "one_cycle":
                num_warmup_steps = self.args.get_warmup_steps(num_training_steps)
                pct_start = num_warmup_steps / num_training_steps
                extra_lr_kwargs = {}
                if "pct_start" not in self.args.lr_scheduler_kwargs:
                    extra_lr_kwargs["pct_start"] = pct_start
                if "anneal_strategy" not in self.args.lr_scheduler_kwargs:
                    extra_lr_kwargs["anneal_strategy"] = "cos"

                self.lr_scheduler = OneCycleLR(
                    optimizer,
                    max_lr=self.args.learning_rate,
                    total_steps=num_training_steps,
                    **extra_lr_kwargs,
                    **self.args.lr_scheduler_kwargs,
                )
            elif self.args.alternate_lr_scheduler_type == "rex":
                if use_cosine_min_lr:
                    assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"

                self.lr_scheduler = RexLR(
                    optimizer=optimizer,
                    max_lr=self.args.learning_rate,
                    min_lr=0 if not use_cosine_min_lr else (
                        self.args.learning_rate * self.args.cosine_min_lr_ratio),
                    total_steps=num_training_steps,
                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                )
            elif use_cosine_quadratic:
                if use_cosine_min_lr:
                    LOG.warning(
                        "Both cosine quadratic warmup and min lr detected. Using quadratic warmup.")

                self.lr_scheduler = get_cosine_schedule_with_quadratic_warmup(
                    optimizer,
                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                    num_training_steps=num_training_steps,
                )
            elif self.args.cosine_min_lr_ratio and self.args.cosine_constant_lr_ratio and use_cosine_min_lr:
                assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
                assert 0 <= self.args.cosine_constant_lr_ratio <= 1.0, "cosine_constant_lr_ratio must be between 0.0 and 1.0"
                self.lr_scheduler = get_cosine_schedule_with_warmup_decay_constant(
                    optimizer,
                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                    num_training_steps=num_training_steps,
                    min_lr_ratio=self.args.cosine_min_lr_ratio,
                    constant_lr_ratio=self.args.cosine_constant_lr_ratio,
                )
            elif self.args.cosine_min_lr_ratio and use_cosine_min_lr:
                assert 0 <= self.args.cosine_min_lr_ratio <= 1.0, "cosine_min_lr_ratio must be between 0.0 and 1.0"
                self.lr_scheduler = get_cosine_schedule_with_min_lr(
                    optimizer,
                    num_warmup_steps=self.args.get_warmup_steps(num_training_steps),
                    num_training_steps=num_training_steps,
                    min_lr_ratio=self.args.cosine_min_lr_ratio,
                )
            else:
                super().create_scheduler(num_training_steps, optimizer=optimizer)
        else:
            if use_cosine_quadratic:
                LOG.warning(
                    "axolotl's cosine scheduler with quadratic warmup not used (e.g., because of deepspeed).")

            if use_cosine_min_lr:
                LOG.warning(
                    "axolotl's cosine scheduler with min lr not used (e.g., because of deepspeed).")

        if self.args.jagged_restart_steps:
            warmup_steps = (
                self.args.jagged_restart_warmup_steps or 10
            )
            anneal_steps = (
                self.args.jagged_restart_anneal_steps or 1
            )
            if not self.lr_scheduler:
                super().create_scheduler(num_training_steps, optimizer)
            self.lr_scheduler = JaggedLRRestartScheduler(
                optimizer,
                self.lr_scheduler,
                self.args.jagged_restart_steps,
                warmup_steps,
                anneal_steps,
                min_lr_scale=self.args.cosine_min_lr_ratio or 0.001,
            )

        return self.lr_scheduler  # type: ignore


================================================
FILE: src/axolotl/core/trainers/trl.py
================================================
"""Module for TRL RL trainers"""

from trl import RewardTrainer
from trl.experimental.cpo import CPOTrainer
from trl.experimental.kto import KTOTrainer
from trl.experimental.orpo import ORPOTrainer
from trl.experimental.prm import PRMTrainer

from axolotl.core.trainers.mixins import DistributedParallelMixin, RngLoaderMixin
from axolotl.core.trainers.mixins.optimizer import OptimizerInitMixin, OptimizerMixin
from axolotl.core.trainers.mixins.scheduler import SchedulerMixin


class AxolotlORPOTrainer(
    RngLoaderMixin,
    SchedulerMixin,
    OptimizerMixin,
    OptimizerInitMixin,
    DistributedParallelMixin,
    ORPOTrainer,
):
    """
    Extend the base ORPOTrainer for axolotl helpers
    """

    tag_names = ["axolotl", "orpo"]


class AxolotlKTOTrainer(
    RngLoaderMixin,
    SchedulerMixin,
    OptimizerMixin,
    OptimizerInitMixin,
    DistributedParallelMixin,
    KTOTrainer,
):
    """
    Extend the base KTOTrainer for axolotl helpers
    """

    tag_names = ["axolotl", "kto"]


class AxolotlCPOTrainer(
    RngLoaderMixin,
    SchedulerMixin,
    OptimizerMixin,
    OptimizerInitMixin,
    DistributedParallelMixin,
    CPOTrainer,
):
    """
    Extend the base CPOTrainer for axolotl helpers
    """

    tag_names = ["axolotl", "cpo"]


class AxolotlRewardTrainer(
    RngLoaderMixin,
    SchedulerMixin,
    OptimizerMixin,
    OptimizerInitMixin,
    DistributedParallelMixin,
    RewardTrainer,
):
    """
    Extend the base RewardTrainer for axolotl helpers
    """

    tag_names = ["axolotl", "reward"]


class AxolotlPRMTrainer(
    RngLoaderMixin,
    SchedulerMixin,
    OptimizerMixin,
    OptimizerInitMixin,
    DistributedParallelMixin,
    PRMTrainer,
):
    """
    Extend the base trl.PRMTrainer for axolotl helpers
    """

    tag_names = ["axolotl", "prm"]


================================================
FILE: src/axolotl/core/trainers/utils.py
================================================
"""Utils for Axolotl trainers"""


def sanitize_kwargs_for_tagging(tag_names, kwargs=None):
    if isinstance(tag_names, str):
        tag_names = [tag_names]

    if kwargs is not None:
        if "tags" not in kwargs:
            kwargs["tags"] = tag_names
        elif "tags" in kwargs and isinstance(kwargs["tags"], list):
            kwargs["tags"].extend(tag_names)
        elif "tags" in kwargs and isinstance(kwargs["tags"], str):
            tag_names.append(kwargs["tags"])
            kwargs["tags"] = tag_names

    return kwargs


def sanitize_kwargs_for_ds_tagging(dataset_tags, kwargs=None):
    if isinstance(dataset_tags, str):
        dataset_tags = [dataset_tags]

    if (dataset_tags is not None) and (kwargs is not None):
        if "dataset_tags" not in kwargs:
            kwargs["dataset_tags"] = dataset_tags
        elif "dataset_tags" in kwargs and isinstance(kwargs["dataset_tags"], list):
            kwargs["dataset_tags"].extend(dataset_tags)
        elif "dataset_tags" in kwargs and isinstance(kwargs["dataset_tags"], str):
            dataset_tags.append(kwargs["dataset_tags"])
            kwargs["dataset_tags"] = dataset_tags

    return kwargs


================================================
FILE: src/axolotl/core/training_args.py
================================================
"""
extra axolotl specific training args
"""

from __future__ import annotations

from dataclasses import dataclass, field
from typing import Optional, Type

from transformers import TrainingArguments
from trl import RewardConfig
from trl.experimental.cpo import CPOConfig
from trl.experimental.kto import KTOConfig
from trl.experimental.orpo import ORPOConfig
from trl.experimental.prm import PRMConfig

from axolotl.integrations.config import merge_training_args

AxolotlTrainingMixins: Type = merge_training_args()


@dataclass
class AxolotlTrainingArguments(AxolotlTrainingMixins, TrainingArguments):
    """
    Training arguments for Causal trainer

    This code is duplicated due to HF TrainingArguments not setting output_dir with a
    default value so it can't be used as a mixin.
    """


@dataclass
class AxolotlORPOConfig(AxolotlTrainingMixins, ORPOConfig):
    """
    ORPO config for ORPO training
    """


@dataclass
class AxolotlKTOConfig(AxolotlTrainingMixins, KTOConfig):
    """
    KTO config for KTO training
    """


@dataclass
class AxolotlCPOConfig(AxolotlTrainingMixins, CPOConfig):
    """
    CPO config for CPO training
    """

    simpo_gamma: Optional[float] = field(
        default=None,
        metadata={"help": "simpo gamma parameter"},
    )


@dataclass
class AxolotlRewardConfig(AxolotlTrainingMixins, RewardConfig):
    """
    Reward config for Reward training
    """


@dataclass
class AxolotlPRMConfig(AxolotlTrainingMixins, PRMConfig):
    """
    PRM config for PRM training
    """


================================================
FILE: src/axolotl/core/training_args_base.py
================================================
"""
Base Axolotl Training Mixins shared across various trainer configs
"""

from dataclasses import dataclass, field
from typing import Optional

from PIL.Image import Resampling


@dataclass
class AxolotlTrainingMixins:
    """
    Mixin class for the Axolotl training args.
    """

    model_type: Optional[str] = field(
        default=None, metadata={"help": "HF model configuration model_type."}
    )
    lr_quadratic_warmup: bool = field(
        default=False,
        metadata={"help": "Use quadratic warmup for cosine scheduling."},
    )
    pretraining: bool = field(
        default=False,
        metadata={
            "help": "Indicates to trainer whether we are doing continued pretraining."
        },
    )
    sample_packing: bool = field(
        default=False,
        metadata={"help": "Use sample packing for efficient training."},
    )
    sample_packing_sequentially: bool = field(
        default=False,
        metadata={
            "help": "Use next-fit sample packing that preserves the order of samples coming from the sampler. Use in combination with curriculum_sampling for fully sequential packing."
        },
    )
    sample_packing_mp_start_method: str | None = field(
        default=None,
        metadata={"help": "The multiprocessing start method to use."},
    )
    sample_packing_drop_attention_mask: bool = field(
        default=False,
        metadata={"help": "Drop attention mask from inputs when using packing."},
    )
    multipack_real_batches: bool = field(
        default=False,
        metadata={"help": "Use real batches for efficient training."},
    )
    include_tkps: bool = field(
        default=True,
        metadata={
            "help": "Whether to include tokens per second in the training metrics."
        },
    )
    eval_sample_packing: Optional[bool] = field(
        default=None,
        metadata={"help": "Use sample packing for efficient evals."},
    )
    sample_packing_efficiency: float = field(
        default=1.0,
        metadata={"help": "Sample packing efficiency for calculating batch length."},
    )
    sample_packing_bin_size: int = field(
        default=200,
        metadata={
            "help": "The max number of samples that packed sample can contain after packing. Increase for better packing."
        },
    )
    sample_packing_group_size: int = field(
        default=100000,
        metadata={
            "help": "The number of samples to group together for packing. Increase for better packing."
        },
    )
    max_seq_length: int = field(
        default=2048,
        metadata={"help": "The maximum sequence length the model can handle"},
    )
    dataset_num_proc: int | None = field(
        default=None,
        metadata={"help": "The number of processes to use for data processing"},
    )
    relora_steps: Optional[int] = field(
        default=None,
        metadata={"help": "how often to reset for ReLoRA"},
    )
    relora_prune_ratio: Optional[float] = field(
        default=0.9,
        metadata={"help": "prune ratio for magnitude pruning of the optimizer"},
    )
    jagged_restart_steps: Optional[int] = field(
        default=None,
        metadata={"help": "how often to reset for jagged restarts"},
    )
    jagged_restart_warmup_steps: Optional[int] = field(
        default=None,
        metadata={
            "help": "how many warmup steps to take after reset for jagged restarts"
        },
    )
    jagged_restart_anneal_steps: Optional[int] = field(
        default=None,
        metadata={
            "help": "how many anneal steps to take before reset for jagged restarts"
        },
    )
    bench_split: Optional[str] = field(
        default="eval", metadata={"help": "The benchmark split to run on"}
    )
    bench_dataset: Optional[str] = field(
        default="pharaouk/dharma-1/dharma_1_mini.json",
        metadata={
            "help": "Benchmark dataset to use: options are `mmlu-zs`, `mmlu-fs`, or the full path to the dataset file"
        },
    )
    do_bench_eval: Optional[bool] = field(
        default=False, metadata={"help": "Whether to run the Benchmark evaluation."}
    )
    do_causal_lm_eval: Optional[bool] = field(
        default=False, metadata={"help": "Whether to run the Causal LM evaluation."}
    )
    max_bench_samples: Optional[int] = field(
        default=None,
        metadata={
            "help": "If set, only evaluates on `max_bench_samples` of the benchmark dataset."
        },
    )
    bench_source_max_len: int = field(
        default=2048, metadata={"help": "Maximum source sequence length for bench."}
    )
    dataloader_prefetch_factor: Optional[int] = field(
        default=None,
        metadata={"help": "prefetch_factor argument to the dataloader"},
    )
    cosine_min_lr_ratio: Optional[float] = field(
        default=None,
        metadata={"help": "Minimum learning rate is min_lr_ratio * learning_rate"},
    )
    cosine_constant_lr_ratio: Optional[float] = field(
        default=None,
        metadata={
            "help": "Starting constant learning rate step is cosine_constant_lr_ratio * max_steps"
        },
    )
    loraplus_lr_ratio: Optional[float] = field(
        default=None, metadata={"help": "loraplus learning rate ratio lr_B / lr_A."}
    )
    loraplus_lr_embedding: Optional[float] = field(
        default=1e-6,
        metadata={"help": "loraplus learning rate for lora embedding layers."},
    )
    embedding_lr_scale: Optional[float] = field(
        default=None,
        metadata={"help": "Scale the learning rate for the embedding layers."},
    )
    lr_groups: Optional[list[dict]] = field(
        default=None,
        metadata={"help": "Specify learning rate groups for with different LRs."},
    )
    embedding_lr: Optional[float] = field(
        default=None,
        metadata={"help": "absolute learning rate for the embedding layers."},
    )
    qlora: bool = field(
        default=False,
        metadata={"help": "whether this is a qlora training"},
    )
    orpo_alpha: Optional[float] = field(
        default=None,
    )
    lisa_n_layers: Optional[int] = field(
        default=None,
        metadata={"help": "the number of activate layers in LISA"},
    )
    lisa_step_interval: Optional[int] = field(
        default=None,
        metadata={"help": "how often to switch layers in LISA"},
    )
    lisa_layers_attribute: Optional[str] = field(
        default=None,
        metadata={"help": "path under the model to access the layers"},
    )
    curriculum_sampling: Optional[bool] = field(
        default=None,
        metadata={"help": "whether to use sequential sampling for curriculum learning"},
    )
    alternate_lr_scheduler_type: Optional[str] = field(
        default=None,
        metadata={
            "help": "workaround to pass an alternate lr scheduler to the HF trainer"
        },
    )
    chat_template: Optional[str] = field(
        default=None,
        metadata={"help": "Chat template converting chat messages to text"},
    )

    # kd_ce_alpha: Optional[float] = field(
    #     default=None,
    #     metadata={
    #         "help": "The alpha scaling parameter for SFT cross entropy loss when using KD"
    #     },
    # )
    #
    # kd_alpha: Optional[float] = field(
    #     default=1.0,
    #     metadata={"help": "The alpha scaling parameter for KD loss"},
    # )
    #
    # kd_temperature: Optional[float] = field(
    #     default=1.0,
    #     metadata={
    #         "help": "the temperature parameter for KL divergence loss when using KD"
    #     },
    # )

    adam_beta3: Optional[float] = field(
        default=None,
        metadata={
            "help": "The beta3 hyperparameter used in some optimizers such as CAME"
        },
    )
    adam_epsilon2: Optional[float] = field(
        default=None,
        metadata={
            "help": "The epsilon2 hyperparameter used in some optimizers such as CAME"
        },
    )

    activation_offloading: bool | None = field(
        default=None,
        metadata={"help": "Use activation offloading with CUDA streams for training."},
    )

    # multi-modal section

    image_size: int | tuple[int, int] | None = field(
        default=None,
        metadata={"help": "The size of the image to resize to"},
    )

    image_resize_algorithm: Resampling | None = field(
        default=None,
        metadata={"help": "The algorithm to use for image resizing"},
    )

    # end of multi-modal section

    dion_learning_rate: float | None = field(
        default=None,
        metadata={"help": "The learning rate for Dion"},
    )
    dion_momentum: float | None = field(
        default=None,
        metadata={"help": "The momentum for Dion"},
    )
    dion_rank_fraction: float | None = field(
        default=None,
    )
    dion_rank_multiple_of: int | None = field(
        default=None,
    )


================================================
FILE: src/axolotl/datasets.py
================================================
"""
Module containing dataset functionality.

We want this to be a wrapper for an existing dataset that we have loaded. Lets use the
concept of middlewares to wrap each dataset. We'll use the collators later on to pad the
datasets.
"""

from datasets import Dataset, IterableDataset

from axolotl.utils.logging import get_logger

from .prompt_tokenizers import PromptTokenizingStrategy

LOG = get_logger(__name__)


class TokenizedPromptDataset(Dataset):
    """Dataset that returns tokenized prompts from a stream of text files.

    Args:
        prompt_tokenizer: The prompt tokenizing method for processing the data.
        dataset: Dataset with text files.
        process_count: Number of processes to use for tokenizing.
        keep_in_memory: Whether to keep the tokenized dataset in memory.
    """

    def __init__(
        self,
        prompt_tokenizer: PromptTokenizingStrategy,
        dataset: Dataset,
        process_count: int | None = None,
        keep_in_memory: bool | None = False,
        **kwargs,
    ):
        self.prompt_tokenizer = prompt_tokenizer
        self.process_count = process_count
        self.keep_in_memory = keep_in_memory
        super().__init__(
            self.process(dataset).data,
            **kwargs,
        )

    def process(self, dataset):
        features = dataset.features.keys()

        map_kwargs = {}
        if self.prompt_tokenizer.supports_batched:
            map_kwargs["batched"] = True
            map_kwargs["batch_size"] = 1_000

        if (
            hasattr(self.prompt_tokenizer, "filter_rows")
            and self.prompt_tokenizer.filter_rows
        ):
            dataset = dataset.filter(
                self.prompt_tokenizer.filter_rows,
                num_proc=self.process_count,
                desc="Strategy Filtering Rows",
            )

        return dataset.map(
            self.prompt_tokenizer.tokenize_prompt,
            num_proc=self.process_count,
            remove_columns=features,
            keep_in_memory=self.keep_in_memory,
            desc="Tokenizing Prompts",
            **map_kwargs,
        )


def wrap_dataset_for_tokenized_prompt(
    prompt_tokenizer: PromptTokenizingStrategy,
    dataset: Dataset | IterableDataset,
    **kwargs,
):
    if isinstance(dataset, IterableDataset):
        map_kwargs = {}
        if prompt_tokenizer.supports_batched:
            map_kwargs["batched"] = True
        features = list(dataset.features.keys())
        return dataset.map(
            prompt_tokenizer.tokenize_prompt,
            remove_columns=features,
            **map_kwargs,
        )
    return TokenizedPromptDataset(prompt_tokenizer, dataset, **kwargs)


================================================
FILE: src/axolotl/evaluate.py
================================================
"""Module for evaluating models."""

import csv
import os
import sys
from pathlib import Path
from typing import Dict, Optional

import torch
from datasets import Dataset
from transformers.trainer import Trainer

from axolotl.telemetry.errors import send_errors
from axolotl.train import (
    TrainDatasetMeta,
    setup_model_and_tokenizer,
)
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import cleanup_distributed
from axolotl.utils.logging import get_logger
from axolotl.utils.trainer import setup_trainer

project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
src_dir = os.path.join(project_root, "src")
sys.path.insert(0, src_dir)

LOG = get_logger(__name__)


def evaluate_dataset(
    trainer: Trainer, dataset: Dataset, dataset_type: str, flash_optimum: bool = False
) -> Optional[Dict[str, float]]:
    """Helper function to evaluate a single dataset.

    Args:
        trainer: The trainer instance.
        dataset: Dataset to evaluate.
        dataset_type: Type of dataset ('train' or 'eval').
        flash_optimum: Whether to use flash optimum.

    Returns:
        Dictionary of metrics or None if dataset is None.
    """
    if dataset is None:
        return None

    LOG.info(f"Starting {dataset_type} set evaluation...")

    if flash_optimum:
        with torch.backends.cuda.sdp_kernel(
            enable_flash=True,
            enable_math=True,
            enable_mem_efficient=True,
        ):
            metrics = trainer.evaluate(dataset, metric_key_prefix=dataset_type)
    else:
        metrics = trainer.evaluate(dataset, metric_key_prefix=dataset_type)

    LOG.info(f"{dataset_type.capitalize()} set evaluation completed!")
    LOG.info(f"{dataset_type.capitalize()} Metrics:")
    for key, value in metrics.items():
        LOG.info(f"{key}: {value}")

    return metrics


@send_errors
def evaluate(*, cfg: DictDefault, dataset_meta: TrainDatasetMeta) -> Dict[str, float]:
    """
    Evaluate a model on training and validation datasets.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        dataset_meta: Dataset metadata containing training and evaluation datasets.

    Returns:
        Dictionary mapping metric names to their values.
    """
    # Load tokenizer, processor and model
    LOG.debug("loading model for evaluation...")
    model, tokenizer, _, processor = setup_model_and_tokenizer(cfg)

    # Get datasets

    train_dataset = dataset_meta.train_dataset
    eval_dataset = dataset_meta.eval_dataset
    total_num_steps = dataset_meta.total_num_steps

    # Set up trainer
    trainer = setup_trainer(
        cfg=cfg,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        model=model,
        tokenizer=tokenizer,
        processor=processor,
        total_num_steps=total_num_steps,
    )

    # Evaluate datasets
    all_metrics = {}
    train_metrics = evaluate_dataset(trainer, train_dataset, "train", cfg.flash_optimum)
    eval_metrics = evaluate_dataset(trainer, eval_dataset, "eval", cfg.flash_optimum)

    if train_metrics:
        all_metrics.update(train_metrics)
    if eval_metrics:
        all_metrics.update(eval_metrics)

    # Save metrics to CSV if output directory is specified and we have metrics
    if cfg.output_dir and (train_metrics or eval_metrics):
        output_dir = Path(cfg.output_dir)
        output_dir.mkdir(parents=True, exist_ok=True)

        metrics_file = output_dir / "eval_summary.csv"
        with metrics_file.open("w", newline="", encoding="utf-8") as file:
            writer = csv.writer(file)
            writer.writerow(["metric", "training", "validation"])

            # Get unique metric names (removing prefixes) from available metrics
            train_metric_names = {
                k.replace("train_", ""): k for k in (train_metrics or {})
            }
            eval_metric_names = {
                k.replace("eval_", ""): k for k in (eval_metrics or {})
            }
            all_metric_names = sorted(
                set(train_metric_names.keys()) | set(eval_metric_names.keys())
            )

            for metric_name in all_metric_names:
                train_value = (
                    train_metrics.get(train_metric_names.get(metric_name, ""), "")
                    if train_metrics
                    else ""
                )
                eval_value = (
                    eval_metrics.get(eval_metric_names.get(metric_name, ""), "")
                    if eval_metrics
                    else ""
                )
                writer.writerow([metric_name, train_value, eval_value])

        LOG.info(f"Evaluation results saved to {metrics_file}")

    del model
    del tokenizer

    cleanup_distributed()

    return all_metrics


================================================
FILE: src/axolotl/integrations/LICENSE.md
================================================
### AXOLOTL COMMUNITY LICENSE AGREEMENT

This Axolotl Community License Agreement (“Agreement”) is entered into by and between Axolotl AI Corp. (“Axolotl”) and
any individual or entity (“Licensee”) who wishes to use the Software (as defined below) in accordance with the terms
and conditions set forth in this Agreement.

1.  Definitions
    1.1 “Licensee” refers to any individual or entity who has obtained a copy of the Software under this Agreement.
    1.2 “Plugin Integration” means independent integration software modules which may or may not be offered by Axolotl,
        which may be licensed separately by their respective  authors and/or licensors.
    1.3 “Software” refers to the specific sub-directory of the Axolotl, Inc. software located at
        https://github.com/axolotl-ai-cloud/axolotl/tree/main/src/axolotl/integrations and its subdirectories which
        permits Plugin Integrations to integrate with the Axolotl service.
2.  Grant of License
    2.1	Axolotl hereby grants Licensee a worldwide, non-exclusive, royalty-free, license to use, copy, modify, merge,
        publish, distribute, sublicense, and/or otherwise exploit the Software, subject to the following conditions:
        - Licensee must comply with all the terms and conditions of this Agreement.
        - Licensee must include the original copyright notice and disclaimer of warranty in all copies or substantial
          portions of the Software.
    2.2 Licensee may use the Software for any lawful purpose, except as restricted in Section 3.
3.  Restrictions
    3.1 Licensee shall not use the Software for any activity that constitutes a commercial activity of offering for
        free or for sale any services, platform, or equivalent  to third parties for the purposes of allowing such
        third parties to fine-tune artificial intelligence models.
    3.2 Licensee shall not:
        - Use the Software for any illegal or unauthorized purpose.
        - Reverse engineer, decompile, or disassemble the Software.
        - Remove or modify any copyright, trademark, or other proprietary notices contained in the Software.
        - Use the Software in a way that could damage, disable, overburden, or impair the functionality of the
          Software or interfere with any third-party use of the Software.
    3.3 Axolotl reserves the right to restrict certain Plugin Integrations for use with the Software. To the extent Licensee integrates a permitted, applicable Plugin Integration with the Software, Licensee shall comply with any additional terms and conditions imposed by the licensors of such Plugin Integration for use of such Plugin Integrations. Licensee shall contact Axolotl if it has questions about whether its use of the Software falls beyond the scope of this Agreement.
4.  Intellectual Property Rights
    4.1 Axolotl and its contributors retain all intellectual property rights in and to the Software. Licensee
        acknowledges that this Agreement does not transfer any ownership rights or intellectual property rights to
        Licensee.
5.  Disclaimer of Warranty
    5.1 THE SOFTWARE IS PROVIDED “AS IS,” WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
        TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. IN NO EVENT SHALL
        THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES, OR OTHER LIABILITY, WHETHER IN AN ACTION OF
        CONTRACT, TORT, OR OTHERWISE, ARISING FROM, OUT OF, OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
        DEALINGS IN THE SOFTWARE.
6.  Termination
    6.1 Axolotl may terminate this Agreement at any time if Licensee fails to comply with any of the terms and
        conditions set forth herein. Upon termination, Licensee shall cease all use of the Software and destroy any
        copies in its possession.
7.  Governing Law
    7.1 This Agreement shall be governed by and construed in accordance with the laws of the State of California,
        without regards to conflicts of laws provisions thereof.
8.  Entire Agreement
    8.1 This Agreement constitutes the entire agreement between Axolotl and Licensee with respect to the subject matter
        hereof and supersedes all prior or contemporaneous understandings or agreements between the parties concerning
        the Software, whether written or oral. Axolotl may update the terms of this Agreement from time to time, and
        Licensee’s continued use of the Software after any such updates shall constitute acceptance of updated terms
        on a go-forward basis.  Axolotl will use commercially reasonable efforts to provide Licensee notice of any
        material updates. By using the Software, Licensee acknowledges that it has read, understood, and agrees to be
        bound by the terms and conditions of this Agreement.

This Agreement was last updated on August 23, 2024.


================================================
FILE: src/axolotl/integrations/__init__.py
================================================


================================================
FILE: src/axolotl/integrations/base.py
================================================
# Copyright 2024 Axolotl AI. All rights reserved.
#
# This software may be used and distributed according to
# the terms of the Axolotl Community License Agreement (the "License");
# you may not use this file except in compliance with the License.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

"""Base class for all plugins.

A plugin is a reusable, modular, and self-contained piece of code that extends the functionality of Axolotl.
Plugins can be used to integrate third-party models, modify the training process, or add new features.

To create a new plugin, you need to inherit from the BasePlugin class and implement the required methods.
"""

from __future__ import annotations

import collections
import importlib
import traceback
from typing import TYPE_CHECKING, Callable, OrderedDict, Union

from peft import PeftModel
from torch import nn
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LRScheduler
from transformers import PreTrainedModel, Trainer
from transformers.trainer_pt_utils import get_parameter_names

from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

if TYPE_CHECKING:
    from axolotl.common.datasets import TrainDatasetMeta


class BasePlugin:
    """Base class for all plugins. Defines the interface for plugin methods.

    A plugin is a reusable, modular, and self-contained piece of code that extends
    the functionality of Axolotl. Plugins can be used to integrate third-party models,
    modify the training process, or add new features.

    To create a new plugin, you need to inherit from the BasePlugin class and
    implement the required methods.

    Note:
        Plugin methods include:
        - register(cfg): Registers the plugin with the given configuration.
        - load_datasets(cfg): Loads and preprocesses the dataset for training.
        - pre_model_load(cfg): Performs actions before the model is loaded.
        - post_model_build(cfg, model): Performs actions after the model is loaded, but
            before LoRA adapters are applied.
        - pre_lora_load(cfg, model): Performs actions before LoRA weights are loaded.
        - post_lora_load(cfg, model): Performs actions after LoRA weights are loaded.
        - post_model_load(cfg, model): Performs actions after the model is loaded,
            inclusive of any adapters.
        - post_trainer_create(cfg, trainer): Performs actions after the trainer is
            created.
        - create_optimizer(cfg, trainer): Creates and returns an optimizer for training.
        - create_lr_scheduler(cfg, trainer, optimizer, num_training_steps): Creates and
            returns a learning rate scheduler.
        - add_callbacks_pre_trainer(cfg, model): Adds callbacks to the trainer before
            training.
        - add_callbacks_post_trainer(cfg, trainer): Adds callbacks to the trainer after
            training.
    """

    def __init__(self):
        """Initializes the BasePlugin."""

    def register(self, cfg: dict):
        """Registers the plugin with the given configuration as an unparsed dict.

        Args:
            cfg: The configuration for the plugin.
        """

    def get_input_args(self) -> str | None:
        """Returns a pydantic model for the plugin's input arguments."""

    def get_training_args_mixin(self) -> str | None:
        """
        Returns a dataclass model for the plugin's training arguments.
        """

    def load_datasets(
        self, cfg: DictDefault, preprocess: bool = False
    ) -> Union["TrainDatasetMeta", None]:
        """Loads and preprocesses the dataset for training.

        Args:
            cfg: The configuration for the plugin.
            preprocess: Whether this is the preprocess step of the datasets.

        Returns:
            dataset_meta: The metadata for the training dataset.
        """

    def pre_model_load(self, cfg: DictDefault):
        """Performs actions before the model is loaded.

        Args:
            cfg: The configuration for the plugin.
        """

    def post_model_build(self, cfg: DictDefault, model: PreTrainedModel):
        """Performs actions after the model is built/loaded, but before any adapters are applied.

        Args:
            cfg: The configuration for the plugin.
        """

    def pre_lora_load(self, cfg: DictDefault, model: PreTrainedModel):
        """Performs actions before LoRA weights are loaded.

        Args:
            cfg: The configuration for the plugin.
            model: The loaded model.
        """

    def post_lora_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
        """Performs actions after LoRA weights are loaded.

        Args:
            cfg: The configuration for the plugin.
            model: The loaded model.
        """

    def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
        """Performs actions after the model is loaded.

        Args:
            cfg: The configuration for the plugin.
            model: The loaded model.
        """

    def get_trainer_cls(self, cfg: DictDefault) -> type[Trainer] | None:
        """Returns a custom class for the trainer.

        Args:
            cfg: The global axolotl configuration.

        Returns:
            The first non-`None` trainer class returned by a plugin.
        """

    def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
        """Performs actions after the trainer is created.

        Args:
            cfg: The configuration for the plugin.
            trainer: The trainer object for training.
        """

    def get_training_args(self, cfg: DictDefault):
        """
        Returns custom training arguments to set on TrainingArgs.

        Args:
            cfg: The global axolotl configuration.

        Returns:
            object: dict containing the training arguments.
        """

    def get_collator_cls_and_kwargs(self, cfg: DictDefault, is_eval: bool = False):
        """
        Returns a custom class for the collator.

        Args:
            cfg: The global axolotl configuration.
            is_eval: Whether this is an eval split.

        Returns:
            class: The class for the collator.
        """

    def create_optimizer(self, cfg: DictDefault, trainer: Trainer) -> Optimizer | None:
        """Creates and returns an optimizer for training.

        Args:
            cfg: The configuration for the plugin.
            trainer: The trainer object for training.

        Returns:
            The created optimizer.
        """

    def create_lr_scheduler(
        self,
        cfg: DictDefault,
        trainer: Trainer,
        optimizer: Optimizer,
        num_training_steps: int,
    ) -> LRScheduler | None:
        """Creates and returns a learning rate scheduler.

        Args:
            cfg: The configuration for the plugin.
            trainer: The trainer object for training.
            optimizer: The optimizer for training.
            num_training_steps: Total number of training steps

        Returns:
            The created learning rate scheduler.
        """

    def add_callbacks_pre_trainer(
        self, cfg: DictDefault, model: PreTrainedModel
    ) -> list[Callable]:
        """Set up callbacks before creating the trainer.

        Args:
            cfg: The configuration for the plugin.
            model: The loaded model.

        Returns:
            A list of callback functions to be added to the `TrainingArgs`.
        """
        return []

    def add_callbacks_post_trainer(
        self, cfg: DictDefault, trainer: Trainer
    ) -> list[Callable]:
        """Adds callbacks to the trainer after creating the trainer. This is useful for
        callbacks that require access to the model or trainer.

        Args:
            cfg: The configuration for the plugin.
            trainer: The trainer object for training.

        Returns:
            A list of callback functions to be added
        """
        return []

    def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
        """Performs actions after training is complete.

        Args:
            cfg: The axolotl configuration.
            model: The loaded model.
        """

    def post_train_unload(self, cfg: DictDefault):
        """Performs actions after training is complete and the model is unloaded.

        Args:
            cfg: The configuration for the plugin.
        """


def load_plugin(plugin_name: str) -> BasePlugin:
    """Loads a plugin based on the given plugin name.

    The plugin name should be in the format "module_name.class_name". This function
    splits the plugin name into module and class, imports the module, retrieves the
    class from the module, and creates an instance of the class.

    Args:
        plugin_name: The name of the plugin to be loaded. The name should be in the
            format "module_name.class_name".

    Returns:
        An instance of the loaded plugin.

    Raises:
        ImportError: If the plugin module cannot be imported.
    """
    # split the plugin name into module and class
    module_name, class_name = plugin_name.rsplit(".", 1)

    # import the module
    try:
        module = importlib.import_module(module_name)
    except ModuleNotFoundError as orig_exc:
        try:
            if not module_name.startswith("axolotl.integrations."):
                module = importlib.import_module("axolotl.integrations." + module_name)
            else:
                raise orig_exc
        except ModuleNotFoundError as exc:
            raise orig_exc from exc

    # instantiate the class
    plugin_class = getattr(module, class_name)
    # create an instance of the class
    plugin = plugin_class()

    return plugin


class PluginManager:
    """The `PluginManager` class is responsible for loading and managing plugins. It
    should be a singleton so it can be accessed from anywhere in the codebase.

    Attributes:
        plugins: A list of loaded plugins.

    Note:
        Key methods include:
        - get_instance(): Static method to get the singleton instance of `PluginManager`.
        - register(plugin_name: str): Registers a new plugin by its name.
        - pre_model_load(cfg): Calls the pre_model_load method of all registered plugins.
    """

    plugins: OrderedDict[str, BasePlugin] = collections.OrderedDict()

    _instance: PluginManager | None = None
    _cfg: DictDefault | None = None

    def __new__(cls):
        """Creates a new instance of PluginManager if it doesn't exist yet."""
        if cls._instance is None:
            cls._instance = super(PluginManager, cls).__new__(cls)
            cls._instance.plugins: OrderedDict[str, BasePlugin] = (
                collections.OrderedDict()
            )
        return cls._instance

    @staticmethod
    def get_instance() -> "PluginManager":
        """Returns the singleton instance of PluginManager. If the instance doesn't
        exist, it creates a new one.
        """
        if PluginManager._instance is None:
            PluginManager()
        return PluginManager._instance  # type: ignore

    @property
    def cfg(self):
        return self._cfg

    @cfg.setter
    def cfg(self, cfg):
        self._cfg = cfg

    def register(self, plugin_name: str):
        """Registers a new plugin by its name.

        Args:
            plugin_name: The name of the plugin to be registered.

        Raises:
            ImportError: If the plugin module cannot be imported.
        """
        try:
            LOG.info(f"Attempting to load plugin: {plugin_name}")
            plugin = load_plugin(plugin_name)
            self.plugins[plugin_name] = plugin
            LOG.info(f"Plugin loaded successfully: {plugin_name}")
        except ImportError as exc:
            LOG.error(f"Failed to load plugin: {plugin_name}")
            # print stacktrace
            traceback.print_exc()
            print(f"Error: {exc}")

    def get_input_args(self) -> list[str]:
        """Returns a list of Pydantic classes for all registered plugins' input arguments.'

        Returns:
            A list of Pydantic classes for all registered plugins' input arguments.'
        """
        input_args = []
        for plugin in self.plugins.values():
            input_args_from_plugin = plugin.get_input_args()
            if input_args_from_plugin is not None:
                input_args.append(input_args_from_plugin)
        return input_args

    def get_training_args_mixin(self):
        """
        Returns a list of dataclasses for all registered plugins' training args mixins'

        Returns:
        list[str]: A list of dataclsses
        """
        training_args = []
        for plugin in self.plugins.values():
            training_args_from_plugin = plugin.get_training_args_mixin()
            if training_args_from_plugin is not None:
                training_args.append(training_args_from_plugin)
        return training_args

    def load_datasets(
        self, cfg: DictDefault, preprocess: bool = False
    ) -> Union["TrainDatasetMeta", None]:
        """Calls the load_datasets method of each registered plugin.

        Args:
            cfg: The configuration for the plugins.
            preprocess: Whether this is preprocess step of the datasets.

        Returns:
            The dataset metadata loaded from all registered plugins.
        """
        return_ds_meta = None
        for plugin in self.plugins.values():
            dataset_meta = plugin.load_datasets(cfg, preprocess)
            if dataset_meta is not None:
                if return_ds_meta is None:
                    return_ds_meta = dataset_meta
                else:
                    raise RuntimeError("Multiple plugins loaded datasets")
        return return_ds_meta

    def pre_model_load(self, cfg: DictDefault):
        """Calls the pre_model_load method of all registered plugins.

        Args:
            cfg: The configuration for the plugins.
        """
        for plugin in self.plugins.values():
            plugin.pre_model_load(cfg)

    def post_model_build(self, cfg: DictDefault, model: PreTrainedModel):
        """Calls the `post_model_build` method of all registered plugins after the
        model has been built / loaded, but before any adapters have been applied.

        Args:
            cfg: The configuration for the plugins.
            model: The loaded model.
        """
        for plugin in self.plugins.values():
            plugin.post_model_build(cfg, model)

    def pre_lora_load(self, cfg: DictDefault, model: PreTrainedModel):
        """Calls the `pre_lora_load` method of all registered plugins.

        Args:
            cfg: The configuration for the plugins.
            model: The loaded model.
        """
        for plugin in self.plugins.values():
            plugin.pre_lora_load(cfg, model)

    def post_lora_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
        """Calls the `post_lora_load` method of all registered plugins.

        Args:
            cfg: The configuration for the plugins.
            model: The loaded model.
        """
        for plugin in self.plugins.values():
            plugin.post_lora_load(cfg, model)

    def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
        """Calls the `post_model_load` method of all registered plugins after the model
        has been loaded inclusive of any adapters.

        Args:
            cfg: The configuration for the plugins.
            model: The loaded model.
        """
        for plugin in self.plugins.values():
            plugin.post_model_load(cfg, model)

    def get_trainer_cls(self, cfg: DictDefault) -> Trainer | None:
        """Calls the `get_trainer_cls` method of all registered plugins and returns the
        first non-`None` trainer class.

        Args:
            cfg: The configuration for the plugins.

        Returns:
            The first non-`None` trainer class returned by a plugin.
        """
        for plugin in self.plugins.values():
            trainer_cls = plugin.get_trainer_cls(cfg)
            if trainer_cls is not None:
                return trainer_cls
        return None

    def get_training_args(self, cfg):
        """
        Calls the get_training_args method of all registered plugins and returns the combined training arguments.

        Parameters:
        cfg (dict): The configuration for the plugins.

        Returns:
        object: The training arguments
        """
        training_args_kwargs = {}
        for plugin in self.plugins.values():
            training_args = plugin.get_training_args(cfg)
            if training_args is not None:
                training_args_kwargs.update(training_args)

        return training_args_kwargs

    def get_collator_cls_and_kwargs(self, cfg, is_eval=False):
        """
        Calls the get_collator_cls_and_kwargs method of all registered plugins and returns the first non-None collator class.

        Parameters:
        cfg (dict): The configuration for the plugins.
        is_eval (bool): Whether this is an eval split.

        Returns:
        object: The collator class, or None if none was found.
        """
        for plugin in self.plugins.values():
            collator = plugin.get_collator_cls_and_kwargs(cfg, is_eval=is_eval)
            if collator is not None:
                collator_cls, collator_kwargs = collator
                return collator_cls, collator_kwargs
        return None

    def post_trainer_create(self, cfg: DictDefault, trainer: Trainer):
        """Calls the `post_trainer_create` method of all registered plugins.

        Args:
            cfg: The configuration for the plugins.
            trainer: The trainer object for training.
        """
        for plugin in self.plugins.values():
            plugin.post_trainer_create(cfg, trainer)

    def create_optimizer(self, trainer: Trainer) -> Optimizer | None:
        """Calls the `create_optimizer` method of all registered plugins and returns
        the first non-`None` optimizer.

        Args:
            trainer: The trainer object for training.

        Returns:
            The created optimizer, or `None` if none was found.
        """
        for plugin in self.plugins.values():
            optimizer = plugin.create_optimizer(self.cfg, trainer)
            if optimizer is not None:
                return optimizer
        return None

    def create_lr_scheduler(
        self, trainer: Trainer, optimizer: Optimizer, num_training_steps: int
    ) -> LRScheduler | None:
        """Calls the `create_lr_scheduler` method of all registered plugins and returns
        the first non-`None` scheduler.

        Args:
            trainer: The trainer object for training.
            optimizer: The optimizer for training.

        Returns:
            The created learning rate scheduler, or `None` if not found.
        """
        for plugin in self.plugins.values():
            scheduler: LRScheduler | None = plugin.create_lr_scheduler(
                self.cfg,
                trainer=trainer,
                optimizer=optimizer,
                num_training_steps=num_training_steps,
            )
            if scheduler is not None:
                return scheduler
        return None

    def add_callbacks_pre_trainer(
        self, cfg: DictDefault, model: PreTrainedModel
    ) -> list[Callable]:
        """Calls the add_callbacks_pre_trainer method of all registered plugins.

        Args:
            cfg: The configuration for the plugins.
            model: The loaded model.

        Returns:
            A list of callback functions to be added to the `TrainingArgs`.
        """
        callbacks = []
        for plugin in self.plugins.values():
            plugin_callbacks = plugin.add_callbacks_pre_trainer(cfg, model)
            if plugin_callbacks:  # if the plugin returned a list of callbacks
                callbacks.extend(plugin_callbacks)
        return callbacks

    def add_callbacks_post_trainer(
        self, cfg: DictDefault, trainer: Trainer
    ) -> list[Callable]:
        """Calls the `add_callbacks_post_trainer` method of all registered plugins.

        Args:
            cfg: The configuration for the plugins.
            trainer: The trainer object for training.

        Returns:
            A list of callback functions to be added to the `TrainingArgs`.
        """
        callbacks = []
        for plugin in self.plugins.values():
            plugin_callbacks = plugin.add_callbacks_post_trainer(cfg, trainer)
            if plugin_callbacks:
                callbacks.extend(plugin_callbacks)
        return callbacks

    def post_train(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
        """Calls the post_train method of all registered plugins.

        Args:
            cfg: The configuration for the plugins.
            model: The loaded model.
        """
        for plugin in self.plugins.values():
            plugin.post_train(cfg, model)

    def post_train_unload(self, cfg: DictDefault):
        """Calls the post_train_unload method of all registered plugins.

        Args:
            cfg: The configuration for the plugins.
        """
        for plugin in self.plugins.values():
            plugin.post_train_unload(cfg)


class BaseOptimizerFactory:
    """Base class for factories to create custom optimizers"""

    def __call__(
        self, opt_model, training_args, **optimizer_kwargs
    ) -> Optimizer | None:
        pass

    # duplicated from transformers
    def get_decay_parameter_names(self, model) -> list[str]:
        """
        Get all parameter names that weight decay will be applied to.

        This function filters out parameters in two ways:
        1. By layer type (instances of layers specified in ALL_LAYERNORM_LAYERS)
        2. By parameter name patterns (containing 'bias', or variation of 'norm')
        """
        forbidden_name_patterns = [
            r"bias",
            r"layernorm",
            r"rmsnorm",
            r"(?:^|\.)norm(?:$|\.)",
            r"_norm(?:$|\.)",
        ]
        decay_parameters = get_parameter_names(
            model, [nn.LayerNorm], forbidden_name_patterns
        )
        return decay_parameters


================================================
FILE: src/axolotl/integrations/config.py
================================================
# Copyright 2024 Axolotl AI. All rights reserved.
#
# This software may be used and distributed according to
# the terms of the Axolotl Community License Agreement (the "License");
# you may not use this file except in compliance with the License.
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

"""
Module to handle merging the plugins' input arguments with the base configurations.

This was moved here to prevent circular imports.
"""

from typing import Any, Dict, List, Type

from axolotl.utils.schemas.config import (
    AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase,
    AxolotlInputConfig as AxolotlInputConfigBase,
)


def merge_input_args():
    """
    Merges input arguments from registered plugins with the base configurations.

    This function retrieves the input arguments from registered plugins using the PluginManager.
    It then dynamically creates new classes, AxolotlConfigWCapabilities and AxolotlInputConfig,
    that inherit from the base configurations and include the input arguments from the plugins.

    Returns:
    tuple: A tuple containing the newly created classes, AxolotlConfigWCapabilities and AxolotlInputConfig.
    """
    from axolotl.integrations.base import PluginManager

    plugin_manager = PluginManager.get_instance()
    input_args: List[str] = plugin_manager.get_input_args()
    plugin_classes = []
    dynamic_input = ""
    for plugin_args in input_args:
        plugin_module, plugin_cls = plugin_args.rsplit(".", 1)
        dynamic_input += f"from {plugin_module} import {plugin_cls}\n"
        plugin_classes.append(plugin_cls)
    if dynamic_input:
        dynamic_input += f"class AxolotlConfigWCapabilities(AxolotlConfigWCapabilitiesBase, {', '.join(plugin_classes)}):\n    pass\n"
        dynamic_input += f"class AxolotlInputConfig(AxolotlInputConfigBase, {', '.join(plugin_classes)}):\n    pass\n"

        namespace: Dict[Any, Any] = {}
        exec(dynamic_input, globals(), namespace)  # nosec B102
        AxolotlInputConfig = namespace["AxolotlInputConfig"]
        AxolotlConfigWCapabilities = namespace["AxolotlConfigWCapabilities"]
        return AxolotlConfigWCapabilities, AxolotlInputConfig
    return AxolotlConfigWCapabilitiesBase, AxolotlInputConfigBase


def merge_training_args() -> Type:
    """
    Merges training arguments from registered plugins with the base TrainingArguments.

    This function retrieves the training arguments from registered plugins using the PluginManager.
    It then dynamically creates new classes, AxolotlTrainingMixins,
    that inherit from the base configurations and include the training arguments from the plugins.

    Returns:
    tuple: A tuple containing the newly created classes, AxolotlTrainingMixins.
    """

    from axolotl.core.training_args_base import (
        AxolotlTrainingMixins as AxolotlTrainingMixinsBase,
    )
    from axolotl.integrations.base import PluginManager

    plugin_manager = PluginManager.get_instance()
    training_args_mixins: List[str] = plugin_manager.get_training_args_mixin()
    mixin_classes = []
    dynamic_input = ""
    for plugin_args in training_args_mixins:
        plugin_module, plugin_cls = plugin_args.rsplit(".", 1)
        dynamic_input += f"from {plugin_module} import {plugin_cls}\n"
        mixin_classes.append(plugin_cls)
    if dynamic_input:
        dynamic_input += f"class AxolotlTrainingMixins(AxolotlTrainingMixinsBase, {', '.join(mixin_classes)}):\n    pass\n"

        namespace: Dict[Any, Any] = {}
        local_vars = {"AxolotlTrainingMixinsBase": AxolotlTrainingMixinsBase}
        exec(dynamic_input, {**globals(), **local_vars}, namespace)  # nosec B102
        AxolotlTrainingMixins = namespace["AxolotlTrainingMixins"]
        return AxolotlTrainingMixins
    return AxolotlTrainingMixinsBase


================================================
FILE: src/axolotl/integrations/cut_cross_entropy/ACKNOWLEDGEMENTS.md
================================================
Acknowledgements

Portions of this Cut Cross Entropy Software may utilize the following copyrighted
material, the use of which is hereby acknowledged.


------


PyTorch

    From PyTorch:

    Copyright (c) 2016-     Facebook, Inc            (Adam Paszke)
    Copyright (c) 2014-     Facebook, Inc            (Soumith Chintala)
    Copyright (c) 2011-2014 Idiap Research Institute (Ronan Collobert)
    Copyright (c) 2012-2014 Deepmind Technologies    (Koray Kavukcuoglu)
    Copyright (c) 2011-2012 NEC Laboratories America (Koray Kavukcuoglu)
    Copyright (c) 2011-2013 NYU                      (Clement Farabet)
    Copyright (c) 2006-2010 NEC Laboratories America (Ronan Collobert, Leon Bottou, Iain Melvin, Jason Weston)
    Copyright (c) 2006      Idiap Research Institute (Samy Bengio)
    Copyright (c) 2001-2004 Idiap Research Institute (Ronan Collobert, Samy Bengio, Johnny Mariethoz)

    From Caffe2:

    Copyright (c) 2016-present, Facebook Inc. All rights reserved.

    All contributions by Facebook:
    Copyright (c) 2016 Facebook Inc.

    All contributions by Google:
    Copyright (c) 2015 Google Inc.
    All rights reserved.

    All contributions by Yangqing Jia:
    Copyright (c) 2015 Yangqing Jia
    All rights reserved.

    All contributions by Kakao Brain:
    Copyright 2019-2020 Kakao Brain

    All contributions by Cruise LLC:
    Copyright (c) 2022 Cruise LLC.
    All rights reserved.

    All contributions by Arm:
    Copyright (c) 2021, 2023-2024 Arm Limited and/or its affiliates

    All contributions from Caffe:
    Copyright(c) 2013, 2014, 2015, the respective contributors
    All rights reserved.

    All other contributions:
    Copyright(c) 2015, 2016 the respective contributors
    All rights reserved.

    Caffe2 uses a copyright model similar to Caffe: each contributor holds
    copyright over their contributions to Caffe2. The project versioning records
    all such contribution and copyright details. If a contributor wants to further
    mark their specific copyright on a particular contribution, they should
    indicate their copyright solely in the commit message of the change when it is
    committed.

    All rights reserved.

    Redistribution and use in source and binary forms, with or without
    modification, are permitted provided that the following conditions are met:

    1. Redistributions of source code must retain the above copyright
    notice, this list of conditions and the following disclaimer.

    2. Redistributions in binary form must reproduce the above copyright
    notice, this list of conditions and the following disclaimer in the
    documentation and/or other materials provided with the distribution.

    3. Neither the names of Facebook, Deepmind Technologies, NYU, NEC Laboratories America
    and IDIAP Research Institute nor the names of its contributors may be
    used to endorse or promote products derived from this software without
    specific prior written permission.

    THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
    AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
    LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
    INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
    CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
    ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
    POSSIBILITY OF SUCH DAMAGE.


Triton

    /*
    * Copyright 2018-2020 Philippe Tillet
    * Copyright 2020-2022 OpenAI
    *
    * Permission is hereby granted, free of charge, to any person obtaining
    * a copy of this software and associated documentation files
    * (the "Software"), to deal in the Software without restriction,
    * including without limitation the rights to use, copy, modify, merge,
    * publish, distribute, sublicense, and/or sell copies of the Software,
    * and to permit persons to whom the Software is furnished to do so,
    * subject to the following conditions:
    *
    * The above copyright notice and this permission notice shall be
    * included in all copies or substantial portions of the Software.
    *
    * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
    * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
    * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
    * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
    * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
    * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
    * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
    */


Transformers

    Copyright 2018- The Hugging Face team. All rights reserved.

                                    Apache License
                            Version 2.0, January 2004
                            http://www.apache.org/licenses/

    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

    1. Definitions.

        "License" shall mean the terms and conditions for use, reproduction,
        and distribution as defined by Sections 1 through 9 of this document.

        "Licensor" shall mean the copyright owner or entity authorized by
        the copyright owner that is granting the License.

        "Legal Entity" shall mean the union of the acting entity and all
        other entities that control, are controlled by, or are under common
        control with that entity. For the purposes of this definition,
        "control" means (i) the power, direct or indirect, to cause the
        direction or management of such entity, whether by contract or
        otherwise, or (ii) ownership of fifty percent (50%) or more of the
        outstanding shares, or (iii) beneficial ownership of such entity.

        "You" (or "Your") shall mean an individual or Legal Entity
        exercising permissions granted by this License.

        "Source" form shall mean the preferred form for making modifications,
        including but not limited to software source code, documentation
        source, and configuration files.

        "Object" form shall mean any form resulting from mechanical
        transformation or translation of a Source form, including but
        not limited to compiled object code, generated documentation,
        and conversions to other media types.

        "Work" shall mean the work of authorship, whether in Source or
        Object form, made available under the License, as indicated by a
        copyright notice that is included in or attached to the work
        (an example is provided in the Appendix below).

        "Derivative Works" shall mean any work, whether in Source or Object
        form, that is based on (or derived from) the Work and for which the
        editorial revisions, annotations, elaborations, or other modifications
        represent, as a whole, an original work of authorship. For the purposes
        of this License, Derivative Works shall not include works that remain
        separable from, or merely link (or bind by name) to the interfaces of,
        the Work and Derivative Works thereof.

        "Contribution" shall mean any work of authorship, including
        the original version of the Work and any modifications or additions
        to that Work or Derivative Works thereof, that is intentionally
        submitted to Licensor for inclusion in the Work by the copyright owner
        or by an individual or Legal Entity authorized to submit on behalf of
        the copyright owner. For the purposes of this definition, "submitted"
        means any form of electronic, verbal, or written communication sent
        to the Licensor or its representatives, including but not limited to
        communication on electronic mailing lists, source code control systems,
        and issue tracking systems that are managed by, or on behalf of, the
        Licensor for the purpose of discussing and improving the Work, but
        excluding communication that is conspicuously marked or otherwise
        designated in writing by the copyright owner as "Not a Contribution."

        "Contributor" shall mean Licensor and any individual or Legal Entity
        on behalf of whom a Contribution has been received by Licensor and
        subsequently incorporated within the Work.

    2. Grant of Copyright License. Subject to the terms and conditions of
        this License, each Contributor hereby grants to You a perpetual,
        worldwide, non-exclusive, no-charge, royalty-free, irrevocable
        copyright license to reproduce, prepare Derivative Works of,
        publicly display, publicly perform, sublicense, and distribute the
        Work and such Derivative Works in Source or Object form.

    3. Grant of Patent License. Subject to the terms and conditions of
        this License, each Contributor hereby grants to You a perpetual,
        worldwide, non-exclusive, no-charge, royalty-free, irrevocable
        (except as stated in this section) patent license to make, have made,
        use, offer to sell, sell, import, and otherwise transfer the Work,
        where such license applies only to those patent claims licensable
        by such Contributor that are necessarily infringed by their
        Contribution(s) alone or by combination of their Contribution(s)
        with the Work to which such Contribution(s) was submitted. If You
        institute patent litigation against any entity (including a
        cross-claim or counterclaim in a lawsuit) alleging that the Work
        or a Contribution incorporated within the Work constitutes direct
        or contributory patent infringement, then any patent licenses
        granted to You under this License for that Work shall terminate
        as of the date such litigation is filed.

    4. Redistribution. You may reproduce and distribute copies of the
        Work or Derivative Works thereof in any medium, with or without
        modifications, and in Source or Object form, provided that You
        meet the following conditions:

        (a) You must give any other recipients of the Work or
            Derivative Works a copy of this License; and

        (b) You must cause any modified files to carry prominent notices
            stating that You changed the files; and

        (c) You must retain, in the Source form of any Derivative Works
            that You distribute, all copyright, patent, trademark, and
            attribution notices from the Source form of the Work,
            excluding those notices that do not pertain to any part of
            the Derivative Works; and

        (d) If the Work includes a "NOTICE" text file as part of its
            distribution, then any Derivative Works that You distribute must
            include a readable copy of the attribution notices contained
            within such NOTICE file, excluding those notices that do not
            pertain to any part of the Derivative Works, in at least one
            of the following places: within a NOTICE text file distributed
            as part of the Derivative Works; within the Source form or
            documentation, if provided along with the Derivative Works; or,
            within a display generated by the Derivative Works, if and
            wherever such third-party notices normally appear. The contents
            of the NOTICE file are for informational purposes only and
            do not modify the License. You may add Your own attribution
            notices within Derivative Works that You distribute, alongside
            or as an addendum to the NOTICE text from the Work, provided
            that such additional attribution notices cannot be construed
            as modifying the License.

        You may add Your own copyright statement to Your modifications and
        may provide additional or different license terms and conditions
        for use, reproduction, or distribution of Your modifications, or
        for any such Derivative Works as a whole, provided Your use,
        reproduction, and distribution of the Work otherwise complies with
        the conditions stated in this License.

    5. Submission of Contributions. Unless You explicitly state otherwise,
        any Contribution intentionally submitted for inclusion in the Work
        by You to the Licensor shall be under the terms and conditions of
        this License, without any additional terms or conditions.
        Notwithstanding the above, nothing herein shall supersede or modify
        the terms of any separate license agreement you may have executed
        with Licensor regarding such Contributions.

    6. Trademarks. This License does not grant permission to use the trade
        names, trademarks, service marks, or product names of the Licensor,
        except as required for reasonable and customary use in describing the
        origin of the Work and reproducing the content of the NOTICE file.

    7. Disclaimer of Warranty. Unless required by applicable law or
        agreed to in writing, Licensor provides the Work (and each
        Contributor provides its Contributions) on an "AS IS" BASIS,
        WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
        implied, including, without limitation, any warranties or conditions
        of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
        PARTICULAR PURPOSE. You are solely responsible for determining the
        appropriateness of using or redistributing the Work and assume any
        risks associated with Your exercise of permissions under this License.

    8. Limitation of Liability. In no event and under no legal theory,
        whether in tort (including negligence), contract, or otherwise,
        unless required by applicable law (such as deliberate and grossly
        negligent acts) or agreed to in writing, shall any Contributor be
        liable to You for damages, including any direct, indirect, special,
        incidental, or consequential damages of any character arising as a
        result of this License or out of the use or inability to use the
        Work (including but not limited to damages for loss of goodwill,
        work stoppage, computer failure or malfunction, or any and all
        other commercial damages or losses), even if such Contributor
        has been advised of the possibility of such damages.

    9. Accepting Warranty or Additional Liability. While redistributing
        the Work or Derivative Works thereof, You may choose to offer,
        and charge a fee for, acceptance of support, warranty, indemnity,
        or other liability obligations and/or rights consistent with this
        License. However, in accepting such obligations, You may act only
        on Your own behalf and on Your sole responsibility, not on behalf
        of any other Contributor, and only if You agree to indemnify,
        defend, and hold each Contributor harmless for any liability
        incurred by, or claims asserted against, such Contributor by reason
        of your accepting any such warranty or additional liability.

    END OF TERMS AND CONDITIONS

    APPENDIX: How to apply the Apache License to your work.

        To apply the Apache License to your work, attach the following
        boilerplate notice, with the fields enclosed by brackets "[]"
        replaced with your own identifying information. (Don't include
        the brackets!)  The text should be enclosed in the appropriate
        comment syntax for the file format. We also recommend that a
        file or class name and description of purpose be included on the
        same "printed page" as the copyright notice for easier
        identification within third-party archives.

    Copyright [yyyy] [name of copyright owner]

    Licensed under the Apache License, Version 2.0 (the "License");
    you may not use this file except in compliance with the License.
    You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.


================================================
FILE: src/axolotl/integrations/cut_cross_entropy/LICENSE
================================================
Copyright (C) 2024 Apple Inc. All Rights Reserved.

IMPORTANT:  This Apple software is supplied to you by Apple
Inc. ("Apple") in consideration of your agreement to the following
terms, and your use, installation, modification or redistribution of
this Apple software constitutes acceptance of these terms.  If you do
not agree with these terms, please do not use, install, modify or
redistribute this Apple software.

In consideration of your agreement to abide by the following terms, and
subject to these terms, Apple grants you a personal, non-exclusive
license, under Apple's copyrights in this original Apple software (the
"Apple Software"), to use, reproduce, modify and redistribute the Apple
Software, with or without modifications, in source and/or binary forms;
provided that if you redistribute the Apple Software in its entirety and
without modifications, you must retain this notice and the following
text and disclaimers in all such redistributions of the Apple Software.
Neither the name, trademarks, service marks or logos of Apple Inc. may
be used to endorse or promote products derived from the Apple Software
without specific prior written permission from Apple.  Except as
expressly stated in this notice, no other rights or licenses, express or
implied, are granted by Apple herein, including but not limited to any
patent rights that may be infringed by your derivative works or by other
works in which the Apple Software may be incorporated.

The Apple Software is provided by Apple on an "AS IS" basis.  APPLE
MAKES NO WARRANTIES, EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION
THE IMPLIED WARRANTIES OF NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS
FOR A PARTICULAR PURPOSE, REGARDING THE APPLE SOFTWARE OR ITS USE AND
OPERATION ALONE OR IN COMBINATION WITH YOUR PRODUCTS.

IN NO EVENT SHALL APPLE BE LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL
OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
INTERRUPTION) ARISING IN ANY WAY OUT OF THE USE, REPRODUCTION,
MODIFICATION AND/OR DISTRIBUTION OF THE APPLE SOFTWARE, HOWEVER CAUSED
AND WHETHER UNDER THEORY OF CONTRACT, TORT (INCLUDING NEGLIGENCE),
STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS BEEN ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.


-------------------------------------------------------------------------------
SOFTWARE DISTRIBUTED WITH CUT CROSS ENTROPY:

The Cut Cross Entropy software includes a number of subcomponents with separate
copyright notices and license terms - please see the file ACKNOWLEDGEMENTS.md.
-------------------------------------------------------------------------------


================================================
FILE: src/axolotl/integrations/cut_cross_entropy/README.md
================================================
# Cut Cross Entropy

Cut Cross Entropy (CCE) reduces VRAM usage through optimization on the cross-entropy operation during loss calculation.

See https://github.com/apple/ml-cross-entropy

## Requirements

- PyTorch 2.4.0 or higher

## Installation

Run the following command to install `cut_cross_entropy[transformers]` if you don't have it already.

- If you are in dev environment
```bash
python scripts/cutcrossentropy_install.py | sh
```

- If you are installing from pip
```bash
pip3 uninstall -y cut-cross-entropy && pip3 install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@63b15e6"
```

## Usage

```yaml
plugins:
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin
```

## Supported Models

- afmoe
- apertus
- arcee
- cohere
- cohere2
- deepseek_v3
- exaone4
- gemma
- gemma2
- gemma3
- gemma3_text
- gemma3n
- gemma3n_text
- glm
- glm4
- glm4_moe
- glm4_moe_lite
- glm46v
- glm4v
- glm4v_moe
- glm_image
- glm_moe_dsa
- gpt_oss
- granite
- granitemoe
- granitemoehybrid
- granitemoeshared
- hunyuan_v1_dense
- hunyuan_v1_moe
- internvl
- kimi_linear
- lfm2
- lfm2_moe
- lfm2_vl
- llama
- llama4
- llama4_text
- llava
- ministral
- ministral3
- mistral
- mistral3
- mistral4
- mixtral
- mllama
- nemotron_h
- olmo
- olmo2
- olmo3
- olmoe
- phi
- phi3
- phi4_multimodal
- qwen2
- qwen2_5_vl
- qwen2_moe
- qwen2_vl
- qwen3
- qwen3_5
- qwen3_5_text
- qwen3_5_moe
- qwen3_5_moe_text
- qwen3_moe
- qwen3_next
- qwen3_vl
- qwen3_vl_moe
- seed_oss
- smollm3
- step3p5
- voxtral

## Citation

```bib
@article{wijmans2024cut,
  author       = {Erik Wijmans and
                  Brody Huval and
                  Alexander Hertzberg and
                  Vladlen Koltun and
                  Philipp Kr\"ahenb\"uhl},
  title        = {Cut Your Losses in Large-Vocabulary Language Models},
  journal      = {arXiv},
  year         = {2024},
  url          = {https://arxiv.org/abs/2411.09009},
}
```


================================================
FILE: src/axolotl/integrations/cut_cross_entropy/__init__.py
================================================
# Copyright 2024 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Module for the Plugin for Cut Cross Entropy integration with Axolotl.

Cut Cross Entropy is an optimized implementation of cross entropy loss
from Apple's ML team.
"""

import importlib
from functools import partial

import torch

from axolotl.integrations.base import BasePlugin
from axolotl.utils import get_pytorch_version
from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix
from axolotl.utils.logging import get_logger

from .args import CutCrossEntropyArgs as CutCrossEntropyArgs

LOG = get_logger(__name__)

_CCE_INSTALL_MESSAGE = (
    "Please install Axolotl's fork of cut_cross_entropy with transformers support using "
    '`pip install "cut-cross-entropy[transformers] @ git+https://github.com/axolotl-ai-cloud/ml-cross-entropy.git@63b15e6"`'
)


class CutCrossEntropyPlugin(BasePlugin):
    """
    Plugin for Cut Cross Entropy integration with Axolotl.
    """

    def get_input_args(self):
        return "axolotl.integrations.cut_cross_entropy.CutCrossEntropyArgs"

    def _check_requirements(self):
        """Check if all requirements are met."""
        # Check PyTorch version

        major, minor, _ = get_pytorch_version()
        if (major, minor) < (2, 4):
            raise ImportError(
                "Cut Cross Entropy requires PyTorch >= 2.4.0. "
                f"Current version: {torch.__version__}"
            )

        # Check if cut_cross_entropy is installed
        cce_spec = importlib.util.find_spec("cut_cross_entropy")
        if cce_spec is None:
            raise ImportError(_CCE_INSTALL_MESSAGE)

        cce_spec_transformers = importlib.util.find_spec(
            "cut_cross_entropy.transformers"
        )
        if cce_spec_transformers is None:
            raise ImportError(
                "Transformers support is not installed. " + _CCE_INSTALL_MESSAGE
            )

        # Check if Axolotl's cce fork is installed
        try:
            from cut_cross_entropy.transformers.patch import AXOLOTL_CCE_FORK

            if not AXOLOTL_CCE_FORK:
                raise ImportError
        except ImportError as e:
            raise ImportError(
                "Axolotl's fork of cut_cross_entropy is not installed. "
                + _CCE_INSTALL_MESSAGE
            ) from e

    def pre_model_load(self, cfg):
        """Apply cut cross entropy before model loading if enabled."""
        if cfg.cut_cross_entropy:
            self._check_requirements()
            self.patch_llama_like(cfg.model_config_type)

            from cut_cross_entropy.transformers.patch import cce_patch

            LOG.info(
                f"Applying Cut Cross Entropy to model type: {cfg.model_config_type}"
            )

            # The patch checks model_type internally

            cce_patch(
                cfg.model_config_type,
                remote_model_id=cfg.base_model if cfg.trust_remote_code else None,
            )

    def patch_llama_like(
        self,
        model_type_to_patch: str,
    ) -> None:
        """
        Generic patch for model architectures with causal lm similar to llama
        """
        from cut_cross_entropy.transformers.patch import PATCH_FNS

        def patch_generic(
            maybe_model,
            patch_options,
            remote_model_id: str | None,
            model_type: str,
        ):
            import cut_cross_entropy.transformers.llama
            from cut_cross_entropy.transformers.llama import cce_forward

            try:
                # Dynamically import the module and CausalLM class
                module_path = f"transformers.models.{model_type}.modeling_{model_type}"
                model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type)
                module = __import__(
                    module_path, fromlist=[f"{model_cls_prefix}ForCausalLM"]
                )
                model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM")

                cut_cross_entropy.transformers.llama._PATCH_OPTS = patch_options

                model_cls.forward = cce_forward

            except (ImportError, AttributeError) as e:
                raise RuntimeError(
                    f"Could not import ForCausalLM class for model_type: {model_type}. "
                    f"Error: {str(e)}"
                ) from e

        if model_type_to_patch not in PATCH_FNS:
            LOG.warning_once(
                "Setting up generic cce patch for model type: %s", model_type_to_patch
            )
            LOG.warning_once(
                f"Generic Cut Cross Entropy + {model_type_to_patch} support is experimental and may not work as expected."
            )
            PATCH_FNS[model_type_to_patch] = partial(
                patch_generic, model_type=model_type_to_patch
            )


================================================
FILE: src/axolotl/integrations/cut_cross_entropy/args.py
================================================
# Copyright 2024 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Module for handling Cut Cross Entropy input arguments.
"""

from typing import Optional

from pydantic import BaseModel, model_validator

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


class CutCrossEntropyArgs(BaseModel):
    """
    Input args for Cut Cross Entropy.
    """

    cut_cross_entropy: Optional[bool] = True

    @model_validator(mode="before")
    @classmethod
    def check_dtype_is_half(cls, data):
        if data.get("cut_cross_entropy") and not (data.get("bf16") or data.get("fp16")):
            raise ValueError(
                "Cut Cross Entropy requires fp16/bf16 training for backward pass. "
                "Please set `bf16` or `fp16` to `True`."
            )

        return data

    @model_validator(mode="before")
    @classmethod
    def check_chunked_cross_entropy_not_set(cls, data):
        if data.get("chunked_cross_entropy"):
            raise ValueError(
                "Cut Cross Entropy does not support chunked cross entropy. "
                "Please set `chunked_cross_entropy` to `False` or disable Cut Cross Entropy."
            )
        return data


================================================
FILE: src/axolotl/integrations/densemixer/README.md
================================================
# DenseMixer

See [DenseMixer](https://github.com/yaof20/DenseMixer/)

# Usage

Simply add the following to your axolotl YAML config:

```yaml
plugins:
  - axolotl.integrations.densemixer.DenseMixerPlugin
```


================================================
FILE: src/axolotl/integrations/densemixer/__init__.py
================================================
"""Integration entry point for the DenseMixer plugin."""

from .plugin import DenseMixerPlugin

__all__ = ["DenseMixerPlugin"]


================================================
FILE: src/axolotl/integrations/densemixer/args.py
================================================
"""Pydantic models for DenseMixer plugin"""

from pydantic import BaseModel


class DenseMixerArgs(BaseModel):
    """
    Args for DenseMixer
    """

    dense_mixer: bool = True


================================================
FILE: src/axolotl/integrations/densemixer/plugin.py
================================================
"""DenseMixer plugin for Axolotl"""

import importlib

from axolotl.integrations.base import BasePlugin
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


class DenseMixerPlugin(BasePlugin):
    """
    Plugin for DenseMixer
    """

    def get_input_args(self) -> str | None:
        return "axolotl.integrations.densemixer.args.DenseMixerArgs"

    def pre_model_load(self, cfg):
        """Apply densemixer patches before model loading if enabled."""
        if cfg.dense_mixer:
            if not importlib.util.find_spec("densemixer"):
                raise RuntimeError(
                    "DenseMixer is not installed. Install it with `pip install densemixer`"
                )

            from densemixer.patching import (
                apply_olmoe_patch,
                apply_qwen2_moe_patch,
                apply_qwen3_moe_patch,
            )

            LOG.info(
                f"Applying DenseMixer patches for model type: {cfg.model_config_type}"
            )

            if cfg.model_config_type == "olmoe":
                apply_olmoe_patch()
            if cfg.model_config_type == "qwen2_moe":
                apply_qwen2_moe_patch()
            if cfg.model_config_type == "qwen3_moe":
                apply_qwen3_moe_patch()


================================================
FILE: src/axolotl/integrations/diffusion/README.md
================================================
# Diffusion LM Training Plugin for Axolotl

This plugin enables diffusion language model training using an approach inspired by
LLaDA (Large Language Diffusion Models) within Axolotl.

## Overview

LLaDA is a diffusion-based approach to language model training that uses:
- **Random token masking** during training instead of next-token prediction
- **Bidirectional attention** to allow the model to attend to the full context
- **Importance weighting** based on masking probabilities for stable training

This approach can lead to more robust language models with better understanding of
bidirectional context.

## Installation

The plugin is included with Axolotl. See our
[installation docs](https://docs.axolotl.ai/docs/installation.html).

## Quickstart

Train with an example config (Llama‑3.2 1B):
   - Pretrain: `axolotl train examples/llama-3/diffusion-3.2-1b-pretrain.yaml`
   - SFT: `axolotl train examples/llama-3/diffusion-3.2-1b-sft.yaml`

### Basic Configuration

You can also modify your existing configs to enable / customize diffusion training.

Add the following to your Axolotl config:

```yaml
# Enable diffusion LM training plugin
plugins:
  - axolotl.integrations.diffusion.DiffusionPlugin
```

And, configure the nested `diffusion` block (defaults shown):

```yaml
diffusion:
  noise_schedule: linear  # or "cosine"
  min_mask_ratio: 0.1
  max_mask_ratio: 0.9
  num_diffusion_steps: 128
  eps: 1e-3
  importance_weighting: true

  # Mask token (training auto-adds if missing, avoid pad/eos)
  mask_token_str: "<|diffusion_mask|>"
  # Or use an existing special token id (e.g., 128002 for Llama-3.x)
  # mask_token_id: 128002

  # Sample generation during training (optional)
  generate_samples: true
  generation_interval: 100
  num_generation_samples: 3
  generation_steps: 128
  generation_temperature: 0.0
  generation_max_length: 100
```

## Supported Models

Any models that support 4D attention masks should work out of the box. If not, please
create an [issue](https://github.com/axolotl-ai-cloud/axolotl/issues) or open a
[PR](https://github.com/axolotl-ai-cloud/axolotl/compare)!

## How It Works

### Random Masking
During training, tokens are randomly masked:
- Sample timestep `t` uniformly from [0, 1]
- Calculate masking probability: `p = (1 - eps) * t + eps`
- Randomly mask tokens with probability `p`

### Diffusion Loss

Loss is computed only on masked tokens with (optional) importance weighting:

```python
loss = sum(cross_entropy(pred, target) / p_mask) / total_tokens
```

## Sample Generation

When `diffusion.generate_samples: true`, the plugin generates samples during training:

```
Sample 1:
   Original (45 tokens): The quick brown fox jumps over the lazy dog...
   Masked (18/45 tokens, 40.0%): The [MASK] [MASK] fox [MASK] over [MASK] lazy [MASK]...
   Generated: The quick brown fox jumps over the lazy dog...
```

Samples are logged to console and wandb (if enabled).

## Inference

Diffusion inference is integrated into the standard Axolotl CLI. Use the same config
you trained with and run:

```
axolotl inference path/to/your-config.yaml
```

Optionally, pass `--gradio` to use a simple web interface.

Interactive controls (prefix the prompt with commands):
- `:complete N` → completion mode with N new masked tokens appended (default 64)
- `:mask R` → random masking mode with target mask ratio R in [0.0, 1.0]

Example session:

```
================================================================================
Commands:
:complete N -> completion mode with N tokens (default 64)
:mask R     -> random masking with ratio R (0.0–1.0)
================================================================================
Give me an instruction (Ctrl + D to submit):

:mask 0.4 The quick brown fox jumps over the lazy dog

Masked (40.0%):
The [MASK] brown [MASK] jumps over the [MASK] dog

Generated:
The quick brown fox jumps over the loud dog
```

## Metrics and Monitoring

The plugin adds (or modifies) several metrics to track diffusion training:

- `train/loss`: Weighted diffusion loss
- `train/accuracy`: Accuracy on masked tokens
- `train/mask_ratio`: Average fraction of tokens masked
- `train/num_masked_tokens`: Number of tokens masked
- `train/avg_p_mask`: Average masking probability
- `train/ce_loss`: Unweighted cross-entropy loss
- `train/importance_weight_avg`: Average importance weight

## Limitations

- No flash attention support
- No RL training support

## References

- [LLaDA Paper](https://arxiv.org/abs/2404.10406)
- [Axolotl Documentation](https://docs.axolotl.ai/)
- [API reference for plugin](https://docs.axolotl.ai/docs/api/integrations.diffusion.args.html#axolotl.integrations.diffusion.args)


================================================
FILE: src/axolotl/integrations/diffusion/__init__.py
================================================
"""Diffusion LM training plugin init."""

from .args import DiffusionArgs, DiffusionConfig
from .callbacks import DiffusionGenerationCallback
from .generation import generate
from .plugin import DiffusionPlugin
from .trainer import DiffusionTrainer
from .utils import create_bidirectional_attention_mask, resolve_mask_token_id

__all__ = [
    "DiffusionArgs",
    "DiffusionPlugin",
    "DiffusionTrainer",
    "generate",
    "resolve_mask_token_id",
    "create_bidirectional_attention_mask",
    "DiffusionGenerationCallback",
    "DiffusionConfig",
]


================================================
FILE: src/axolotl/integrations/diffusion/args.py
================================================
"""Config args for diffusion LM training (nested under `diffusion:`)."""

from __future__ import annotations

from typing import Literal

from pydantic import BaseModel, Field, model_validator


class DiffusionConfig(BaseModel):
    """Nested diffusion configuration available under the `diffusion` key."""

    # Noise schedule config
    noise_schedule: Literal["linear", "cosine"] = Field(
        default="linear", description="Type of noise schedule for diffusion training"
    )
    min_mask_ratio: float = Field(
        default=0.1,
        ge=0.0,
        le=1.0,
        description="Minimum masking ratio for diffusion noise schedule",
    )
    max_mask_ratio: float = Field(
        default=0.9,
        ge=0.0,
        le=1.0,
        description="Maximum masking ratio for diffusion noise schedule",
    )
    num_diffusion_steps: int = Field(
        default=128, ge=1, description="Number of diffusion timesteps"
    )
    eps: float = Field(
        default=1e-3,
        ge=0.0,
        le=1.0,
        description="Epsilon value for minimum masking probability in forward process",
    )

    # Training config
    importance_weighting: bool = Field(
        default=True,
        description="Apply importance weighting to loss based on masking probability",
    )
    mask_token_id: int | None = Field(
        default=None,
        description=(
            "Token ID to use for masking. Unset by default; can use one of the "
            "tokenizer's special tokens here."
        ),
    )
    mask_token_str: str | None = Field(
        default=None,
        description=(
            "Token string to use as a mask. If `mask_token_id` is invalid or unset, "
            "this token will be ensured to exist as an additional special token and "
            "used. If absent, a default '<|diffusion_mask|>' will be added."
        ),
    )

    # Sample generation config
    generate_samples: bool = Field(
        default=True, description="Enable sample generation during training"
    )
    generation_interval: int = Field(
        default=100, ge=1, description="Generate samples every N steps"
    )
    num_generation_samples: int = Field(
        default=3, ge=1, description="Number of samples to generate each time"
    )
    generation_steps: int = Field(
        default=128, ge=1, description="Number of diffusion steps for generation"
    )
    generation_temperature: float = Field(
        default=0.0,
        ge=0.0,
        description="Temperature for generation sampling (0.0 = deterministic)",
    )
    generation_max_length: int = Field(
        default=100, ge=1, description="Maximum sequence length for generation"
    )

    @model_validator(mode="after")
    def _validate_mask_ratios(self) -> "DiffusionConfig":
        if self.min_mask_ratio > self.max_mask_ratio:
            raise ValueError("min_mask_ratio must be ≤ max_mask_ratio")
        return self


class DiffusionArgs(BaseModel):
    """Plugin entry that exposes the nested `diffusion` block to the core config."""

    diffusion: DiffusionConfig = Field(
        default_factory=DiffusionConfig,
        description="Diffusion training configuration. Only nested block is supported.",
    )


================================================
FILE: src/axolotl/integrations/diffusion/callbacks.py
================================================
"""Callbacks for diffusion training."""

import logging
import sys

import wandb
from colorama import Fore, Style
from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState
from transformers.training_args import TrainingArguments

from .generation import generate_samples

# Simpler logger for more readable sample generation
logger = logging.getLogger(__name__)
if not logger.handlers:
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(logging.Formatter("%(message)s"))
    logger.addHandler(handler)
    logger.propagate = False
logger.setLevel(logging.INFO)


class DiffusionGenerationCallback(TrainerCallback):
    """Callback for generating samples during diffusion training."""

    def __init__(self, trainer):
        self.trainer = trainer

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Generate samples at specified intervals."""
        if (
            state.global_step > 0
            and state.global_step % self.trainer.cfg.diffusion.generation_interval == 0
        ):
            if not self.trainer.state.is_world_process_zero:
                return

            # Use eval dataloader if available, otherwise use train dataloader
            dataloader = None
            try:
                if getattr(self.trainer, "eval_dataset", None) is not None:
                    dataloader = self.trainer.get_eval_dataloader()
            except Exception:
                dataloader = None
            if dataloader is None:
                dataloader = self.trainer.get_train_dataloader()

            # Generate samples
            diffusion_cfg = self.trainer.cfg.diffusion
            samples = generate_samples(
                model=self.trainer.model,
                tokenizer=self.trainer.processing_class,
                dataloader=dataloader,
                num_generation_samples=diffusion_cfg.num_generation_samples,
                max_length=diffusion_cfg.generation_max_length,
                num_diffusion_steps=diffusion_cfg.generation_steps,
                temperature=diffusion_cfg.generation_temperature,
                mask_token_id=diffusion_cfg.mask_token_id,
            )

            # Log samples
            self._log_samples(samples, state.global_step)

    def _log_samples(self, samples: list, step: int):
        """Log generated samples."""
        if not samples:
            return

        logger.info("=" * 60)
        logger.info("GENERATED SAMPLES")
        logger.info("=" * 60)

        for i, sample_data in enumerate(samples, 1):
            original = sample_data["original"]
            masked = sample_data["masked"]
            generated = sample_data["generated"]
            mask_ratio = sample_data["mask_ratio"]
            masked_tokens = sample_data["masked_tokens"]
            total_tokens = sample_data["total_tokens"]

            logger.info(f"\nSample {i}:")
            logger.info(f"\tOriginal ({total_tokens} tokens): {original}")
            logger.info(
                f"\tMasked ({masked_tokens}/{total_tokens} tokens, "
                f"{mask_ratio:.1%}): {masked}"
            )

            try:
                gen_ids = sample_data.get("generated_ids")
                orig_ids = sample_data.get("orig_ids")
                masked_positions = set(sample_data.get("masked_positions") or [])
                if isinstance(gen_ids, list) and isinstance(orig_ids, list):
                    styles: list[str] = []
                    for i, tid in enumerate(gen_ids):
                        if i in masked_positions:
                            if i < len(orig_ids) and tid == orig_ids[i]:
                                styles.append("green")
                            elif i < len(orig_ids):
                                styles.append("red")
                            else:
                                styles.append("normal")
                        else:
                            same = i < len(orig_ids) and tid == orig_ids[i]
                            styles.append("dim" if same else "normal")

                    spans: list[tuple[str, int, int]] = []
                    if gen_ids:
                        cur = styles[0]
                        start = 0
                        for i in range(1, len(gen_ids)):
                            s = styles[i]
                            if s != cur:
                                spans.append((cur, start, i))
                                cur, start = s, i
                        spans.append((cur, start, len(gen_ids)))

                    parts = []
                    for style_name, a, b in spans:
                        chunk_text = self.trainer.processing_class.decode(
                            gen_ids[a:b], skip_special_tokens=False
                        )
                        if style_name == "green":
                            parts.append(Fore.GREEN + chunk_text + Style.RESET_ALL)
                        elif style_name == "red":
                            parts.append(Fore.RED + chunk_text + Style.RESET_ALL)
                        else:
                            if style_name == "dim":
                                parts.append(Style.DIM + chunk_text + Style.RESET_ALL)
                            else:
                                parts.append(chunk_text)
                    logger.info("\tGenerated:\n%s", "".join(parts))
                else:
                    logger.info(f"\tGenerated: {generated}")
            except Exception:
                logger.info(f"\tGenerated: {generated}")

        logger.info("=" * 60)

        if self.trainer.cfg.use_wandb:
            if wandb.run is not None:
                wandb.log(
                    {
                        "generated_samples": wandb.Table(
                            columns=[
                                "step",
                                "original",
                                "masked",
                                "generated",
                                "mask_ratio",
                                "masked_tokens",
                                "total_tokens",
                            ],
                            data=[
                                [
                                    step,
                                    sample["original"],
                                    sample["masked"],
                                    sample["generated"],
                                    f"{sample['mask_ratio']:.1%}",
                                    sample["masked_tokens"],
                                    sample["total_tokens"],
                                ]
                                for sample in samples
                            ],
                        )
                    },
                    step=step,
                )


================================================
FILE: src/axolotl/integrations/diffusion/generation.py
================================================
"""Sample generation utilities for diffusion training."""

import re
from typing import Any, List, Literal, Optional

import torch

from axolotl.utils.logging import get_logger

from .utils import create_bidirectional_attention_mask, shift_logits_to_input_positions

LOG = get_logger(__name__)


def generate_samples(
    model: torch.nn.Module,
    tokenizer: Any,
    dataloader: Optional[Any] = None,
    num_generation_samples: int = 3,
    max_length: int = 100,
    num_diffusion_steps: int = 128,
    temperature: float = 0.0,
    mask_token_id: int = 32000,
    mode: Literal["random", "completion"] = "random",
    completion_tokens: int = 0,
    target_mask_ratio: Optional[float] = None,
) -> List[dict]:
    """
    Generate text samples using the diffusion model by randomly masking sequences from
    the given dataset and running the reverse diffusion process.

    Args:
        model: The wrapped or unwrapped model
        tokenizer: Tokenizer for encoding/decoding
        dataloader: Validation dataloader (for sampling sequences)
        num_generation_samples: Number of samples to generate
        max_length: Maximum length of sequences to use
        num_diffusion_steps: Number of diffusion steps for generation
        temperature: Temperature for sampling (0.0 = deterministic)
        mask_token_id: Token ID used for masking

    Returns:
        List of dictionaries with original text, masked text, and generated text
    """
    if dataloader is None:
        LOG.warning("No validation dataloader provided, cannot generate samples")
        return []

    unwrapped_model = model.module if hasattr(model, "module") else model
    training = unwrapped_model.training
    unwrapped_model.eval()

    # Resolve device robustly (some modules don't expose `.device`)
    device = getattr(unwrapped_model, "device", None)
    if device is None:
        try:
            device = next(unwrapped_model.parameters()).device
        except StopIteration:
            device = torch.device("cpu")
    generations = []

    # Sample sequences from validation dataset
    sampled_sequences = _sample_sequences_from_dataloader(
        dataloader, num_generation_samples, max_length, device
    )
    LOG.info(f"Sampled {len(sampled_sequences)} sequences from validation dataset")

    # Generate samples using reverse diffusion process
    with torch.no_grad():
        for sample in sampled_sequences:
            if isinstance(sample, dict):
                original_sequence = sample.get("input_ids")
                labels_seq = sample.get("labels")
                attn_seq = sample.get("attention_mask")
            else:
                original_sequence = sample
                labels_seq = None
                attn_seq = None
            generation_result = generate(
                unwrapped_model,
                tokenizer,
                original_sequence,
                num_diffusion_steps,
                temperature,
                mask_token_id,
                mode=mode,
                completion_tokens=completion_tokens,
                target_mask_ratio=target_mask_ratio,
                labels=labels_seq,
                attention_mask=attn_seq,
            )
            generations.append(generation_result)

    # Restore prior training state
    if training:
        unwrapped_model.train()
    else:
        unwrapped_model.eval()

    return generations


def _sample_sequences_from_dataloader(
    dataloader: Any, num_samples: int, max_length: int, device: torch.device
) -> List[Any]:
    """Sample sequences from validation dataloader."""
    sampled_sequences: list[dict[str, torch.Tensor] | torch.Tensor] = []
    sample_count = 0

    # Skip a random number of batches (we could be more clever about this)
    skip_batches = torch.randint(0, 10, (1,)).item()
    batch_count = 0

    for batch in dataloader:
        # Skip some batches for variety
        if batch_count < skip_batches:
            batch_count += 1
            continue

        if sample_count >= num_samples:
            break

        batch_count += 1
        input_ids = batch["input_ids"]
        attention_mask = batch.get("attention_mask")
        labels = batch.get("labels")

        # Randomly sample from sequences in this batch
        batch_indices = torch.randperm(input_ids.size(0)).tolist()

        for i in batch_indices:
            if sample_count >= num_samples:
                break

            # Get actual sequence length (non-padded)
            if attention_mask is not None:
                seq_len = attention_mask[i].sum().item()
            else:
                seq_len = input_ids.size(1)

            if seq_len < 10:
                continue

            # Determine truncation length
            max_total = min(seq_len, max_length)
            if labels is not None:
                labels_i = labels[i][:seq_len]
                answer_mask = labels_i != -100
                if not answer_mask.any():
                    # No answer tokens; skip for SFT masking
                    continue
                first_ans_idx = int(
                    torch.nonzero(answer_mask, as_tuple=False)[0].item()
                )
                prompt_len = first_ans_idx
                if prompt_len >= max_total:
                    # Prompt alone reaches cap; cannot include any answer
                    continue
                remaining_answer = int(answer_mask[prompt_len:].sum().item())
                allowed_answer = max_total - prompt_len
                take_answer = min(remaining_answer, allowed_answer)
                if take_answer <= 0:
                    continue
                actual_length = prompt_len + take_answer
            else:
                actual_length = max_total

            # Extract the (possibly truncated) sequence
            sequence = input_ids[i][:actual_length].unsqueeze(0).to(device)
            attn_seq = (
                attention_mask[i][:actual_length].unsqueeze(0).to(device)
                if attention_mask is not None
                else None
            )
            if labels is not None:
                labels_seq = labels[i][:actual_length].unsqueeze(0).to(device)
                sampled_sequences.append(
                    {
                        "input_ids": sequence,
                        "labels": labels_seq,
                        "attention_mask": attn_seq,
                    }
                )
            else:
                if attn_seq is not None:
                    sampled_sequences.append(
                        {"input_ids": sequence, "attention_mask": attn_seq}
                    )
                else:
                    sampled_sequences.append(sequence)
            sample_count += 1

    return sampled_sequences


def generate(
    model: torch.nn.Module,
    tokenizer: Any,
    original_sequence: torch.Tensor,
    num_diffusion_steps: int,
    temperature: float,
    mask_token_id: int,
    *,
    mode: Literal["random", "completion"] = "random",
    completion_tokens: int = 0,
    target_mask_ratio: Optional[float] = None,
    labels: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
) -> dict:
    """Generate a single sample using reverse diffusion."""
    # Get original text for comparison
    original_text = tokenizer.decode(
        original_sequence[0].cpu(), skip_special_tokens=True
    )

    # Build masked sequence
    if (
        labels is not None
        and labels.numel() > 0
        and (labels == -100).any()
        and (labels != -100).any()
    ):
        # SFT case: completely mask all answer tokens (labels != -100)
        total_tokens = original_sequence.size(1)
        masked_indices = (labels != -100).to(dtype=torch.bool)
        masked_sequence = original_sequence.clone()
        masked_sequence[masked_indices] = mask_token_id
        masked_tokens = int(masked_indices.sum().item())
        mask_ratio = masked_tokens / max(int(total_tokens), 1)
    elif mode == "completion" and completion_tokens > 0:
        # Append mask tokens to the right for completion
        total_tokens = original_sequence.size(1) + int(completion_tokens)
        masked_indices = torch.zeros(
            1, total_tokens, dtype=torch.bool, device=original_sequence.device
        )
        masked_indices[0, -int(completion_tokens) :] = True

        append = torch.full(
            (1, int(completion_tokens)), mask_token_id, device=original_sequence.device
        )
        masked_sequence = torch.cat([original_sequence, append], dim=1)
        masked_tokens = int(completion_tokens)
        mask_ratio = masked_tokens / total_tokens
    else:
        # Apply random masking with optional fixed ratio
        total_tokens = original_sequence.size(1)
        if target_mask_ratio is None:
            min_ratio, max_ratio = 0.1, 0.7
            target_mask_ratio = (
                torch.rand(1).item() * (max_ratio - min_ratio) + min_ratio
            )
        target_masked_tokens = max(1, int(total_tokens * float(target_mask_ratio)))

        # Create random mask indices
        mask_positions = torch.randperm(total_tokens)[:target_masked_tokens]
        masked_indices = torch.zeros(
            1, total_tokens, dtype=torch.bool, device=original_sequence.device
        )
        masked_indices[0, mask_positions] = True

        # Create masked sequence
        masked_sequence = original_sequence.clone()
        masked_sequence[masked_indices] = mask_token_id

        # Calculate actual mask ratio
        masked_tokens = masked_indices.sum().item()
        mask_ratio = masked_tokens / total_tokens

    # Get masked text for comparison
    masked_text = tokenizer.decode(masked_sequence[0].cpu(), skip_special_tokens=False)
    masked_text = _clean_masked_text(masked_text, tokenizer, mask_token_id)

    # Run reverse diffusion process
    sequence = masked_sequence.clone()
    attention_mask = create_bidirectional_attention_mask(
        sequence, attention_mask, sample_packing=attention_mask is not None
    )
    for step in range(num_diffusion_steps):
        sequence = _diffusion_step(
            model,
            sequence,
            step,
            num_diffusion_steps,
            temperature,
            mask_token_id,
            attention_mask,
        )
    generated_text = tokenizer.decode(sequence[0].cpu(), skip_special_tokens=True)

    # Collect diagnostic info
    final_ids = sequence[0].detach().cpu().tolist()
    orig_ids_for_render = original_sequence[0].detach().cpu().tolist()
    if masked_indices is not None:
        masked_positions = (
            torch.where(masked_indices[0])[0].detach().cpu().tolist()
            if masked_indices.ndim == 2
            else []
        )
    else:
        masked_positions = []

    result = {
        "original": original_text,
        "masked": masked_text,
        "generated": generated_text,
        "mask_ratio": mask_ratio,
        "masked_tokens": masked_tokens,
        "total_tokens": total_tokens,
        "generated_ids": final_ids,
        "masked_positions": masked_positions,
        "orig_ids": orig_ids_for_render,
        "formatted": (
            f"Original: '{original_text}' → Masked: '{masked_text}' "
            f"({mask_ratio:.1%}) → Generated: '{generated_text}'"
        ),
    }

    return result


def _clean_masked_text(masked_text: str, tokenizer: Any, mask_token_id: int) -> str:
    """Clean up masked text for display."""
    mask_token_repr = tokenizer.decode([mask_token_id], skip_special_tokens=False)
    cleaned = masked_text.replace(mask_token_repr, "[MASK]")

    # Remove literal special token strings
    if hasattr(tokenizer, "special_tokens_map"):
        for token_value in tokenizer.special_tokens_map.values():
            if token_value and isinstance(token_value, str):
                cleaned = cleaned.replace(token_value, "")

    # Normalize whitespace but preserve newlines
    cleaned = cleaned.replace("\r\n", "\n").replace("\r", "\n")
    cleaned = re.sub(r"[ \t]+", " ", cleaned)
    cleaned = "\n".join(line.rstrip() for line in cleaned.split("\n")).strip()
    return cleaned


def _diffusion_step(
    model: torch.nn.Module,
    sequence: torch.Tensor,
    step: int,
    num_diffusion_steps: int,
    temperature: float,
    mask_token_id: int,
    attention_mask: torch.Tensor | None = None,
) -> torch.Tensor:
    """Perform a single diffusion step with remasking."""
    # Only process if there are masked tokens remaining
    current_mask = sequence == mask_token_id
    if not current_mask.any():
        return sequence

    # Create or use provided attention mask
    if attention_mask is None:
        batch_size, seq_len = sequence.shape
        attention_mask = torch.ones(
            batch_size, 1, seq_len, seq_len, dtype=torch.bool, device=sequence.device
        )

    # Forward pass
    outputs = model(input_ids=sequence, attention_mask=attention_mask)
    logits = shift_logits_to_input_positions(outputs.logits)

    # Only sample at currently masked positions
    if current_mask.any():
        masked_logits = logits[current_mask]

        # Apply temperature scaling
        if temperature > 0:
            scaled_logits = masked_logits / temperature
        else:
            scaled_logits = masked_logits

        # Suppress mask token in outputs
        scaled_logits[:, mask_token_id] = -float("inf")

        if temperature > 0:
            # Add Gumbel noise for sampling
            gumbel_noise = -torch.log(
                -torch.log(torch.rand_like(scaled_logits, dtype=torch.float32))
            )
            gumbel_logits = scaled_logits + gumbel_noise
            predicted_tokens = torch.argmax(gumbel_logits, dim=-1)
        else:
            predicted_tokens = torch.argmax(scaled_logits, dim=-1)

        # Calculate probabilities for confidence scoring
        probs = torch.softmax(scaled_logits, dim=-1)
        predicted_token_probs = probs[range(len(predicted_tokens)), predicted_tokens]

        # Determine how many tokens to unmask this step
        remaining_masked = current_mask.sum().item()
        if step == num_diffusion_steps - 1:
            num_to_unmask = remaining_masked
        else:
            unmask_ratio = 1.0 / (num_diffusion_steps - step)
            num_to_unmask = max(1, int(remaining_masked * unmask_ratio))

        # Select highest confidence predictions to unmask
        if num_to_unmask >= remaining_masked:
            sequence[current_mask] = predicted_tokens
        else:
            _, top_indices = predicted_token_probs.topk(num_to_unmask)
            mask_positions = torch.where(current_mask)[1]
            positions_to_unmask = mask_positions[top_indices]
            sequence[0, positions_to_unmask] = predicted_tokens[top_indices]

    return sequence


================================================
FILE: src/axolotl/integrations/diffusion/plugin.py
================================================
"""Diffusion LM training plugin for Axolotl."""

from peft import PeftModel
from transformers import PreTrainedModel

from axolotl.integrations.base import BasePlugin
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

from .trainer import DiffusionTrainer

LOG = get_logger(__name__)


class DiffusionPlugin(BasePlugin):
    """
    Plugin for diffusion language model training.

    This plugin enables diffusion-based training using the LLaDA approach, which uses
    random masking and bidirectional attention to train language models.
    """

    def __init__(self):
        super().__init__()
        self.cfg = None

    def get_input_args(self) -> str:
        """Returns the pydantic model for LLaDA plugin arguments."""
        return "axolotl.integrations.diffusion.DiffusionArgs"

    def post_model_load(self, cfg: DictDefault, model: PreTrainedModel | PeftModel):
        """Perform actions after model is loaded."""
        self.cfg = cfg

    def get_trainer_cls(self, cfg: DictDefault) -> type[DiffusionTrainer] | None:
        """Return custom trainer class for diffusion training."""
        return DiffusionTrainer

    def post_trainer_create(self, cfg: DictDefault, trainer: DiffusionTrainer):
        """Configure trainer after creation."""
        trainer.set_config(cfg)


================================================
FILE: src/axolotl/integrations/diffusion/trainer.py
================================================
"""Custom trainer for diffusion LM training."""

from typing import Any, Literal

import torch
import torch.nn.functional as F
from torch import nn

from axolotl.core.trainers.base import AxolotlTrainer
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

from .callbacks import DiffusionGenerationCallback
from .utils import create_bidirectional_attention_mask, shift_logits_to_input_positions

LOG = get_logger(__name__)


class DiffusionTrainer(AxolotlTrainer):
    """Custom trainer for diffusion LM training that overrides loss computation."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.cfg = None
        self._special_token_ids = None

    def set_config(self, config: DictDefault):
        """Set config for diffusion training."""
        self.cfg = config
        self._cache_special_token_ids()
        self._resolve_mask_token_id()

        token_id = int(getattr(self.cfg.diffusion, "mask_token_id", 0))
        LOG.info(f"Diffusion: using mask_token_id={token_id}")

        if getattr(config.diffusion, "generate_samples", True):
            generation_callback = DiffusionGenerationCallback(self)
            self.add_callback(generation_callback)

    def _resolve_mask_token_id(self) -> None:
        """Ensure mask_token_id is valid for the current tokenizer."""
        from .utils import resolve_mask_token_id

        tokenizer = getattr(self, "processing_class", None)
        if tokenizer is None:
            return

        mid = resolve_mask_token_id(
            tokenizer,
            self.cfg,
            allow_add=True,
            model=getattr(self, "model", None),
        )
        try:
            self.cfg.diffusion.mask_token_id = int(mid)
        except Exception:
            pass

    def compute_loss(
        self,
        model: nn.Module,
        inputs: dict[str, torch.Tensor],
        return_outputs: bool = False,
        num_items_in_batch: torch.Tensor | None = None,
    ) -> torch.Tensor | tuple[torch.Tensor, dict[str, torch.Tensor]]:
        """Override compute_loss to use diffusion loss."""
        input_ids = inputs.get("input_ids")
        attention_mask = inputs.get("attention_mask")
        labels = inputs.get("labels")

        if input_ids is None:
            raise ValueError("input_ids is required for diffusion training")

        loss, outputs = self._compute_diffusion_loss(
            model, input_ids, attention_mask, labels
        )

        if return_outputs:
            return loss, outputs
        return loss

    def _cache_special_token_ids(self):
        """Cache special token IDs to avoid repeated tokenizer access."""
        if self.processing_class is None:
            self._special_token_ids = set()
            return

        tokenizer = self.processing_class
        special_tokens = set()

        if hasattr(tokenizer, "bos_token_id") and tokenizer.bos_token_id is not None:
            special_tokens.add(tokenizer.bos_token_id)
        if hasattr(tokenizer, "eos_token_id") and tokenizer.eos_token_id is not None:
            special_tokens.add(tokenizer.eos_token_id)
        if hasattr(tokenizer, "pad_token_id") and tokenizer.pad_token_id is not None:
            special_tokens.add(tokenizer.pad_token_id)

        self._special_token_ids = special_tokens

    def _forward_process(
        self,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
        eps: float = 1e-3,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Forward noising process. A timestep is sampled along the process, and tokens are
        masked with probability determined by the configured noise schedule.

        Args:
            input_ids: Input token ids [batch_size, seq_len].
            attention_mask: Attention mask [batch_size, seq_len].
            labels: Labels for SFT training [batch_size, seq_len].
            eps: Small epsilon value for minimum masking probability.

        Returns:
            noisy_batch: Input with some tokens masked.
            masked_indices: Boolean mask indicating which tokens were masked.
            p_mask: Masking probabilities for each token [batch_size, seq_len].
        """
        batch_size, seq_len = input_ids.shape
        device = input_ids.device

        # Sample random timesteps for each sample in batch
        t = torch.rand(batch_size, device=device)
        p_mask = (1 - eps) * t + eps  # [batch_size]
        p_mask = p_mask[:, None].repeat(1, seq_len)  # [batch_size, seq_len]

        # Don't mask padding tokens if attention_mask is provided
        if attention_mask is not None:
            valid_mask = attention_mask.bool()
            p_mask = p_mask * valid_mask.float()

        # Create mask to exclude special tokens
        special_token_mask = torch.zeros_like(input_ids, dtype=torch.bool)
        if self._special_token_ids:
            for token_id in self._special_token_ids:
                special_token_mask |= input_ids == token_id

        # Create random mask based on p_mask
        masked_indices = torch.rand((batch_size, seq_len), device=device) < p_mask
        masked_indices = masked_indices & ~special_token_mask
        if attention_mask is not None:
            masked_indices = masked_indices & attention_mask.bool()

        # For SFT data, only mask answer tokens
        if labels is not None:
            answer_mask = labels != -100
            masked_indices = masked_indices & answer_mask

        # Create masked input
        mask_token_id = int(self.cfg.diffusion.mask_token_id)
        mask_value = torch.full_like(input_ids, mask_token_id)
        noisy_batch = torch.where(masked_indices, mask_value, input_ids)

        return noisy_batch, masked_indices, p_mask

    def _compute_diffusion_loss(
        self,
        model: nn.Module,
        input_ids: torch.Tensor,
        attention_mask: torch.Tensor | None = None,
        labels: torch.Tensor | None = None,
    ) -> tuple[torch.Tensor, torch.Tensor | Any]:
        """
        Compute diffusion loss.

        Args:
            model: The model to compute loss for.
            input_ids: Ground truth token ids [batch_size, seq_len].
            attention_mask: Attention mask [batch_size, seq_len].
            labels: Labels for SFT training [batch_size, seq_len].

        Returns:
            loss: Cross-entropy loss.
            metrics: Dictionary of metrics.
        """
        # Short-circuit empty sequences
        if input_ids is None or input_ids.numel() == 0 or input_ids.shape[1] == 0:
            zero = torch.tensor(
                0.0,
                device=(input_ids.device if input_ids is not None else None),
                requires_grad=True,
            )
            return zero, {}

        # If an attention_mask is provided and all positions are padding for every
        # sample in this batch, skip the step.
        if attention_mask is not None:
            if attention_mask.dim() == 2 and (attention_mask.sum(dim=1) == 0).all():
                zero = torch.tensor(0.0, device=input_ids.device, requires_grad=True)
                return zero, {}

        # Apply forward process
        noisy_batch, masked_indices, p_mask = self._forward_process(
            input_ids, attention_mask, labels, self.cfg.diffusion.eps
        )

        # Create bidirectional attention mask
        bidirectional_mask = create_bidirectional_attention_mask(
            input_ids, attention_mask, sample_packing=self.cfg.sample_packing
        )

        # Forward pass
        outputs = model(
            input_ids=noisy_batch.long(),
            attention_mask=bidirectional_mask,
        )
        logits = shift_logits_to_input_positions(outputs.logits)

        if masked_indices.sum() > 0:
            valid_indices = torch.where(masked_indices)
            batch_indices, seq_indices = valid_indices

            masked_logits = logits[batch_indices, seq_indices]
            masked_targets = input_ids[batch_indices, seq_indices]
            masked_p_mask = p_mask[batch_indices, seq_indices]

            # Compute cross-entropy loss without reduction
            token_loss = F.cross_entropy(
                masked_logits.float(), masked_targets, reduction="none"
            )

            if self.cfg.diffusion.importance_weighting:
                masked_p_mask = masked_p_mask.float()
                weighted_loss = token_loss / masked_p_mask
            else:
                weighted_loss = token_loss

            if labels is not None:
                # For SFT data: normalize by answer token count per sample
                answer_mask = labels != -100
                answer_lengths = answer_mask.sum(dim=1).float()  # [batch_size]

                # Get batch indices for masked tokens
                masked_batch_indices = batch_indices

                # Sum losses per sample and divide by answer length
                batch_size = input_ids.shape[0]
                loss_per_sample = torch.zeros(batch_size, device=input_ids.device)
                for i in range(batch_size):
                    sample_mask = masked_batch_indices == i
                    if sample_mask.sum() > 0:
                        sample_loss = weighted_loss[sample_mask].sum()
                        denom = answer_lengths[i].clamp(min=1.0)
                        loss_per_sample[i] = sample_loss / denom

                loss = loss_per_sample.mean()
            else:
                # Non-SFT: when importance weighting is enabled, use unbiased estimator
                # (sum(loss/p) / total_tokens). Otherwise, average over masked tokens
                # for stable scaling across varying mask ratios.
                if self.cfg.diffusion.importance_weighting:
                    loss = weighted_loss.sum() / (
                        input_ids.shape[0] * input_ids.shape[1]
                    )
                else:
                    loss = weighted_loss.mean()

            ce_loss = token_loss.mean()

            # Compute accuracy on masked tokens
            with torch.no_grad():
                pred_tokens = masked_logits.argmax(dim=-1)
                accuracy = (pred_tokens == masked_targets).float().mean()
        else:
            loss = torch.tensor(0.0, device=input_ids.device, requires_grad=True)
            accuracy = torch.tensor(0.0, device=input_ids.device)
            ce_loss = torch.tensor(0.0, device=input_ids.device)
            masked_p_mask = torch.tensor(1.0, device=input_ids.device)

        avg_p_mask = (
            p_mask[masked_indices].mean().item() if masked_indices.any() else 0.0
        )
        metrics = {
            "loss": loss.item(),
            "accuracy": accuracy.item(),
            "mask_ratio": masked_indices.float().mean().item(),
            "num_masked_tokens": (masked_indices.sum().item(), "sum"),
            "avg_p_mask": avg_p_mask,
            "ce_loss": ce_loss.item(),
        }

        # If doing SFT training, log answer-specific metrics
        if self.cfg.datasets is not None:
            with torch.no_grad():
                answer_mask = labels != -100
                answer_lengths = answer_mask.sum(dim=1).float()  # type: ignore
                total_answer_tokens = answer_mask.sum().item()  # type: ignore
                total_tokens = labels.numel()  # type: ignore
                metrics["answer_ratio"] = total_answer_tokens / max(total_tokens, 1)
                metrics["avg_answer_length"] = answer_lengths.mean().item()

        if self.cfg.diffusion.importance_weighting:
            metrics["importance_weight_avg"] = (1.0 / masked_p_mask).mean().item()

        train_eval: Literal["train", "eval"] = "train" if model.training else "eval"
        self.store_metrics(metrics, train_eval=train_eval)

        return loss, outputs


================================================
FILE: src/axolotl/integrations/diffusion/utils.py
================================================
"""Shared utilities for diffusion integration."""

from __future__ import annotations

from typing import Any, Optional

import torch

from axolotl.utils.dict import DictDefault


def resolve_mask_token_id(
    tokenizer: Any,
    cfg: DictDefault,
    *,
    allow_add: bool,
    model: Any | None = None,
    default_token: str = "<|diffusion_mask|>",
) -> int:
    """Resolve mask token id. Training may add a new special token; inference won't."""
    # Determine vocab size if available
    vocab_size = None
    if tokenizer is not None:
        if hasattr(tokenizer, "vocab_size") and tokenizer.vocab_size is not None:
            try:
                vocab_size = int(tokenizer.vocab_size)  # type: ignore[arg-type]
            except Exception:
                vocab_size = None
        elif hasattr(tokenizer, "__len__"):
            try:
                vocab_size = int(len(tokenizer))
            except Exception:
                vocab_size = None

    # Use explicit id from config if provided
    diffusion_cfg = getattr(cfg, "diffusion", None)
    # Fallback to top-level attr names only if nested missing (shouldn't happen)
    cfg_id = (
        getattr(diffusion_cfg, "mask_token_id", None)
        if diffusion_cfg is not None
        else getattr(cfg, "diffusion_mask_token_id", None)
    )
    if isinstance(cfg_id, int) and cfg_id >= 0:
        if vocab_size is None or cfg_id < vocab_size:
            return int(cfg_id)

    def _existing_special_token_id(token_str: str | None) -> int | None:
        """Attempt to resolve an existing special token string to a real ID."""
        if not token_str or not hasattr(tokenizer, "convert_tokens_to_ids"):
            return None
        try:
            token_id = tokenizer.convert_tokens_to_ids(token_str)
        except Exception:
            return None

        if not isinstance(token_id, int) or token_id < 0:
            return None

        # Ensure it's registered as special and not UNK, and within vocab
        unk_id = getattr(tokenizer, "unk_token_id", None)
        specials = set(getattr(tokenizer, "all_special_tokens", []) or [])
        addl = set(getattr(tokenizer, "additional_special_tokens", []) or [])
        is_special = token_str in specials or token_str in addl
        in_vocab = vocab_size is None or token_id < vocab_size
        if (
            (unk_id is not None and token_id == unk_id)
            or not is_special
            or not in_vocab
        ):
            return None
        return token_id

    # Try mask token string if provided
    token_str = (
        getattr(diffusion_cfg, "mask_token_str", None)
        if diffusion_cfg is not None
        else getattr(cfg, "diffusion_mask_token_str", None)
    )
    for candidate in (token_str, default_token):
        token_id = _existing_special_token_id(candidate)
        if isinstance(token_id, int):
            try:
                if diffusion_cfg is None:
                    cfg.diffusion_mask_token_id = int(token_id)  # legacy fallback
                else:
                    diffusion_cfg.mask_token_id = int(token_id)
            except Exception:
                pass
            return int(token_id)

    # Optionally add and return a dedicated special token during training
    if allow_add and hasattr(tokenizer, "add_special_tokens"):
        token_to_add = token_str or default_token
        try:
            tokenizer.add_special_tokens({"additional_special_tokens": [token_to_add]})

            # Resize embeddings if possible
            if (
                model is not None
                and hasattr(tokenizer, "__len__")
                and hasattr(model, "resize_token_embeddings")
            ):
                try:
                    model.resize_token_embeddings(len(tokenizer))
                except Exception:
                    pass
            new_id = tokenizer.convert_tokens_to_ids(token_to_add)
            if isinstance(new_id, int) and new_id >= 0:
                try:
                    if diffusion_cfg is None:
                        cfg.diffusion_mask_token_id = int(new_id)  # legacy fallback
                    else:
                        diffusion_cfg.mask_token_id = int(new_id)
                except Exception:
                    pass
                return int(new_id)
        except Exception:
            pass

    # Fallback to unk or 0 (do not update cfg)
    fallback = getattr(tokenizer, "unk_token_id", 0) or 0
    return int(fallback)


def create_bidirectional_attention_mask(
    input_ids: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    sample_packing: bool = False,
) -> torch.Tensor:
    """
    Create bidirectional attention mask to override default causal masking.
    Handles sample-packed sequences where different samples are identified
    by different attention mask values.

    Args:
        input_ids: Input token ids [batch_size, seq_len]
        attention_mask: Attention mask [batch_size, seq_len]
        sample_packing: Whether sample packing is enabled

    Returns:
        bidirectional_mask: 4D attention mask [batch_size, 1, seq_len, seq_len]
    """
    batch_size, seq_len = input_ids.shape
    device = input_ids.device

    if attention_mask is None or not sample_packing:
        return torch.ones(
            batch_size, 1, seq_len, seq_len, dtype=torch.bool, device=device
        )

    # Handle sample packing: tokens can only attend within their sample
    mask_i = attention_mask.unsqueeze(2)  # [batch_size, seq_len, 1]
    mask_j = attention_mask.unsqueeze(1)  # [batch_size, 1, seq_len]

    # Tokens can attend to each other if they have the same non-zero sample ID
    bidirectional_mask = (mask_i == mask_j) & (mask_i > 0)

    # Add head dimension: [batch_size, 1, seq_len, seq_len]
    return bidirectional_mask.unsqueeze(1)


def shift_logits_to_input_positions(logits: torch.Tensor) -> torch.Tensor:
    """Align next-token logits with their input token positions for diffusion."""
    if logits.size(1) <= 1:
        return logits
    return torch.cat([logits[:, :1], logits[:, :-1]], dim=1)


================================================
FILE: src/axolotl/integrations/grokfast/LICENSE
================================================
MIT License

Copyright (c) 2024 Jaerin Lee, Bong Gyun Kang, Kihoon Kim, Kyoung Mu Lee

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


================================================
FILE: src/axolotl/integrations/grokfast/README.md
================================================
# Grokfast Optimizer

See https://github.com/ironjr/grokfast

## Usage

```yaml
plugins:
  - axolotl.integrations.grokfast.GrokfastPlugin

grokfast_alpha: 2.0
grokfast_lamb: 0.98
```

## Citation

```bib
@article{lee2024grokfast,
    title={{Grokfast}: Accelerated Grokking by Amplifying Slow Gradients},
    author={Lee, Jaerin and Kang, Bong Gyun and Kim, Kihoon and Lee, Kyoung Mu},
    journal={arXiv preprint arXiv:2405.20233},
    year={2024}
}
```


================================================
FILE: src/axolotl/integrations/grokfast/__init__.py
================================================
"""
Grokfast plugin for Axolotl
"""

from transformers.trainer_callback import TrainerCallback

from axolotl.utils.logging import get_logger

from ..base import BasePlugin
from .args import GrokfastArgs as GrokfastArgs
from .optimizer import gradfilter_ema

LOG = get_logger(__name__)


class GrokfastCallbackHandler(TrainerCallback):
    """
    Transformer trainer callbacks for Grokfast
    """

    def __init__(self, *args_, alpha=0.98, lamb=2.0, **kwargs):
        super().__init__(*args_, **kwargs)
        self.grads = None
        self.alpha = alpha
        self.lamb = lamb

    def on_train_begin(self, *args_, **kwargs):
        self.grads = None

    def on_pre_optimizer_step(self, args_, state, control, **kwargs):
        model = kwargs.pop("model")
        self.grads = gradfilter_ema(model, self.grads, alpha=self.alpha, lamb=self.lamb)
        return control


class GrokfastPlugin(BasePlugin):
    """
    Plugin for Grokfast optimizer integraton with Axolotl.
    """

    def get_input_args(self):
        return "axolotl.integrations.grokfast.GrokfastArgs"

    def add_callbacks_post_trainer(self, cfg, trainer):
        LOG.info("Adding Grokfast callback to the trainer")
        callback = GrokfastCallbackHandler(
            alpha=cfg.grokfast_alpha, lamb=cfg.grokfast_lamb
        )
        return [callback]


================================================
FILE: src/axolotl/integrations/grokfast/args.py
================================================
"""
config args for grokfast plugin
"""

from typing import Optional

from pydantic import BaseModel


class GrokfastArgs(BaseModel):
    """
    Input args for Grokfast optimizer.
    """

    grokfast_alpha: Optional[float] = 0.98
    grokfast_lamb: Optional[float] = 2.0


================================================
FILE: src/axolotl/integrations/grokfast/optimizer.py
================================================
# Copyright: MIT License (c) 2024 Jaerin Lee, Bong Gyun Kang, Kihoon Kim, Kyoung Mu Lee
# Reference: https://github.com/ironjr/grokfast

from collections import deque
from typing import Dict, Literal, Optional

import torch
import torch.nn as nn


def gradfilter_ma(
    m: nn.Module,
    grads: Optional[Dict[str, deque]] = None,
    window_size: int = 100,
    lamb: float = 5.0,
    filter_type: Literal["mean", "sum"] = "mean",
    warmup: bool = True,
    trigger: bool = False,  # For ablation study.
) -> Dict[str, deque]:
    if grads is None:
        grads = {
            n: deque(maxlen=window_size)
            for n, p in m.named_parameters()
            if p.requires_grad and p.grad is not None
        }

    for n, p in m.named_parameters():
        if p.requires_grad and p.grad is not None:
            grads[n].append(p.grad.data.detach())  # .cpu())

            # Modify the gradients.
            if not warmup or len(grads[n]) == window_size and not trigger:
                if filter_type == "mean":
                    avg = sum(grads[n]) / len(grads[n])
                elif filter_type == "sum":
                    avg = sum(grads[n])
                else:
                    raise ValueError(f"Unrecognized filter_type {filter_type}")
                p.grad.data = p.grad.data + avg * lamb

    return grads


def gradfilter_ema(
    m: nn.Module,
    grads: Optional[Dict[str, torch.Tensor]] = None,
    alpha: float = 0.98,
    lamb: float = 2.0,
) -> Dict[str, torch.Tensor]:
    if grads is None:
        grads = {
            n: p.grad.data.detach()
            for n, p in m.named_parameters()
            if p.requires_grad and p.grad is not None
        }

    for n, p in m.named_parameters():
        if p.requires_grad and p.grad is not None:
            grads[n] = grads[n] * alpha + p.grad.data.detach() * (1 - alpha)
            p.grad.data = p.grad.data + grads[n] * lamb

    return grads


================================================
FILE: src/axolotl/integrations/kd/README.md
================================================
# Knowledge Distillation

## Usage

```yaml
plugins:
  - "axolotl.integrations.kd.KDPlugin"

kd_trainer: True
kd_ce_alpha: 0.1
kd_alpha: 0.9
kd_temperature: 1.0

torch_compile: True  # torch>=2.6.0, recommended to reduce vram

datasets:
  - path: ...
    type: "axolotl.integrations.kd.chat_template"
    field_messages: "messages_combined"
    logprobs_field: "llm_text_generation_vllm_logprobs"  # for kd only, field of logprobs
```

An example dataset can be found at [`axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample`](https://huggingface.co/datasets/axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample)


================================================
FILE: src/axolotl/integrations/kd/__init__.py
================================================
# Copyright 2024 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Plugin init to add KD support to Axolotl.
"""

from typing import Any

from transformers import Trainer

from axolotl.integrations.base import BasePlugin
from axolotl.integrations.kd.callbacks import KDTemperatureSchedulerCallback

from .args import KDArgs as KDArgs


class KDPlugin(BasePlugin):
    """
    Plugin for KD support in Axolotl.
    """

    def get_input_args(self):
        return "axolotl.integrations.kd.KDArgs"

    def get_training_args_mixin(self):
        return "axolotl.integrations.kd.args.KDTrainingArgsMixin"

    def get_trainer_cls(self, cfg):
        if cfg.kd_trainer:
            from .trainer import AxolotlKDTrainer

            return AxolotlKDTrainer
        return None

    def get_training_args(self, cfg):
        return {
            "kd_ce_alpha": cfg.kd_ce_alpha,
            "kd_alpha": cfg.kd_alpha,
            "kd_temperature": cfg.kd_temperature,
            "kd_beta": cfg.kd_beta,
            "kd_normalize_topk": cfg.kd_normalize_topk,
        }

    def get_collator_cls_and_kwargs(self, cfg, is_eval=False):
        if not cfg.kd_trainer:
            return None, None

        from .collator import DataCollatorForKD, KDBatchSamplerDataCollatorForSeq2Seq

        use_batch_sampler_collator = False
        if is_eval is False and cfg.sample_packing:
            use_batch_sampler_collator = True
        if cfg.eval_sample_packing and is_eval:
            use_batch_sampler_collator = True

        if cfg.kd_online_server_base_url:
            from .collator_online_teacher import OnlineTeacherCollator

            return OnlineTeacherCollator, {
                "kd_online_server_base_url": cfg.kd_online_server_base_url,
                "kd_online_topk": cfg.kd_online_topk,
                "kd_temperature": cfg.kd_temperature,
                "kd_online_server": cfg.kd_online_server,
                "kd_online_timeout": cfg.kd_online_timeout,
                "kd_normalize_topk": cfg.kd_normalize_topk,
            }

        if use_batch_sampler_collator:
            return KDBatchSamplerDataCollatorForSeq2Seq, {}
        return DataCollatorForKD, {}

    def pre_model_load(self, cfg):
        from .kernels.models import apply_kernel

        apply_kernel(cfg.model_config_type)

    def add_callbacks_post_trainer(self, cfg: Any, trainer: Trainer) -> list:
        """
        Adds temp scheduler callback to the Trainer instance.

        Args:
            cfg (Any): Configuration object containing the sparse recipe.
            trainer (Trainer): Huggingface Trainer instance.

        Returns:
            list: List containing the configured callback instances.
        """
        if cfg.kd_temperature_min is not None and cfg.kd_online_server_base_url:
            callback = KDTemperatureSchedulerCallback(
                cfg.kd_temperature,
                cfg.kd_temperature_min,
                trainer,
            )
            return [callback]

        return []


================================================
FILE: src/axolotl/integrations/kd/args.py
================================================
# Copyright 2024 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Plugin args for KD support.
"""

from dataclasses import dataclass
from enum import Enum

from pydantic import BaseModel, Field


class InferenceServerType(str, Enum):
    """
    Online inferences server types to handle different request args
    """

    vllm = "vllm"
    sglang = "sglang"


class KDArgs(BaseModel):
    """
    Input args for knowledge distillation.
    """

    kd_trainer: float | None = None  # whether to use KD trainer
    kd_ce_alpha: float | None = (
        None  # loss coefficient for cross-entropy loss during KD
    )
    kd_alpha: float | None = None  # loss coefficient for KD loss
    kd_temperature: float | None = None  # temperature for sampling during KD
    kd_beta: float | None = 0.0  # beta coefficient for ratio of fwd and reverse KL
    kd_normalize_topk: bool | None = (
        None  # whether to normalize student logits during KD
    )

    # TODO online kd
    kd_online_server_base_url: str | None = None
    kd_online_topk: int | None = None
    kd_online_server: InferenceServerType | None = Field(
        default_factory=lambda: InferenceServerType.vllm
    )
    kd_online_timeout: int | None = 120
    kd_temperature_min: float | None = (
        None  # kd temperature scheduling during online kd
    )


@dataclass
class KDTrainingArgsMixin:
    """
    Additional args for KD training.
    """

    kd_ce_alpha: float | None = (
        None  # loss coefficient for cross-entropy loss during KD
    )
    kd_alpha: float | None = None  # loss coefficient for KD loss
    kd_temperature: float | None = None  # temperature for sampling during KD
    kd_beta: float | None = None  # beta coefficient for ratio of fwd and reverse KL
    kd_normalize_topk: float | None = (
        None  # whether to normalize student logits during KD
    )


================================================
FILE: src/axolotl/integrations/kd/callbacks.py
================================================
"""
Transformers trainer callbacks to schedule the KD temperature during training
"""

import math

from transformers.trainer_callback import TrainerCallback


class KDTemperatureSchedulerCallback(TrainerCallback):
    """
    KD temperature scheduler callback for the trainer.
    """

    def __init__(self, temperature_start, temperature_min, trainer):
        self.temperature_start = temperature_start
        self.temperature_min = temperature_min
        self.temperature = temperature_start

        self.trainer = trainer

    def on_step_end(self, args, state, control, **kwargs):
        # cosine decay temperature over the max steps

        progress = state.global_step / state.max_steps
        # Cosine decay factor: 0.5 * (1 + cos(pi * progress))
        # This factor goes from 1 (at progress=0) to 0 (at progress=1)
        decay_factor = 0.5 * (1.0 + math.cos(math.pi * progress))
        self.temperature = self.temperature_start - (
            (self.temperature_start - self.temperature_min) * (1.0 - decay_factor)
        )

        if hasattr(self.trainer.data_collator, "kd_temperature"):
            self.trainer.data_collator.kd_temperature = self.temperature


================================================
FILE: src/axolotl/integrations/kd/chat_template.py
================================================
# Copyright 2024 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Chat template prompt strategy loader with KD support
"""

import logging
from typing import Any, Dict

import torch

from axolotl.prompt_strategies.chat_template import ChatTemplateStrategy, StrategyLoader

LOG = logging.getLogger(__name__)


class ChatTemplateStrategyWithKD(ChatTemplateStrategy):
    """
    Handle fields for logprob KD
    """

    def __init__(
        self,
        prompter,
        tokenizer,
        train_on_inputs,
        sequence_len,
        roles_to_train=None,
        train_on_eos=None,
        train_on_eot=None,
        eot_tokens=None,
        split_thinking: bool | None = False,
        logprobs_field="logprobs",
        gen_temperature=1.0,
        kd_temperature=1.0,
    ):
        self.logprobs_field = logprobs_field
        self.gen_temperature = gen_temperature
        self.kd_temperature = kd_temperature

        super().__init__(
            prompter,
            tokenizer,
            train_on_inputs,
            sequence_len,
            roles_to_train=roles_to_train,
            train_on_eos=train_on_eos,
            train_on_eot=train_on_eot,
            eot_tokens=eot_tokens,
            split_thinking=split_thinking,
        )

    @property
    def supports_batched(self) -> bool:
        # batching doesn't work well for logprob data
        return False

    def transform_logprobs(self, sample):
        """
        Transform logprobs to target format for KD training
        """

        logprobs = sample.pop(self.logprobs_field)
        target_seq_len = len(logprobs)
        input_seq_len = len(sample["input_ids"])
        input_padding_len = input_seq_len - target_seq_len
        # get non-zero top-k (prune None logprobs from vllm data step)
        top_k_vals = [
            len(logprobs[i])
            for i in range(len(logprobs))
            if logprobs[i] is not None and len(logprobs[i])
        ]
        max_top_k = max(set(top_k_vals), key=top_k_vals.count)
        min_top_k = min(set(top_k_vals), key=top_k_vals.count)
        top_k = min(max_top_k, min_top_k)
        if top_k == 0:
            raise ValueError("No non-zero top-k logprobs found.")

        target_logprobs = []
        target_token_ids = []
        target_mask = []

        if input_padding_len < 0:
            # logprobs is longer than target_seq_len,
            # so we need to slice from the left/beginning of logprobs
            logprobs = logprobs[:-input_seq_len]
            input_padding_len = 0
            # target_seq_len = input_seq_len

        # truncate the second dimension of the logprobs to top_k
        logprobs = [row[:top_k] for row in logprobs]

        # fill with -inf for padding_len tokens for top_k tokens
        # extend target_logprobs with a padding_len x top_k 2D list filled with -inf

        # we shift for causal models in the trainer, so start the range from 0
        for _ in range(0, input_padding_len):
            target_logprobs.append([-float("inf")] * top_k)
            target_token_ids.append(list(range(top_k)))
            target_mask.append([0] * top_k)

        for position in range(input_padding_len, input_seq_len):
            if sample["labels"][position] == -100:
                target_mask.append([0] * top_k)
            else:
                target_mask.append([1] * top_k)

        for _, token_pos_logprobs in enumerate(logprobs):
            # Initialize collections for logprobs and token_ids
            position_logprobs = []
            position_token_ids = []

            # Process each token probability entry
            for entry in token_pos_logprobs:
                # Extract logprob value
                logprob = entry["logprob"]

                # Parse token_id from the "token_id:###" format
                token_id = int(entry["token"].split(":")[1])

                # Append to our collections
                position_logprobs.append(logprob)
                position_token_ids.append(token_id)

            # Convert to a tensor for easier manipulation
            position_logprobs_tensor = torch.tensor(
                position_logprobs, dtype=torch.float
            )

            # Now we have distribution at T1 in log form, i.e. log p_{T1}(k).
            # Next, re-scale to T2 = self.kd_temperature via exponent-based trick
            # p_{T2}(k) = [p_{T1}(k)]^(T1 / T2) / Z
            #
            # Convert from log to probability
            teacher_probs_t1 = position_logprobs_tensor.exp()
            # normalize probabilities to sum to 1 in case they aren't already
            teacher_probs_t1_sum = teacher_probs_t1.sum(dim=0, keepdim=True)
            if teacher_probs_t1_sum > 1e-9:
                teacher_probs_t1 = teacher_probs_t1 / teacher_probs_t1_sum
            if self.kd_temperature != self.gen_temperature:
                # Exponentiate by factor (T1 / T2)
                exponent = self.gen_temperature / self.kd_temperature
                teacher_probs_t2 = teacher_probs_t1**exponent
            else:
                teacher_probs_t2 = teacher_probs_t1
            # Re-normalize
            teacher_probs_t2 = teacher_probs_t2 / teacher_probs_t2.sum(
                dim=0, keepdim=True
            )
            # Convert back to log
            position_logprobs_tensor = torch.log(teacher_probs_t2)

            # Now we have log p_{teacher, T2}(k) stored in position_logprobs_tensor
            position_logprobs_scaled = position_logprobs_tensor.tolist()

            target_logprobs.append(position_logprobs_scaled)
            target_token_ids.append(position_token_ids)

        # Update sample with transformed logprobs
        sample["target_logprobs"] = target_logprobs
        sample["target_token_ids"] = target_token_ids
        sample["target_mask"] = target_mask

        return sample

    def _tokenize_single_prompt(self, prompt):
        logprobs = prompt.pop(self.logprobs_field)
        tokenized_prompt = super()._tokenize_single_prompt(prompt)
        tokenized_prompt[self.logprobs_field] = logprobs

        # let subclasses add fields before transform
        tokenized_prompt = self._prepare_kd_fields(tokenized_prompt, prompt)

        tokenized_prompt = self.transform_logprobs(tokenized_prompt)
        return tokenized_prompt

    def _prepare_kd_fields(self, tokenized_prompt, original_prompt):
        """
        Hook for subclasses to prepare additional KD fields before transform
        """
        return tokenized_prompt


class ChatTemplateStrategyWithKDv2(ChatTemplateStrategyWithKD):
    """
    Strat for datasets with complete structured KD logprob data
    """

    def transform_logprobs(self, sample):
        """
        Transform logprobs to target format for KD training
        """

        logprobs = sample.pop(self.logprobs_field)
        target_seq_len = len(logprobs)
        input_seq_len = len(sample["input_ids"])
        input_padding_len = input_seq_len - target_seq_len
        # get non-zero top-k (prune None logprobs from vllm data step)
        top_k_vals = [
            len(logprobs[i])
            for i in range(len(logprobs))
            if logprobs[i] is not None and len(logprobs[i])
        ]
        max_top_k = max(set(top_k_vals), key=top_k_vals.count)
        min_top_k = min(set(top_k_vals), key=top_k_vals.count)
        top_k = min(max_top_k, min_top_k)
        if top_k == 0:
            raise ValueError("No non-zero top-k logprobs found.")

        target_logprobs = []
        target_token_ids = []
        target_mask = []

        if input_padding_len < 0:
            # logprobs is longer than target_seq_len,
            # so we need to slice from the left/beginning of logprobs
            logprobs = logprobs[:-input_seq_len]
            input_padding_len = 0
            # target_seq_len = input_seq_len

        # truncate the second dimension of the logprobs to top_k
        logprobs = [row[:top_k] for row in logprobs]

        # fill with -inf for padding_len tokens for top_k tokens
        # extend target_logprobs with a padding_len x top_k 2D list filled with -inf

        # we shift for causal models in the trainer, so start the range from 0
        for _ in range(0, input_padding_len):
            target_logprobs.append([-float("inf")] * top_k)
            target_token_ids.append(list(range(top_k)))
            target_mask.append([0] * top_k)

        for position in range(input_padding_len, input_seq_len):
            if sample["labels"][position] == -100:
                target_mask.append([0] * top_k)
            else:
                target_mask.append([1] * top_k)

        for token_pos_logprobs, pos_target_token_ids in zip(
            logprobs, sample["target_token_ids"], strict=False
        ):
            # Convert to a tensor for easier manipulation
            position_logprobs_tensor = torch.tensor(
                token_pos_logprobs, dtype=torch.float
            )

            # Now we have distribution at T1 in log form, i.e. log p_{T1}(k).
            # Next, re-scale to T2 = self.kd_temperature via exponent-based trick
            # p_{T2}(k) = [p_{T1}(k)]^(T1 / T2) / Z
            #
            # Convert from log to probability
            teacher_probs_t1 = position_logprobs_tensor.exp()
            # normalize probabilities to sum to 1 in case they aren't already
            teacher_probs_t1_sum = teacher_probs_t1.sum(dim=0, keepdim=True)
            if teacher_probs_t1_sum > 1e-9:
                teacher_probs_t1 = teacher_probs_t1 / teacher_probs_t1_sum
            if self.kd_temperature != self.gen_temperature:
                # Exponentiate by factor (T1 / T2)
                exponent = self.gen_temperature / self.kd_temperature
                teacher_probs_t2 = teacher_probs_t1**exponent
            else:
                teacher_probs_t2 = teacher_probs_t1
            # Re-normalize
            teacher_probs_t2 = teacher_probs_t2 / teacher_probs_t2.sum(
                dim=0, keepdim=True
            )
            # Convert back to log
            position_logprobs_tensor = torch.log(teacher_probs_t2)

            # Now we have log p_{teacher, T2}(k) stored in position_logprobs_tensor
            position_logprobs_scaled = position_logprobs_tensor.tolist()

            target_logprobs.append(position_logprobs_scaled)
            target_token_ids.append(pos_target_token_ids)

        # Update sample with transformed logprobs
        sample["target_logprobs"] = target_logprobs
        sample["target_token_ids"] = target_token_ids
        sample["target_mask"] = target_mask

        return sample

    def _prepare_kd_fields(self, tokenized_prompt, original_prompt):
        """
        Add pre-tokenized target_token_ids for v2 format
        """
        target_token_ids = original_prompt.pop("target_token_ids", None)
        if target_token_ids is not None:
            tokenized_prompt["target_token_ids"] = target_token_ids
        return tokenized_prompt


class KDStrategyLoader(StrategyLoader):
    """
    Load ChatTemplateStrategy with KD support using StrategyLoader.
    """

    def _get_strategy_cls(self, cfg):
        return ChatTemplateStrategyWithKD

    def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]):
        strategy_params = super()._get_strategy_params(cfg, ds_cfg)
        if logprobs_field := ds_cfg.get("logprobs_field"):
            strategy_params["logprobs_field"] = logprobs_field
        if gen_temperature := ds_cfg.get("temperature"):
            strategy_params["gen_temperature"] = gen_temperature
        if kd_temperature := cfg.get("kd_temperature"):
            strategy_params["kd_temperature"] = kd_temperature

        return strategy_params


class KDStrategyLoaderV2(KDStrategyLoader):
    """
    Load KD chat template datasets with pre-tokenized logprob data
    """

    def _get_strategy_cls(self, cfg):
        return ChatTemplateStrategyWithKDv2


load_legacy = KDStrategyLoader()
load = KDStrategyLoaderV2()


================================================
FILE: src/axolotl/integrations/kd/collator.py
================================================
# Copyright 2024 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
DataCollator for axolotl to handle KD fields without using -inf for padding,
and with a teacher_mask to identify padded positions.
"""

from dataclasses import dataclass
from typing import Any, Optional, Union

import numpy as np
import torch
from transformers import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy

from axolotl.utils.collators.batching import DataCollatorForSeq2Seq


@dataclass
class DataCollatorForKD(DataCollatorForSeq2Seq):
    """
    Data collator for KD, including handling KD-specific fields.

    This version avoids using -inf and instead uses a large negative value for padding
    target_logprobs. It also creates a teacher_mask to indicate which entries are valid.
    """

    tokenizer: PreTrainedTokenizerBase
    model: Optional[Any] = None
    padding: Union[bool, str, PaddingStrategy] = True
    max_length: Optional[int] = None
    pad_to_multiple_of: Optional[int] = None
    label_pad_token_id: int = -100
    position_pad_token_id: int = 0
    return_tensors: str = "pt"

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True

    def __call__(self, features, return_tensors=None):
        if return_tensors is None:
            return_tensors = self.return_tensors

        padding_side = self.tokenizer.padding_side
        max_len = 0

        # Pad labels and position_ids first
        for feature_name, pad_token_id in [
            ("labels", self.label_pad_token_id),
            ("position_ids", self.position_pad_token_id),
        ]:
            if feature_name in features[0]:
                feat = [f[feature_name] for f in features]
                max_len = max(len(x) for x in feat)
                if self.pad_to_multiple_of is not None:
                    max_len = (
                        (max_len + self.pad_to_multiple_of - 1)
                        // self.pad_to_multiple_of
                    ) * self.pad_to_multiple_of

                for f in features:
                    remainder = [pad_token_id] * (max_len - len(f[feature_name]))
                    if isinstance(f[feature_name], list):
                        f[feature_name] = (
                            f[feature_name] + remainder
                            if padding_side == "right"
                            else remainder + f[feature_name]
                        )
                    else:
                        # If they are numpy arrays
                        if padding_side == "right":
                            f[feature_name] = np.concatenate(
                                [f[feature_name], remainder]
                            ).astype(np.int64)
                        else:
                            f[feature_name] = np.concatenate(
                                [remainder, f[feature_name]]
                            ).astype(np.int64)

        # Handle target_logprobs and target_token_ids manually
        target_logprobs_list = []
        target_token_ids_list = []
        target_mask_list = []
        has_teacher_data = ("target_logprobs" in features[0]) and (
            "target_token_ids" in features[0]
        )

        if has_teacher_data:
            # Extract and remove from features
            for f in features:
                target_logprobs_list.append(f.pop("target_logprobs"))
                target_token_ids_list.append(f.pop("target_token_ids"))
                target_mask_list.append(f.pop("target_mask"))

            # Determine max lengths
            max_teacher_seq_len = max_len or max(
                len(seq) for seq in target_logprobs_list
            )
            max_k = max(len(seq_k) for seq in target_logprobs_list for seq_k in seq)

            padded_target_logprobs = []
            padded_target_token_ids = []
            padded_teacher_mask_list = []

            for t_logprobs, t_ids, t_mask in zip(
                target_logprobs_list,
                target_token_ids_list,
                target_mask_list,
                strict=False,
            ):
                t_logprobs_padded = []
                t_ids_padded = []
                t_mask_padded = []

                for lp, ids, mask in zip(t_logprobs, t_ids, t_mask, strict=False):
                    lp_len = len(lp)
                    if lp_len < max_k:
                        # Use -1e9 for padding logprobs and 0 for token_ids
                        pad_len = max_k - lp_len
                        lp = lp + [-1e9] * pad_len
                        ids = ids + [0] * pad_len
                        mask = mask + [0] * pad_len
                    else:
                        lp = lp[:max_k]
                        ids = ids[:max_k]
                        mask = mask[:max_k]

                    t_logprobs_padded.append(lp)
                    t_ids_padded.append(ids)
                    t_mask_padded.append(mask)

                seq_len_diff = max_teacher_seq_len - len(t_logprobs_padded)
                if seq_len_diff > 0:
                    # Pad sequences fully if needed
                    t_logprobs_padded.extend(
                        [[-1e9] * max_k for _ in range(seq_len_diff)]
                    )
                    t_ids_padded.extend([[0] * max_k for _ in range(seq_len_diff)])
                    t_mask_padded.extend([[0] * max_k for _ in range(seq_len_diff)])

                padded_target_logprobs.append(t_logprobs_padded)
                padded_target_token_ids.append(t_ids_padded)
                padded_teacher_mask_list.append(t_mask_padded)

            # Convert to tensors
            padded_target_logprobs = torch.tensor(
                padded_target_logprobs, dtype=torch.float
            )
            padded_target_token_ids = torch.tensor(
                padded_target_token_ids, dtype=torch.long
            )
            padded_teacher_mask_list = torch.tensor(
                padded_teacher_mask_list, dtype=torch.int
            )

        # Pad using tokenizer for regular fields
        features = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=return_tensors,
        )

        # Add back teacher data if present
        if has_teacher_data:
            features["target_logprobs"] = padded_target_logprobs
            features["target_token_ids"] = padded_target_token_ids
            features["target_mask"] = padded_teacher_mask_list

        # Prepare decoder_input_ids if the model supports it
        if (
            "labels" in features
            and self.model is not None
            and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
        ):
            decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(
                labels=features["labels"]
            )
            features["decoder_input_ids"] = decoder_input_ids

        return features


class KDBatchSamplerDataCollatorForSeq2Seq(DataCollatorForKD):
    """
    Collator for multipack (batch of sub-batches) specifically for KD.
    Adapts DataCollatorForKD so it can pack multiple sequences in a single batch item.
    """

    def __call__(self, features, return_tensors=None):
        """
        Expects that `features` could be either:
          - a single list of dicts, OR
          - a list of lists of dicts (the "sub-batches" to be packed).
        """
        # 1) If we are *not* dealing with multiple sequences per batch element,
        #    just pass straight to parent.
        if not isinstance(features[0], list):
            return super().__call__(features, return_tensors=return_tensors)

        # 2) Otherwise, we *are* dealing with multiple sequences in each batch item.
        #    We want to produce a single "merged" feature dict for each sub-batch.
        out_features = [{} for _ in features]

        for i, sub_features in enumerate(features):
            # sub_features is a list of dicts, each dict = one sequence’s features
            # We'll merge them into out_features[i].
            #
            # NOTE: You can customize how you combine fields as needed (e.g. summation
            # or offset for attention_mask). Below is a straightforward concatenation/extension.

            for field_name in sub_features[0].keys():
                # Some fields you might want to skip or treat specially:
                if field_name == "length":
                    continue

                # If it’s a KD field that’s a list-of-lists (e.g. target_logprobs),
                # you typically just want to flatten them by extending.
                if field_name in ["target_logprobs", "target_token_ids", "target_mask"]:
                    combined = []
                    for feat in sub_features:
                        combined.extend(feat[field_name])
                    out_features[i][field_name] = combined

                elif field_name == "attention_mask":
                    # Here we apply the (j+1) factor to differentiate each sub-sample
                    # within this merged batch item.
                    arrays = []
                    for j, feat in enumerate(sub_features):
                        if field_name in feat:
                            arrays.append((j + 1) * np.array(feat[field_name]))
                    out_features[i][field_name] = np.concatenate(arrays)
                else:
                    # By default, just concatenate them if they are arrays
                    # or extend them if they are lists.
                    # For example, input_ids or labels are often arrays.
                    arrays = []
                    for feat in sub_features:
                        if field_name in feat and isinstance(
                            feat[field_name], (list, torch.Tensor)
                        ):
                            if isinstance(feat[field_name][0], (dict, str)):
                                continue
                            arr = np.array(feat[field_name])
                            arrays.append(arr)
                    if arrays:
                        out_features[i][field_name] = np.concatenate(arrays)

        # 3) Now call the parent collator, which will do:
        #    - padding of labels/position_ids
        #    - KD-specific padding for target_logprobs, target_token_ids, etc.
        #    - final conversion to return_tensors
        return super().__call__(out_features, return_tensors=return_tensors)


================================================
FILE: src/axolotl/integrations/kd/collator_online_teacher.py
================================================
"""
Packed data loader for online teacher training supporting vllm and sglang.
"""

import hashlib
import hmac
import logging
from typing import Any, Dict, List, Optional

import requests
import torch
from orjson import orjson

from axolotl.integrations.kd.collator import KDBatchSamplerDataCollatorForSeq2Seq
from axolotl.integrations.kd.utils import normalize_logprobs
from axolotl.utils.data.utils import retry_on_request_exceptions

LOG = logging.getLogger(__name__)


def hmac_sha_from_int_list(int_list, key, hash_func=hashlib.sha256):
    """
    Create HMAC-SHA hash from a list of integers

    Args:
        int_list: List of integers
        key: Secret key (string or bytes)
        hash_func: Hash function (default: sha256)

    Returns:
        HMAC digest as hex string
    """
    # Convert key to bytes if it's a string
    if isinstance(key, str):
        key = key.encode("utf-8")

    # Convert list of ints to bytes
    # Method 1: Convert each int to bytes and concatenate
    data = b"".join(i.to_bytes(4, byteorder="big") for i in int_list)

    # Create HMAC
    h = hmac.new(key, data, hash_func)
    return h.hexdigest()


class OnlineTeacherCollator(KDBatchSamplerDataCollatorForSeq2Seq):
    """
    Collator for online teacher training.
    """

    DEFAULT_LABEL_PAD_TOKEN_ID: int = -100

    def __init__(
        self,
        *args: Any,
        kd_online_server_base_url: Optional[str] = None,
        kd_online_topk: Optional[int] = None,
        kd_temperature: Optional[float] = 1.0,
        kd_online_server: Optional[str] = "vllm",
        kd_online_timeout: Optional[int] = 120,
        kd_cache_dir: Optional[str] = None,
        kd_normalize_topk: Optional[bool] = True,
        **kwargs: Any,
    ):
        super().__init__(*args, **kwargs)

        if kd_online_server_base_url is None:
            raise ValueError(
                "kd_online_server_base_url must be provided for OnlineTeacherDataloader"
            )
        if kd_online_topk is None or kd_online_topk <= 0:
            raise ValueError(
                "kd_online_topk must be a positive integer for OnlineTeacherDataloader"
            )

        self.kd_online_server_base_url = kd_online_server_base_url.rstrip("/")
        self.kd_online_topk = kd_online_topk
        self.kd_temperature = kd_temperature
        self.kd_online_server = kd_online_server
        self.http_session = requests.Session()
        self.kd_online_timeout = kd_online_timeout
        self.kd_cache_dir = kd_cache_dir
        self.kd_normalize_topk = kd_normalize_topk

    def _normalize_logprobs(self, raw_logprobs: List[float]) -> List[float]:
        """
        Re-normalizes top-k raw logprobs as probabilities, and converts back to logprobs.
        """
        if not raw_logprobs or self.kd_online_topk == 0:
            return (
                [-float("inf")] * self.kd_online_topk if self.kd_online_topk > 0 else []
            )

        raw_logprobs_tensor = torch.tensor(raw_logprobs, dtype=torch.float32)
        return normalize_logprobs(raw_logprobs_tensor, self.kd_online_topk).tolist()

    @retry_on_request_exceptions(max_retries=10, delay=5)
    def fetch_online_logprobs_sglang(
        self, batch_input_ids: List[List[int]], labels: List[List[int]]
    ):
        """
        Fetches logprobs from an online teacher served by sglang for a batch of input_ids.
        Assumes API returns token IDs as strings in logprob dictionary keys.
        """
        api_endpoint = f"{self.kd_online_server_base_url}/generate"

        payload = {
            "input_ids": batch_input_ids,
            "return_logprob": True,
            "top_logprobs_num": self.kd_online_topk,
            "logprob_start_len": 0,
            "return_text_in_logprobs": True,
            "echo": True,
            "sampling_params": {
                "max_new_tokens": 0,
                "temperature": self.kd_temperature,
                "skip_special_tokens": False,
            },
        }

        # Initialize with empty lists, so if API call fails, these are returned.
        ret_data_target_token_ids: List[List[List[int]]] = []
        ret_data_target_logprobs: List[List[List[float]]] = []
        ret_data_target_mask: List[List[List[int]]] = []

        try:
            response = self.http_session.post(
                api_endpoint, json=payload, timeout=self.kd_online_timeout
            )
            response.raise_for_status()
            api_data: list[dict] = response.json()

            # Ensure api_data is a list, and its length matches batch_input_ids
            if not isinstance(api_data, list) or len(api_data) != len(batch_input_ids):
                LOG.error(
                    f"API response format error. Expected a list of {len(batch_input_ids)} "
                    f"items, got {type(api_data)} with length {len(api_data) if isinstance(api_data, list) else 'N/A'}."
                )
                # Return empty data; items processed later will get default empty KD fields
                return {
                    "target_token_ids": ret_data_target_token_ids,
                    "target_logprobs": ret_data_target_logprobs,
                    "target_mask": ret_data_target_mask,
                }

            for sequence_data, seq_input_ids, seq_labels in zip(
                api_data, batch_input_ids, labels, strict=False
            ):
                current_target_logprobs = []
                current_target_token_ids = []
                current_target_mask = []

                meta_info = sequence_data.pop("meta_info", {})
                # Ensure input_top_logprobs is a list
                input_top_logprobs: Optional[list[None | list[tuple]]] = meta_info.pop(
                    "input_top_logprobs", []
                )
                if not isinstance(input_top_logprobs, list):
                    LOG.warning(
                        f"Received non-list input_top_logprobs: {input_top_logprobs}. Skipping sequence."
                    )
                    input_top_logprobs = []  # Treat as empty

                # basic check that the logprob data len matches the input len, so no need to handle padding
                assert len(seq_input_ids) == len(input_top_logprobs)

                for i, _, label in zip(
                    range(len(seq_input_ids)), seq_input_ids, seq_labels, strict=False
                ):
                    if i < len(input_top_logprobs) and input_top_logprobs[i] is None:
                        # this is always the case for the first token.
                        # there is never logprob data for the first token since that's a true input
                        # so we replace the None value with padding data
                        current_target_logprobs.append(
                            [-float("inf")] * self.kd_online_topk
                        )
                        current_target_token_ids.append([0] * self.kd_online_topk)
                        current_target_mask.append([0] * self.kd_online_topk)
                    elif (
                        i < len(input_top_logprobs)
                        and input_top_logprobs[i] is not None
                    ):
                        pos_top_logprobs_data = input_top_logprobs[i]
                        # Ensure pos_top_logprobs_data is a list of lists as expected
                        if not (
                            isinstance(pos_top_logprobs_data, list)
                            and all(
                                isinstance(item, list) for item in pos_top_logprobs_data
                            )
                            and len(pos_top_logprobs_data) > 0
                            and len(pos_top_logprobs_data[0]) == 3
                        ):  # [logprob, token_id, token_str]
                            LOG.warning(
                                f"Malformed pos_top_logprobs_data: {pos_top_logprobs_data}. Padding this position."
                            )
                            current_target_logprobs.append(
                                [-float("inf")] * self.kd_online_topk
                            )
                            current_target_token_ids.append([0] * self.kd_online_topk)
                            current_target_mask.append([0] * self.kd_online_topk)
                            continue

                        # pos_top_logprobs: list of logprobs, pos_token_ids: list of token_ids
                        pos_logprobs_raw, pos_token_ids, _ = [
                            list(row)
                            for row in zip(*pos_top_logprobs_data, strict=False)
                        ]

                        # Ensure correct length (top_k)
                        if len(pos_logprobs_raw) < self.kd_online_topk:
                            pad_len = self.kd_online_topk - len(pos_logprobs_raw)
                            pos_logprobs_raw.extend([-float("inf")] * pad_len)
                            pos_token_ids.extend([0] * pad_len)  # Pad with 0 token_id

                        # truncate to top_k in case the response was longer
                        current_target_token_ids.append(
                            pos_token_ids[: self.kd_online_topk]
                        )

                        if self.kd_normalize_topk:
                            normalized_logprobs_for_position = self._normalize_logprobs(
                                pos_logprobs_raw[: self.kd_online_topk]
                            )
                            current_target_logprobs.append(
                                normalized_logprobs_for_position
                            )
                        else:
                            current_target_logprobs.append(
                                pos_logprobs_raw[: self.kd_online_topk]
                            )

                        # Mask depends on the corresponding label for the student
                        if label == self.DEFAULT_LABEL_PAD_TOKEN_ID:
                            current_target_mask.append([0] * self.kd_online_topk)
                        else:
                            current_target_mask.append([1] * self.kd_online_topk)
                    else:
                        # Pad if no logprobs for this position (either due to length mismatch or None entry)
                        current_target_logprobs.append(
                            [-float("inf")] * self.kd_online_topk
                        )
                        current_target_token_ids.append([0] * self.kd_online_topk)
                        current_target_mask.append([0] * self.kd_online_topk)

                ret_data_target_token_ids.append(current_target_token_ids)
                ret_data_target_logprobs.append(current_target_logprobs)
                ret_data_target_mask.append(current_target_mask)

        except requests.exceptions.RequestException as e:
            LOG.error(f"Error fetching logprobs from online teacher: {e}")
            raise e
            # ret_logprobs_data will be returned with empty lists, handled by the caller.
        except Exception as e:  # Catch other potential errors during processing
            LOG.error(
                f"Unexpected error processing API response in fetch_online_logprobs: {e}",
                exc_info=True,
            )
            raise e

        return {
            "target_token_ids": ret_data_target_token_ids,
            "target_logprobs": ret_data_target_logprobs,
            "target_mask": ret_data_target_mask,
        }

    @retry_on_request_exceptions(max_retries=10, delay=5)
    def fetch_online_logprobs_vllm(
        self, batch_input_ids: List[List[int]], labels: List[List[int]]
    ):
        """
        Fetches logprobs from an online teacher served by vllm for a batch of input_ids.
        Assumes API returns token IDs as strings in logprob dictionary keys.
        """
        api_endpoint = f"{self.kd_online_server_base_url}/v1/completions"

        payload = {
            "prompt": batch_input_ids,
            "echo": True,
            "logprobs": True,
            "prompt_logprobs": self.kd_online_topk,
            "top_logprobs": self.kd_online_topk,
            "max_new_tokens": 0,
            "skip_special_tokens": False,
            "temperature": self.kd_temperature,
            "sampling_params": {
                "max_tokens": 0,
            },
        }

        # Initialize with empty lists, so if API call fails, these are returned.
        ret_data_target_token_ids: List[List[List[int]]] = []
        ret_data_target_logprobs: List[List[List[float]]] = []
        ret_data_target_mask: List[List[List[int]]] = []

        try:
            headers = {"Accept-Encoding": "deflate, gzip, br, zstd"}
            response = self.http_session.post(
                api_endpoint,
                json=payload,
                headers=headers,
                timeout=self.kd_online_timeout,
            )
            response.raise_for_status()
            api_data: dict = orjson.loads(response.content)
            choices: list[dict] = api_data["choices"]

            # Ensure api_data is a list, and its length matches batch_input_ids
            if not isinstance(choices, list) or len(choices) != len(batch_input_ids):
                LOG.error(
                    f"API response format error. Expected a list of {len(batch_input_ids)} "
                    f"items, got {type(api_data)} with length {len(api_data) if isinstance(api_data, list) else 'N/A'}."
                )
                # Return empty data; items processed later will get default empty KD fields
                return {
                    "target_token_ids": ret_data_target_token_ids,
                    "target_logprobs": ret_data_target_logprobs,
                    "target_mask": ret_data_target_mask,
                }

            for sequence_data, seq_input_ids, seq_labels in zip(
                choices, batch_input_ids, labels, strict=False
            ):
                # seq_input_ids: List[int]
                # seq_labels: List[int]

                current_target_logprobs = []
                current_target_token_ids = []
                current_target_mask = []

                # Ensure input_top_logprobs is a list
                input_top_logprobs: Optional[list[None | dict[str, dict]]] = (
                    sequence_data.pop("prompt_logprobs", [])
                )

                if not isinstance(input_top_logprobs, list):
                    LOG.warning(
                        f"Received non-list input_top_logprobs: {input_top_logprobs}. Skipping sequence."
                    )
                    input_top_logprobs = []  # Treat as empty

                # basic check that the logprob data len matches the input len, so no need to handle padding
                assert len(seq_input_ids) == len(input_top_logprobs)

                seq_len = len(seq_input_ids)

                for i, _, label in zip(
                    range(seq_len), seq_input_ids, seq_labels, strict=False
                ):
                    if i < len(input_top_logprobs) and input_top_logprobs[i] is None:
                        # this is always the case for the first token.
                        # there is never logprob data for the first token since that's a true input
                        continue
                    if (
                        i < len(input_top_logprobs)
                        and input_top_logprobs[i] is not None
                    ):
                        pos_top_logprobs_data: dict[str, dict] = input_top_logprobs[i]  # type: ignore[assignment]
                        # Ensure pos_top_logprobs_data is a list of lists as expected
                        if not (
                            isinstance(pos_top_logprobs_data, dict)
                            and all(
                                isinstance(item, dict)
                                for item in pos_top_logprobs_data.values()
                            )
                            and len(pos_top_logprobs_data.keys()) > 0
                        ):  # [logprob, token_id, token_str]
                            LOG.warning(
                                f"Malformed pos_top_logprobs_data: {pos_top_logprobs_data}. Padding this position."
                            )
                            current_target_logprobs.append(
                                [-float("inf")] * self.kd_online_topk
                            )
                            current_target_token_ids.append(
                                list(range(self.kd_online_topk))
                            )
                            current_target_mask.append([0] * self.kd_online_topk)
                            continue

                        # pos_top_logprobs: list of logprobs, pos_token_ids: list of token_ids
                        pos_token_ids_str = list(pos_top_logprobs_data.keys())
                        pos_logprobs_dict = pos_top_logprobs_data.values()
                        pos_token_ids = [
                            int(token_id) for token_id in pos_token_ids_str
                        ]
                        pos_logprobs_raw = [
                            float(logprob.get("logprob", -float("inf")))
                            for logprob in pos_logprobs_dict
                        ]

                        # Ensure correct length (top_k)
                        if len(pos_logprobs_raw) < self.kd_online_topk:
                            pad_len = self.kd_online_topk - len(pos_logprobs_raw)
                            LOG.warning(
                                f"Padding position {i} with {pad_len} top-k tokens and logprobs."
                            )
                            pos_logprobs_raw.extend([-float("inf")] * pad_len)
                            pos_token_ids.extend([0] * pad_len)  # Pad with 0 token_id

                        # truncate to top_k in case the response was longer
                        current_target_token_ids.append(
                            pos_token_ids[: self.kd_online_topk]
                        )

                        if self.kd_normalize_topk:
                            normalized_logprobs_for_position = self._normalize_logprobs(
                                pos_logprobs_raw[: self.kd_online_topk]
                            )
                            current_target_logprobs.append(
                                normalized_logprobs_for_position
                            )
                        else:
                            current_target_logprobs.append(
                                pos_logprobs_raw[: self.kd_online_topk]
                            )

                        # Mask depends on the corresponding label for the student
                        if label == self.DEFAULT_LABEL_PAD_TOKEN_ID:
                            current_target_mask.append([0] * self.kd_online_topk)
                        else:
                            current_target_mask.append([1] * self.kd_online_topk)
                    else:
                        # Pad if no logprobs for this position (either due to length mismatch or None entry)
                        current_target_logprobs.append(
                            [-float("inf")] * self.kd_online_topk
                        )
                        current_target_token_ids.append(
                            list(range(self.kd_online_topk))
                        )
                        current_target_mask.append([0] * self.kd_online_topk)
                for _ in range(max(0, seq_len - len(current_target_logprobs))):
                    current_target_logprobs.append(
                        [-float("inf")] * self.kd_online_topk
                    )
                    current_target_token_ids.append(list(range(self.kd_online_topk)))
                    current_target_mask.append([0] * self.kd_online_topk)

                ret_data_target_token_ids.append(current_target_token_ids)
                ret_data_target_logprobs.append(current_target_logprobs)
                ret_data_target_mask.append(current_target_mask)

                # TODO save and load targets to disk for caching for next epoch
                # generate a hmac SHA256 hash over the list seq_input_ids and convert it to an int
                # if self.kd_cache_dir:
                #     hash_input_ids = hmac_sha_from_int_list(
                #         seq_input_ids, f"{self.kd_online_server_base_url}:{self.kd_online_topk}"
                #     )
                #     with open(f"{self.kd_cache_dir}/{hash_input_ids}.parquet", "wb") as f:
                #         pd.DataFrame(ret_logprobs_data).to_parquet(f, index=False)

        except requests.exceptions.RequestException as e:
            LOG.error(f"Error fetching logprobs from online teacher: {e}")
            raise e
            # ret_logprobs_data will be returned with empty lists, handled by the caller.
        except Exception as e:  # Catch other potential errors during processing
            LOG.error(
                f"Unexpected error processing API response in fetch_online_logprobs: {e}",
                exc_info=True,
            )
            raise e

        return {
            "target_token_ids": ret_data_target_token_ids,
            "target_logprobs": ret_data_target_logprobs,
            "target_mask": ret_data_target_mask,
        }

    def __call__(
        self, features: List[List[Dict[str, Any]]], return_tensors: Optional[str] = None
    ) -> Dict[str, Any]:
        if not features:
            return super().__call__(features, return_tensors=return_tensors)

        for (
            sub_batch_features
        ) in features:  # sub_batch_features is List[Dict[str, Any]]
            if not sub_batch_features:
                continue

            input_ids_for_api_call: List[List[int]] = []
            labels_for_api_call: List[List[int]] = []
            # Store references to the original item dictionaries to update them in-place
            items_for_api_call: List[Dict[str, Any]] = []

            for item_dict in sub_batch_features:
                if not isinstance(item_dict, dict):
                    LOG.warning(
                        f"Skipping non-dict item in sub_batch_features: {item_dict}"
                    )
                    continue

                current_input_ids = item_dict.get("input_ids")
                current_labels = item_dict.get("labels")

                if current_input_ids is not None and current_labels is not None:
                    # Ensure input_ids and labels are lists of ints for JSON serialization
                    input_ids_list = (
                        current_input_ids.tolist()
                        if hasattr(current_input_ids, "tolist")
                        else list(current_input_ids)
                    )
                    labels_list = (
                        current_labels.tolist()
                        if hasattr(current_labels, "tolist")
                        else list(current_labels)
                    )

                    input_ids_for_api_call.append(input_ids_list)
                    labels_for_api_call.append(labels_list)
                    items_for_api_call.append(item_dict)
                else:
                    # This item will not get teacher logprobs from the API.
                    # Initialize KD fields to empty lists so downstream collators handle them uniformly.
                    item_dict.setdefault("target_token_ids", [])
                    item_dict.setdefault("target_logprobs", [])
                    item_dict.setdefault("target_mask", [])

            # print(items_for_api_call)
            if items_for_api_call:  # Only call API if there's something to process
                if self.kd_online_server == "sglang":
                    api_responses_for_sub_batch = self.fetch_online_logprobs_sglang(
                        input_ids_for_api_call, labels_for_api_call
                    )
                else:
                    api_responses_for_sub_batch = self.fetch_online_logprobs_vllm(
                        input_ids_for_api_call, labels_for_api_call
                    )

                # api_responses_for_sub_batch has keys: "target_token_ids", "target_logprobs", "target_mask"
                # Each value is a list, corresponding to items_for_api_call
                for i, item_to_update in enumerate(items_for_api_call):
                    # TODO make sure to figure out which input in sub_batch_features to update the batch in the original `features` object so the super class can handle it properly.
                    if api_responses_for_sub_batch and i < len(
                        api_responses_for_sub_batch["target_token_ids"]
                    ):  # Check bounds
                        assert len(
                            api_responses_for_sub_batch["target_token_ids"][i]
                        ) == len(item_to_update["input_ids"])
                        assert len(
                            api_responses_for_sub_batch["target_logprobs"][i]
                        ) == len(item_to_update["input_ids"])
                        assert len(
                            api_responses_for_sub_batch["target_mask"][i]
                        ) == len(item_to_update["labels"])
                        item_to_update["target_token_ids"] = (
                            api_responses_for_sub_batch["target_token_ids"][i]
                        )
                        item_to_update["target_logprobs"] = api_responses_for_sub_batch[
                            "target_logprobs"
                        ][i]
                        item_to_update["target_mask"] = api_responses_for_sub_batch[
                            "target_mask"
                        ][i]
                    else:
                        # API call failed for this item, or response was shorter than expected.
                        # Ensure KD fields are initialized as empty lists.
                        LOG.warning(
                            f" (index {i}), or API response was too short. "
                            f"API response keys: {list(api_responses_for_sub_batch.keys()) if api_responses_for_sub_batch else 'None'}"
                        )
                        item_to_update.setdefault("target_token_ids", [])
                        item_to_update.setdefault("target_logprobs", [])
                        item_to_update.setdefault("target_mask", [])

        return super().__call__(features, return_tensors=return_tensors)


================================================
FILE: src/axolotl/integrations/kd/kernels/__init__.py
================================================
"""
Liger Chunked loss optimizations module
"""

from .liger import LigerFusedLinearKLTopKLogprobLoss
from .models import apply_kernel

__all__ = ["LigerFusedLinearKLTopKLogprobLoss", "apply_kernel"]


================================================
FILE: src/axolotl/integrations/kd/kernels/liger.py
================================================
"""
Liger Kernels for Chunked Top-K Log-Prob Distillation
"""

import torch
import torch.nn.functional as F
from liger_kernel.chunked_loss.fused_linear_distillation import (
    LigerFusedLinearDistillationBase,
)

from axolotl.integrations.kd.utils import normalize_logprobs


class LigerFusedLinearKLTopKLogprobFunction(LigerFusedLinearDistillationBase):
    """
    Chunked kl-div loss for top-k logprobs
    """

    @staticmethod
    def distillation_loss_fn(
        student_logits_temp_scaled: torch.Tensor,  # [chunk_size, vocab_size], already temp-scaled
        target_token_ids_chunk: torch.Tensor,  # [chunk_size, top_k]
        target_logprobs_chunk: torch.Tensor,  # [chunk_size, top_k], already temp-scaled and normalized logprobs
        target_mask_chunk: torch.Tensor,  # [chunk_size, top_k]
        beta: float = 0.0,
        normalize_topk: bool = True,
    ) -> torch.Tensor:
        """
        Compute Top-K KL divergence loss for a chunk.
        Args:
            student_logits_temp_scaled: Student logits, scaled by temperature. Shape: (N, V).
            target_token_ids_chunk: Top-k teacher token IDs. Shape: (N, K).
            target_logprobs_chunk: Top-k teacher log probabilities (temp-scaled, normalized). Shape: (N, K).
            target_mask_chunk: Mask for valid top-k tokens. Shape: (N, K).
            beta: Controls the type of KL divergence.
                  0.0 for Forward KL (P_teacher || P_student).
                  1.0 for Reverse KL (P_student || P_teacher).
                  0.5 for Symmetric KL (average of Forward and Reverse).
            normalize_topk: Whether to normalize the log probabilities
        Returns:
            Sum of KL divergence losses for the chunk.
        """
        topk = target_token_ids_chunk.shape[-1]
        student_logits_temp_scaled = (  # [chunk_size, vocab_size]
            student_logits_temp_scaled.float()
        )
        target_logprobs_chunk = target_logprobs_chunk.float()

        # Gather student logits for the top-k teacher token IDs
        # target_token_ids_chunk: [chunk_size, top_k]
        # student_logits_topk_temp_scaled: [chunk_size, top_k]
        student_logits_topk_temp_scaled = torch.gather(
            student_logits_temp_scaled, dim=-1, index=target_token_ids_chunk
        )

        # Student log-probabilities for the gathered top-k tokens
        student_lse = torch.logsumexp(
            student_logits_temp_scaled, dim=-1, keepdim=True
        )  # [chunk_size, 1]
        student_logprobs_topk_temp_scaled = (
            student_logits_topk_temp_scaled - student_lse
        )

        # we have the top-k student logprobs, normalize them
        if normalize_topk:
            student_logprobs_topk_temp_scaled = normalize_logprobs(
                student_logprobs_topk_temp_scaled, topk
            )

        valid_mask = target_mask_chunk.to(torch.bool)  # [chunk_size, top_k]

        student_logprobs_topk_valid = student_logprobs_topk_temp_scaled[valid_mask]
        teacher_logprobs_valid = target_logprobs_chunk[valid_mask]

        # Teacher probabilities P(y|x_teacher) from logprobs
        # target_logprobs_valid are already normalized (log(softmax(teacher_logits/T)))
        teacher_probs_valid = teacher_logprobs_valid.exp()
        # Student probabilities P_student from log P_student
        student_probs_topk_valid = student_logprobs_topk_valid.exp()

        # kd_loss_per_token = torch.zeros_like(target_logprobs_valid)

        # KL divergence: sum(P_teacher * (log P_teacher - log P_student))
        # = sum(P_teacher * log P_teacher) - sum(P_teacher * log P_student)
        # The distillation loss is often formulated as -sum(P_teacher * log P_student)
        # or as sum(P_teacher * (log_softmax_teacher - log_softmax_student))
        # Here, target_logprobs_valid are log_softmax_teacher.
        # student_logprobs_topk_valid are log_softmax_student (for the selected K indices).
        if beta == 0.0:  # Contribution from Forward KL
            fwd_kl_per_token = teacher_probs_valid * (
                teacher_logprobs_valid - student_logprobs_topk_valid
            )
            kd_loss = fwd_kl_per_token.sum()
        elif beta == 1.0:  # Contribution from Reverse KL
            rev_kl_per_token = student_probs_topk_valid * (
                student_logprobs_topk_valid - teacher_logprobs_valid
            )
            kd_loss = rev_kl_per_token.sum()
        else:
            # JSD - Jensen-Shannon Divergence / Symmetric
            mean_probs = (
                1 - beta
            ) * student_probs_topk_valid + beta * teacher_probs_valid
            log_mean_probs = mean_probs.log()
            student_kl = F.kl_div(
                log_mean_probs,
                student_logprobs_topk_valid,
                reduction="sum",
                log_target=True,
            )
            teacher_kl = F.kl_div(
                log_mean_probs, teacher_logprobs_valid, reduction="sum", log_target=True
            )
            jsd_loss = beta * teacher_kl + (1 - beta) * student_kl
            kd_loss = jsd_loss

        return kd_loss

    @staticmethod
    def _compute_loss_kl_topk(
        student_input_chunk: torch.Tensor,
        student_weight: torch.Tensor,
        # Args for student_bias, target_token_ids_chunk etc. are passed to the lambda wrapped by grad_and_value
        # or through `partial`. Let's make them explicit here for clarity.
        target_token_ids_chunk: torch.Tensor,
        target_logprobs_chunk: torch.Tensor,
        target_mask_chunk: torch.Tensor,
        target_chunk: torch.Tensor,  # For hard loss (true labels)
        student_bias: torch.Tensor = None,  # This will be one of the grad targets
        # Other params passed via `partial` from `forward`
        distillation_loss_fn=None,
        ignore_index: int = -100,
        weight_hard_loss: float = 0.5,
        weight_soft_loss: float = 0.5,
        compute_ce_loss: bool = True,
        temperature: float = 1.0,
        beta: float = 0.0,
        normalize_topk: bool = True,
    ):
        # Compute student logits for the chunk from hidden states and LM head
        # student_input_chunk: [chunk_size, hidden_dim]
        # student_lm_head_weight: [vocab_size, hidden_dim]
        # student_logits_chunk: [chunk_size, vocab_size]
        student_logits_chunk = F.linear(
            student_input_chunk, student_weight, student_bias
        )

        ce_loss = torch.tensor(
            0.0, device=student_logits_chunk.device, dtype=student_logits_chunk.dtype
        )
        if compute_ce_loss and weight_hard_loss > 0.0:
            ce_loss = F.cross_entropy(
                student_logits_chunk.view(-1, student_logits_chunk.shape[-1]),
                target_chunk.view(-1),
                reduction="sum",
                ignore_index=ignore_index,
            )

        soft_loss = torch.tensor(
            0.0, device=student_logits_chunk.device, dtype=student_logits_chunk.dtype
        )
        if weight_soft_loss > 0.0:
            student_logits_chunk_temp_scaled = student_logits_chunk / temperature

            # Assuming student_weight.shape[0] (vocab_size) is adequate for target_token_ids_chunk.max()
            # No explicit padding here; user must ensure vocab alignment or pre-pad student_weight.

            soft_loss = distillation_loss_fn(
                student_logits_chunk_temp_scaled,
                target_token_ids_chunk,
                target_logprobs_chunk,
                target_mask_chunk,
                beta=beta,
                normalize_topk=normalize_topk,
            )

        return soft_loss, ce_loss

    @classmethod
    def forward(
        cls,
        ctx,
        student_input: torch.Tensor,  # [batch_size, seq_len, dim]
        student_lm_head_weight: torch.Tensor,  # [dim, vocab_size]
        target_token_ids: torch.Tensor,  # [batch_size, seq_len, top_k]
        target_logprobs: torch.Tensor,  # [batch_size, seq_len, top_k]
        target_mask: torch.Tensor,  # [batch_size, seq_len, top_k]
        true_labels: torch.Tensor,  # [batch_size, seq_len]
        student_lm_head_bias: torch.Tensor = None,
        weight_hard_loss: float = 0.5,
        weight_soft_loss: float = 0.5,
        ignore_index: int = -100,
        temperature: float = 1.0,
        beta: float = 0.0,
        compiled: bool = False,
        chunk_size: int = 1024,
        compute_ce_loss: bool = True,
        normalize_topk: bool = True,
    ):
        CHUNK_SIZE = chunk_size
        grad_weight_acc = torch.zeros_like(student_lm_head_weight)
        grad_inputs_list = []
        grad_bias_acc = (
            torch.zeros_like(student_lm_head_bias)
            if student_lm_head_bias is not None
            else None
        )
        kd_loss_acc = torch.zeros(
            (), device=student_input.device, dtype=student_input.dtype
        )
        ce_loss_acc = torch.zeros(
            (), device=student_input.device, dtype=student_input.dtype
        )

        # This function will be what torch.func.grad_and_value differentiates.
        # It takes student_input_chunk, student_weight (full), student_bias (full) as primals.
        # Other necessary data (target_*, etc.) are passed as non-differentiable arguments.
        def loss_fn_for_grad(
            _student_input_chunk,
            _student_lm_head_weight,  # full weight
            _student_lm_head_bias,  # full bias
            # Fixed arguments for a given chunk, not differentiated:
            _target_token_ids_chunk,
            _target_logprobs_chunk,
            _target_mask_chunk,
            _true_labels_chunk,
        ):
            return cls._compute_loss_kl_topk(
                student_input_chunk=_student_input_chunk,
                student_weight=_student_lm_head_weight,
                target_token_ids_chunk=_target_token_ids_chunk,
                target_logprobs_chunk=_target_logprobs_chunk,
                target_mask_chunk=_target_mask_chunk,
                target_chunk=_true_labels_chunk,
                student_bias=_student_lm_head_bias,
                distillation_loss_fn=cls.distillation_loss_fn,
                ignore_index=ignore_index,
                weight_hard_loss=weight_hard_loss,
                weight_soft_loss=weight_soft_loss,
                compute_ce_loss=compute_ce_loss,
                temperature=temperature,
                beta=beta,
                normalize_topk=normalize_topk,
            )

        def accumulate_chunk_grads(
            student_input_chunk_ac,
            target_token_ids_chunk_ac,
            target_logprobs_chunk_ac,
            target_mask_chunk_ac,
            true_labels_chunk_ac,
        ):
            # student_weight and student_bias are closed over from the outer scope (full tensors)
            if student_lm_head_bias is not None:
                (
                    (chunk_grad_input, chunk_grad_weight, chunk_grad_bias),
                    (chunk_kd_loss, chunk_ce_loss),
                ) = torch.func.grad_and_value(
                    loss_fn_for_grad, argnums=(0, 1, 2), has_aux=True
                )(
                    student_input_chunk_ac,
                    student_lm_head_weight,
                    student_lm_head_bias,  # primals
                    target_token_ids_chunk_ac,
                    target_logprobs_chunk_ac,
                    target_mask_chunk_ac,
                    true_labels_chunk_ac,
                )  # non-primals
                grad_bias_acc.add_(chunk_grad_bias)
            else:
                argnums_for_grad = (0, 1)  # Differentiate wrt input_chunk, weight
                (
                    (chunk_grad_input, chunk_grad_weight),  # No grad for bias
                    (chunk_kd_loss, chunk_ce_loss),
                ) = torch.func.grad_and_value(
                    loss_fn_for_grad, argnums=argnums_for_grad, has_aux=True
                )(
                    student_input_chunk_ac,
                    student_lm_head_weight,
                    None,  # Pass None for student_bias primal
                    target_token_ids_chunk_ac,
                    target_logprobs_chunk_ac,
                    target_mask_chunk_ac,
                    true_labels_chunk_ac,
                )

            grad_weight_acc.add_(chunk_grad_weight)
            kd_loss_acc.add_(chunk_kd_loss)
            ce_loss_acc.add_(chunk_ce_loss)

            return chunk_grad_input

        if compiled:
            accumulate_chunk_grads_compiled = torch.compile(
                accumulate_chunk_grads, dynamic=True, backend="inductor"
            )  # dynamic=True often helpful
        else:
            accumulate_chunk_grads_compiled = accumulate_chunk_grads

        # Use the same chunking logic as LigerFusedLinearDistillationBase.forward
        B, N, D = student_input.shape
        K = target_token_ids.shape[-1]

        student_input_flat = student_input.reshape(-1, student_input.shape[-1])
        target_token_ids_flat = target_token_ids.reshape(-1, target_token_ids.shape[-1])
        target_logprobs_flat = target_logprobs.reshape(-1, target_logprobs.shape[-1])
        target_mask_flat = target_mask.reshape(-1, target_mask.shape[-1])
        # pad and shift for cross entropy loss
        true_labels = torch.nn.functional.pad(true_labels, (0, 1), value=ignore_index)
        true_labels_flat = true_labels[:, 1:].contiguous().view(-1)

        num_chunks = max(1, student_input_flat.shape[0] // CHUNK_SIZE)

        _student_input_chunks = torch.chunk(
            student_input_flat, chunks=num_chunks, dim=0
        )
        _target_token_ids_chunks = torch.chunk(
            target_token_ids_flat, chunks=num_chunks, dim=0
        )
        _target_logprobs_chunks = torch.chunk(
            target_logprobs_flat, chunks=num_chunks, dim=0
        )
        _target_mask_chunks = torch.chunk(target_mask_flat, chunks=num_chunks, dim=0)
        _true_labels_chunks = torch.chunk(true_labels_flat, chunks=num_chunks, dim=0)

        for i in range(num_chunks):
            grad_input_chunk = accumulate_chunk_grads_compiled(
                _student_input_chunks[i],
                _target_token_ids_chunks[i],
                _target_logprobs_chunks[i],
                _target_mask_chunks[i],
                _true_labels_chunks[i],
            )
            grad_inputs_list.append(grad_input_chunk)

        grad_inputs_combined = torch.cat(grad_inputs_list, dim=0)
        ctx.save_for_backward(grad_inputs_combined, grad_weight_acc, grad_bias_acc)

        # For matching None returns in backward for non-tensor/non-grad_requiring inputs
        ctx.hyperparams_count = 9  # Corresponds to number of hyperparams after main tensors in fwd signature
        ctx.bias_was_none = student_lm_head_bias is None
        ctx.orig_dims = (B, N, D, K)

        # since this is packed, there is simply a single batch, so batchmean reduction of kl-div is simply the accumulated sum
        # we still need to scale the kd_loss by the temp^2
        kd_loss_acc = kd_loss_acc * (temperature**2)
        final_loss = weight_soft_loss * kd_loss_acc + weight_hard_loss * ce_loss_acc

        return final_loss

    @staticmethod
    def backward(ctx, grad_output):
        grad_input_flat, grad_weight, grad_bias_maybe = (
            ctx.saved_tensors
        )  # grad_input_flat is (B*N, D)

        # Scale gradients by grad_output if it's not 1.0
        if not torch.equal(
            grad_output,
            torch.tensor(1.0, device=grad_output.device, dtype=grad_output.dtype),
        ):
            grad_input_flat = grad_input_flat * grad_output
            grad_weight = grad_weight * grad_output
            if grad_bias_maybe is not None:
                grad_bias_maybe = grad_bias_maybe * grad_output

        # Reshape grad_input_flat to match original student_input shape (B, N, D)
        # ctx.orig_dims stores (B, N, D, K)
        # We need the first three dimensions for student_input's shape.
        # Ensure that orig_dims are not (0,0,0,K) for empty inputs leading to view errors
        if (
            ctx.orig_dims[0] * ctx.orig_dims[1] * ctx.orig_dims[2] == 0
            and grad_input_flat.numel() == 0
        ):
            # If original input was empty, gradient should also be empty with correct shape
            grad_input_reshaped = torch.zeros(
                ctx.orig_dims[0],
                ctx.orig_dims[1],
                ctx.orig_dims[2],
                dtype=grad_input_flat.dtype,
                device=grad_input_flat.device,
            )
        elif grad_input_flat.numel() == 0 and not (
            ctx.orig_dims[0] * ctx.orig_dims[1] * ctx.orig_dims[2] == 0
        ):
            # This case should ideally not happen if forward path is correct (non-empty input -> non-empty flat grad)
            # but as a safeguard:
            grad_input_reshaped = torch.zeros(
                ctx.orig_dims[0],
                ctx.orig_dims[1],
                ctx.orig_dims[2],
                dtype=grad_input_flat.dtype,
                device=grad_input_flat.device,
            )
        else:
            grad_input_reshaped = grad_input_flat.view(
                ctx.orig_dims[0], ctx.orig_dims[1], ctx.orig_dims[2]
            )

        nones_for_hyperparams = [None] * ctx.hyperparams_count
        grad_bias_return = grad_bias_maybe if not ctx.bias_was_none else None

        return (
            grad_input_reshaped,  # Gradient for student_input (reshaped)
            grad_weight,  # Gradient for student_lm_head_weight
            None,  # Gradient for target_token_ids
            None,  # Gradient for target_logprobs
            None,  # Gradient for target_mask
            None,  # Gradient for true_labels
            grad_bias_return,  # Gradient for student_lm_head_bias
            *nones_for_hyperparams,  # Grads for weight_hard_loss, ..., compute_ce_loss
        )


class LigerFusedLinearKLTopKLogprobLoss(torch.nn.Module):
    """
    wrapper for chunked top-k logprob kl-d
    """

    def __init__(
        self,
        weight_hard_loss: float = 0.5,
        weight_soft_loss: float = 0.5,
        temperature: float = 1.0,  # This is the kd_temperature
        beta: float = 1.0,
        ignore_index: int = -100,
        compiled: bool = True,
        chunk_size: int = 1024,
        compute_ce_loss: bool = True,
        normalize_topk: bool = True,
    ):
        super().__init__()
        if not (0.0 <= weight_hard_loss <= 1.0 and 0.0 <= weight_soft_loss <= 1.0):
            raise ValueError("Loss weights must be between 0.0 and 1.0.")
        if temperature <= 0:
            raise ValueError("Temperature must be positive.")

        self.weight_hard_loss = weight_hard_loss
        self.weight_soft_loss = weight_soft_loss
        self.temperature = temperature
        self.beta = beta
        self.ignore_index = ignore_index
        self.compiled = compiled
        self.chunk_size = chunk_size
        self.compute_ce_loss = compute_ce_loss
        self.normalize_topk = normalize_topk

        if not self.compute_ce_loss and self.weight_hard_loss > 0.0:
            print(
                f"Warning: compute_ce_loss is False, but weight_hard_loss ({self.weight_hard_loss}) > 0. Hard loss will effectively be zero."
            )
            # self.weight_hard_loss = 0.0 # Or let user manage this
        if self.weight_soft_loss == 0.0:
            print(
                "Warning: weight_soft_loss is 0.0. Soft (KD) loss will not be computed."
            )

    def forward(
        self,
        lm_head_weight: torch.Tensor,  # Weights of the linear layer in the LM head
        student_hidden_states: torch.Tensor,  # student_hidden_states before the lm_head
        target_token_ids: torch.Tensor,
        target_logprobs: torch.Tensor,
        target_mask: torch.Tensor,
        true_labels: torch.Tensor,
        student_bias: torch.Tensor = None,
    ) -> torch.Tensor:
        return LigerFusedLinearKLTopKLogprobFunction.apply(
            student_hidden_states,
            lm_head_weight,
            target_token_ids,
            target_logprobs,
            target_mask,
            true_labels,
            student_bias,
            self.weight_hard_loss,
            self.weight_soft_loss,
            self.ignore_index,
            self.temperature,
            self.beta,
            self.compiled,
            self.chunk_size,
            self.compute_ce_loss,
            self.normalize_topk,
        )


================================================
FILE: src/axolotl/integrations/kd/kernels/models.py
================================================
"""
model patcher for chunked top-k kl-div
"""

from typing import Optional, Union, Unpack

import torch
from transformers import Cache
from transformers.modeling_outputs import CausalLMOutputWithPast

try:
    from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
    from transformers.utils import LossKwargs

    class TransformersKwargs(FlashAttentionKwargs, LossKwargs):
        """
        placeholder kwargs for hf model classes
        """

except ImportError:
    from transformers.utils.generic import (  # type: ignore[no-redef]
        TransformersKwargs,
    )

from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix


def kldiv_forward_llama_like(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    target_logprobs: Optional[torch.Tensor] = None,
    target_token_ids: Optional[torch.LongTensor] = None,
    target_mask: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Cache] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **kwargs: Unpack[TransformersKwargs],  # type: ignore[misc]
) -> CausalLMOutputWithPast:
    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        cache_position=cache_position,
        **kwargs,
    )

    hidden_states = outputs.last_hidden_state

    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
    # TODO, we can optimize this further by filtering hidden_states on sequence dimension using labels != -100
    # self._loss_function should be LigerFusedLinearKLTopKLogprobLoss

    loss = self._loss_function(
        self.lm_head.weight,
        hidden_states,
        target_token_ids,
        target_logprobs,
        target_mask,
        true_labels=labels,
    )
    num_items_in_batch = kwargs.pop("num_items_in_batch", -1)
    if num_items_in_batch is not None and num_items_in_batch > 0:
        loss = loss / num_items_in_batch

    return CausalLMOutputWithPast(
        loss=loss,
        logits=None,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )


def apply_kernel(model_type):
    # Dynamically import the module and attention class
    module_path = f"transformers.models.{model_type}.modeling_{model_type}"
    model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type)
    module = __import__(module_path, fromlist=[f"{model_cls_prefix}ForCausalLM"])
    model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM")
    model_cls.forward = kldiv_forward_llama_like


================================================
FILE: src/axolotl/integrations/kd/topk_logprob/__init__.py
================================================


================================================
FILE: src/axolotl/integrations/kd/topk_logprob/forward_kl.py
================================================
# Copyright 2024 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
loss for top_k KL divergence
"""

import torch
from torch import nn


@torch.jit.script
def loss(
    student_logits: torch.Tensor,
    target_token_ids: torch.Tensor,
    target_logprobs: torch.Tensor,
    target_mask: torch.Tensor,
    num_items_in_batch: int = -1,  # Use -1 to indicate "None"
    kd_temperature: float = 1.0,
) -> torch.Tensor:
    """
    A KD loss function that is TorchScript-friendly.

    Arguments:
        student_logits (torch.Tensor): The logits of the student model.
            Shape: [B, student_seq_len, vocab_size]
        target_token_ids (torch.Tensor): The top-k teacher/target token IDs
            Shape: [B, teacher_seq_len, top_k]
        target_logprobs (torch.Tensor): The top-k teacher/target logprobs, these should already be re-normalized.
            Shape: [B, teacher_seq_len, top_k]
        target_mask (torch.Tensor): The mask for valid tokens.
            Shape: [B, teacher_seq_len, top_k]
        num_items_in_batch (int, optional): The number of items in the batch.
        kd_temperature (float, optional): The temperature for KD.
            Default: 1.0
    """

    target_logprobs = target_logprobs.float()

    # Determine the teacher sequence length
    # target_token_ids shape: [B, teacher_seq_len, K]
    # student_logits shape:   [B, student_seq_len, vocab_size]
    teacher_seq_len = target_token_ids.shape[1]

    # Slice student logits to match teacher-provided sequence length
    student_logits_for_kd = (
        student_logits[:, :teacher_seq_len, :] / kd_temperature
    )  # [B, teacher_seq_len, vocab_size]

    # keep in full precision for numerical stability of loss
    student_logits_for_kd = student_logits_for_kd.float()

    # Gather student logits for teacher's top-K tokens
    student_logits_topk = torch.gather(
        student_logits_for_kd, dim=-1, index=target_token_ids
    )  # [B, teacher_seq_len, K]

    # Compute logsumexp across full vocabulary
    student_lse = torch.logsumexp(student_logits_for_kd, dim=-1, keepdim=True)

    #  Convert just the top-k logits to logprobs
    student_logprobs_topk = student_logits_topk - student_lse

    # Convert teacher_mask to boolean for indexing
    # In TorchScript, .bool() is sometimes unsupported, so we do:
    valid_mask = target_mask.to(torch.bool)

    # Prune tensors to only keep valid tokens
    student_logprobs_topk = student_logprobs_topk[valid_mask]
    target_logprobs = target_logprobs[valid_mask]

    # Convert teacher logprobs to probabilities
    teacher_probs = target_logprobs.exp()

    # Compute forward KL
    kd_loss_per_token = teacher_probs * (target_logprobs - student_logprobs_topk)
    kd_loss = kd_loss_per_token.sum()

    # Normalize by number of items (if provided) or by valid tokens
    if num_items_in_batch > 0:
        kd_loss = kd_loss / float(num_items_in_batch)
    else:
        # Fall back to average over valid tokens
        kd_loss = kd_loss / float(kd_loss_per_token.size(0))

    return kd_loss


class ChunkedTopKKDLoss(nn.Module):
    """
    A wrapper that chunks (splits) the student and teacher outputs along the time dimension
    to reduce peak memory usage when upcasting from bf16 to fp32, especially for large vocabularies.

    Usage is analogous to ForwardKLWithChunkedOutputLoss but adapted to top-K teacher logprobs.
    """

    def __init__(self, num_output_chunks: int = 8, kd_temperature: float = 1.0):
        super().__init__()
        self.num_output_chunks = num_output_chunks
        self.kd_temperature = kd_temperature

    def forward(
        self,
        student_logits: torch.Tensor,  # [B, seq_len, vocab_size]
        target_token_ids: torch.Tensor,  # [B, seq_len, K]
        target_logprobs: torch.Tensor,  # [B, seq_len, K]
        target_mask: torch.Tensor,  # [B, seq_len, K]
        num_items_in_batch: int = -1,  # optional batch size for normalization
    ) -> torch.Tensor:
        # 1. Split along the "token" dimension (dim=1).
        student_logits_chunks = student_logits.chunk(self.num_output_chunks, dim=1)
        token_ids_chunks = target_token_ids.chunk(self.num_output_chunks, dim=1)
        logprobs_chunks = target_logprobs.chunk(self.num_output_chunks, dim=1)
        mask_chunks = target_mask.chunk(self.num_output_chunks, dim=1)

        # We'll accumulate a global "sum of losses" and "sum of valid tokens"
        # so that our final average is consistent with the entire sequence/batch.
        total_loss = 0.0
        total_valid_tokens = 0

        # 2. Loop over each chunk and compute a chunk-specific loss.
        for st_chunk, tid_chunk, lp_chunk, msk_chunk in zip(
            student_logits_chunks,
            token_ids_chunks,
            logprobs_chunks,
            mask_chunks,
            strict=False,
        ):
            # We pass num_items_in_batch=-1 so that the kd_loss
            # will average over *this chunk's* valid tokens only.
            chunk_loss = loss(
                student_logits=st_chunk,
                target_token_ids=tid_chunk,
                target_logprobs=lp_chunk,
                target_mask=msk_chunk,
                num_items_in_batch=-1,  # ensure per-chunk averaging by valid tokens
                kd_temperature=self.kd_temperature,
            )

            # kd_loss returns an average over the chunk's valid tokens.
            # We want a global average in the end, so we need to re‐weight
            # by the number of valid tokens in this chunk and keep track of the total.
            chunk_valid_mask = msk_chunk.to(torch.bool)
            chunk_valid_count = chunk_valid_mask.sum()  # scalar tensor

            # Re-scale "chunk average" back to "chunk sum"
            chunk_loss_sum = chunk_loss * chunk_valid_count

            total_loss += chunk_loss_sum
            total_valid_tokens += chunk_valid_count

        # 3. Normalize *once* at the end.
        if num_items_in_batch > 0:
            # If the user gave us a manual denominator (e.g. total items in batch),
            # we divide by it. Typically used if each item is of different length.
            final_loss = total_loss / float(num_items_in_batch)
        else:
            # Otherwise, divide by total valid tokens across all chunks.
            # to get the same result as a non-chunked approach.
            final_loss = total_loss / float(total_valid_tokens)

        return final_loss


================================================
FILE: src/axolotl/integrations/kd/trainer.py
================================================
# Copyright 2024 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
KD trainer
"""

from typing_extensions import override

from axolotl.core.trainers.base import AxolotlTrainer

from .kernels.liger import LigerFusedLinearKLTopKLogprobLoss


class AxolotlKDTrainer(AxolotlTrainer):
    """
    Custom trainer subclass for Knowledge Distillation (KD)
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.model_accepts_loss_kwargs = True

        loss_fn = LigerFusedLinearKLTopKLogprobLoss(
            self.args.kd_ce_alpha,  # hard label loss
            self.args.kd_alpha,  # kd loss
            self.args.kd_temperature,
            self.args.kd_beta or 0.0,
            compute_ce_loss=bool(self.args.kd_ce_alpha),
            normalize_topk=self.args.kd_normalize_topk,
        )
        target = self.model

        # Unwrap PEFT wrapper
        if hasattr(target, "get_base_model"):
            target = target.get_base_model()

        # Set on the actual model instance
        target._loss_function = loss_fn

    def _set_signature_columns_if_needed(self):
        super()._set_signature_columns_if_needed()
        columns_to_add = []
        if self._signature_columns:
            if "target_logprobs" not in self._signature_columns:
                columns_to_add.append("target_logprobs")
            if "target_token_ids" not in self._signature_columns:
                columns_to_add.append("target_token_ids")
            if "target_mask" not in self._signature_columns:
                columns_to_add.append("target_mask")
            if columns_to_add:
                self._signature_columns += columns_to_add

    @override
    def compute_loss(
        self,
        model,
        inputs,
        return_outputs=False,
        num_items_in_batch=None,
    ):
        """
        How the loss is computed by Trainer. By default, all models return the loss in the first element.

        Subclass and override for custom behavior.
        """
        if (
            self.args.sample_packing
            and hasattr(inputs, "attention_mask")
            and hasattr(inputs, "position_ids")
        ):
            del inputs["attention_mask"]

        if num_items_in_batch is None and "labels" in inputs:
            num_items_in_batch = (inputs["labels"] != -100).sum().item()

        if self.model_accepts_loss_kwargs:
            loss_kwargs = {}
            if num_items_in_batch is not None:
                loss_kwargs["num_items_in_batch"] = num_items_in_batch
            inputs = {**inputs, **loss_kwargs}

        outputs = model(**inputs)

        if isinstance(outputs, dict):
            loss = outputs["loss"]
        elif isinstance(outputs, tuple):
            loss = outputs[0]
        else:
            loss = outputs.loss if hasattr(outputs, "loss") else outputs

        return (loss, outputs) if return_outputs else loss


================================================
FILE: src/axolotl/integrations/kd/utils.py
================================================
"""Helper KD utils"""

import math
from typing import List, Union

import numpy as np
import torch
from torch import FloatTensor, Tensor


def normalize_logprobs(logprobs: FloatTensor, topk: int) -> FloatTensor:
    """
    Re-normalizes top-k raw logprobs as probabilities, and converts back to logprobs.
    """
    # Ensure raw_logprobs matches kd_online_topk length for tensor operations
    # This should ideally be handled by the caller ensuring correct padding/truncation first
    if logprobs.shape[-1] != topk:
        # pad last dimension of logprobs to match topk length with -inf
        padding_len = topk - logprobs.shape[-1]
        padding_tensor = torch.full(
            (
                *logprobs.shape[:-1],
                padding_len,
            ),  # Takes all dimensions of logprobs except the last, then appends padding_needed
            float("-inf"),
            dtype=logprobs.dtype,
            device=logprobs.device,
        )
        logprobs = torch.cat((logprobs, padding_tensor), dim=-1)

    # Convert logprobs at T_online to probabilities
    # use log sum exp trick to avoid underflow
    position_logprobs_lse = torch.logsumexp(logprobs, dim=-1, keepdim=True)
    teacher_probs_t_online = torch.exp(logprobs - position_logprobs_lse)

    # Normalize probabilities (sum to 1)
    # This is important if the top-k from server aren't a full distribution
    teacher_probs_t_online_sum = teacher_probs_t_online.sum(dim=-1, keepdim=True)
    teacher_probs_t_online = teacher_probs_t_online / teacher_probs_t_online_sum

    final_logprobs_tensor = torch.log(teacher_probs_t_online)

    return final_logprobs_tensor


def strided_chunk_views(
    tensor: Union[np.ndarray, torch.Tensor],
    chunks: int,
    dim: int = 0,
    stride: int = 1,
    chunk_size: int | None = None,
) -> List[Union[np.ndarray, torch.Tensor]]:
    """
    Split a tensor into chunks along a dimension with striding, prioritizing views over copies.

    Args:
        tensor: Input tensor (numpy array or torch tensor)
        chunks: Number of chunks to create
        dim: Dimension along which to chunk (default: 0)
        stride: Stride between chunk starting positions (default: 1)
        chunk_size: Size of each chunk. If None, calculated automatically (default: None)

    Returns:
        List of tensor chunks (views when possible, copies when necessary)
    """

    # Get the size of the specified dimension
    dim_size = tensor.shape[dim]

    # Calculate chunk size if not provided
    if chunk_size is None:
        chunk_size = (dim_size + chunks - 1) // chunks  # Ceiling division

    chunks_list = []

    for i in range(chunks):
        start_idx = i * stride
        end_idx = min(start_idx + chunk_size, dim_size)

        # Break if we've gone beyond the tensor
        if start_idx >= dim_size:
            break

        # Create slice objects for all dimensions
        slices = [slice(None)] * tensor.ndim
        slices[dim] = slice(start_idx, end_idx)

        chunk = tensor[tuple(slices)]
        chunks_list.append(chunk)

    return chunks_list


def chunk_overlap(input_tensor: Tensor, chunks: int, dim: int = 0, overlap: int = 1):
    dim_size = input_tensor.shape[dim]
    stride = math.ceil(dim_size / chunks)

    return strided_chunk_views(
        input_tensor, chunks, dim, stride=stride, chunk_size=stride + overlap
    )


================================================
FILE: src/axolotl/integrations/kernels/README.md
================================================
# Kernels Integration

MoE (Mixture of Experts) kernels speed up training for MoE layers and reduce VRAM costs. In transformers v5, `batched_mm` and `grouped_mm` were integrated as built-in options via the `experts_implementation` config kwarg:

```python
class ExpertsInterface(GeneralInterface):
    _global_mapping = {
        "batched_mm": batched_mm_experts_forward,
        "grouped_mm": grouped_mm_experts_forward,
    }
```

In our custom integration, we add support for **ScatterMoE** and **SonicMoE**, which are more efficient and faster than `grouped_mm`.

## Usage

Add the following to your axolotl YAML config:

```yaml
plugins:
  - axolotl.integrations.kernels.KernelsPlugin

use_kernels: true

# Choose one (mutually exclusive):
use_scattermoe: true
# OR
use_sonicmoe: true
```

**Important:** Setting `experts_implementation` is incompatible with custom kernel options.

### SonicMoE installation

**Prerequisites:**
- NVIDIA Hopper (H100, H200) or Blackwell (B200, GB200) GPU
- CUDA 12.9+ (13.0+ for B300)
- PyTorch 2.7+ (2.9.1 recommended)
- For B300: Triton 3.6.0

```bash
pip install --ignore-requires-python --no-deps "sonic-moe @ git+https://github.com/Dao-AILab/sonic-moe.git@116e2df0a41874f77fa0ad269ce7df3f0cfcb956" && pip install nvidia-cutlass-dsl==4.4.0 quack-kernels==0.2.5
```

See the [SonicMoE installation guide](https://github.com/Dao-AILab/sonic-moe?tab=readme-ov-file#-installation) for the latest prerequisite details.

**Note:** Blackwell support is in upstream beta. On Blackwell GPUs, Axolotl automatically sets `USE_QUACK_GEMM=1` to enable the Blackwell kernels.

## How It Works

The `KernelsPlugin` runs before model loading and:

### ScatterMoE
1. Registers the ScatterMoE kernel from the local `libs/scattermoe_lora` package (includes fused LoRA support via Triton kernels).
2. Patches the model's `SparseMoeBlock` forward method with the optimized ScatterMoE implementation.

### SonicMoE
1. Resolves the model's MoE block class(es) from `constants.py`.
2. Patches the forward method with SonicMoE's optimized kernels and registers a weight converter for the interleaved gate/up projection format.
3. Supports both softmax->topk and sigmoid->topk routing strategies.

Both paths use the shared `resolve_moe_block_classes` utility in `constants.py` for model-type-to-class resolution.

#### Supported Models

See `constants.py` for the full list of supported model types (Qwen2-MoE, Qwen3-MoE, OLMoE, Mixtral, DeepSeek-V3, GLM-MoE, MiniMax, etc.).

## Limitations

ScatterMoE uses a softmax -> topk routing, so results may be different for some model architectures as baseline (GPT-OSS, etc). Incompatible with `GLM_MOE_DSA` (GLM 5) and `GLM4_MOE_LITE` (GLM 4.7 Flash) at the moment.

SonicMoE supports both softmax->topk and sigmoid->topk routing, covering a wider range of architectures.

ScatterMoE does not work for GLM4.7 Flash (glm4_moe_lite) atm.

## Note on MegaBlocks

We tested [MegaBlocks](https://huggingface.co/kernels-community/megablocks) but were unable to ensure numerical accuracy, so we did not integrate it. It was also incompatible with many newer model architectures in transformers.


================================================
FILE: src/axolotl/integrations/kernels/__init__.py
================================================
from .args import KernelsArgs
from .plugin import KernelsPlugin

__all__ = [
    "KernelsArgs",
    "KernelsPlugin",
]


================================================
FILE: src/axolotl/integrations/kernels/args.py
================================================
from pydantic import BaseModel, model_validator

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


class KernelsArgs(BaseModel):
    use_scattermoe: bool | None = None
    use_sonicmoe: bool | None = None

    @model_validator(mode="before")
    @classmethod
    def check_mutually_exclusive(cls, data):
        if data.get("use_scattermoe") and data.get("use_sonicmoe"):
            raise ValueError(
                "Cannot use both ScatterMoE and SonicMoE simultaneously. "
                "Please set only one of `use_scattermoe` or `use_sonicmoe` to true."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_use_kernels(cls, data):
        if data.get("use_kernels") is not True:
            LOG.warning(
                "`use_kernels` must be set to True to use this. Automatically setting it to True."
            )
            data["use_kernels"] = True

        return data

    @model_validator(mode="before")
    @classmethod
    def check_experts_implementation(cls, data):
        experts_implementation = data.get("experts_implementation")
        if experts_implementation is None:
            # transformers may default to batched_mm when unset
            data["experts_implementation"] = "eager"
        elif experts_implementation != "eager":
            LOG.warning(
                "`experts_implementation` must be set to 'eager' to use this. Automatically setting it to 'eager'."
            )
            data["experts_implementation"] = "eager"

        return data

    @model_validator(mode="before")
    @classmethod
    def disable_mlp_kernel(cls, data):
        if data.get("use_scattermoe") is True or data.get("use_sonicmoe") is True:
            if data.get("lora_mlp_kernel") is True:
                LOG.warning(
                    "Disabling lora_mlp_kernel when using custom MoE kernels due to compatibility issues."
                )
                data["lora_mlp_kernel"] = False
            data["mlp_kernel"] = False

        return data


================================================
FILE: src/axolotl/integrations/kernels/autotune_callback.py
================================================
"""Trainer callback for reporting Triton autotune results from scattermoe-lora kernels."""

import logging

import torch
from transformers import (
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)

LOG = logging.getLogger(__name__)

# Give up looking for autotune data after this many training steps.
_MAX_POLL_STEP = 5


def _get_gpu_info() -> dict:
    """Return basic GPU identification for the current device."""
    if not torch.cuda.is_available():
        return {}
    try:
        idx = torch.cuda.current_device()
        props = torch.cuda.get_device_properties(idx)
        return {
            "gpu_name": props.name,
            "gpu_compute_capability": f"{props.major}.{props.minor}",
            "gpu_memory_bytes": props.total_memory,
        }
    except Exception:  # pylint: disable=broad-exception-caught
        return {}


def _get_smem_capacity() -> dict:
    """Return shared memory capacity from the runtime lora_ops module."""
    try:
        from axolotl.integrations.kernels.autotune_collector import (
            _find_lora_ops_module,
        )

        lora_ops = _find_lora_ops_module()
        if lora_ops is None:
            return {}
        fn = getattr(lora_ops, "_get_smem_capacity", None)
        if fn is None:
            return {}
        return {"smem_capacity_bytes": fn()}
    except Exception:  # pylint: disable=broad-exception-caught
        return {}


class AutotuneReportCallback(TrainerCallback):
    """Reports Triton kernel autotune selections via telemetry.

    Fires **once** after the first training step completes (step 1), at
    which point the forward and backward passes have both run and the
    autotuned kernels have populated their caches.  If for some reason
    the caches are still empty (e.g. the kernel was never invoked), the
    callback retries on subsequent steps up to ``_MAX_POLL_STEP`` and
    then stops polling.

    After reporting (or giving up) every subsequent ``on_step_end``
    call short-circuits on the ``_reported`` flag — zero hot-path cost.
    """

    def __init__(self):
        self._reported = False

    # pylint: disable=unused-argument
    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if self._reported:
            return

        # Lazy import — Triton / scattermoe kernels may not be installed.
        from axolotl.integrations.kernels.autotune_collector import (
            collect_autotune_configs,
        )

        configs = collect_autotune_configs()

        if not configs:
            if state.global_step >= _MAX_POLL_STEP:
                LOG.debug(
                    "No autotune data found after %d steps; giving up.",
                    state.global_step,
                )
                self._reported = True
            return

        self._reported = True

        from axolotl.telemetry.manager import TelemetryManager

        telemetry_manager = TelemetryManager.get_instance()
        if not telemetry_manager.enabled:
            return

        properties = {
            "kernel_count": len(configs),
            "kernels": configs,
        }
        properties.update(_get_gpu_info())
        properties.update(_get_smem_capacity())

        telemetry_manager.send_event(
            event_type="scattermoe-autotune",
            properties=properties,
        )

        LOG.info(
            "Reported %d scattermoe kernel autotune config(s) to telemetry.",
            len(configs),
        )


================================================
FILE: src/axolotl/integrations/kernels/autotune_collector.py
================================================
"""Collect Triton autotune results from scattermoe-lora kernels.

This module reads the ``.cache`` attribute from Triton ``@triton.autotune``
decorated kernel objects and returns structured dicts describing the selected
configurations.  It has **no** telemetry dependency — callers decide what to
do with the data.
"""

import logging
import sys
from types import ModuleType
from typing import Any

LOG = logging.getLogger(__name__)

# (human-readable name, attribute on the lora_ops module)
_KERNEL_REGISTRY: list[tuple[str, str]] = [
    ("scatter2scatter_lora_fwd", "_scatter2scatter_lora"),
    ("scatter2scatter_lora_dX", "_scatter2scatter_lora_dX"),
    ("group_bwd_lora", "_group_bwd_lora"),
    ("group_bwd_lora_fused", "_group_bwd_lora_fused"),
]

# The autotune key declared on every kernel: key=["M", "N", "K"]
_KEY_NAMES: list[str] = ["M", "N", "K"]


def _parse_key_tuple(key_tuple: tuple) -> dict[str, Any]:
    """Turn the autotune cache key tuple into a labelled dict.

    Triton builds the cache key from the values of the declared ``key``
    args (``M``, ``N``, ``K``) followed by dtype signature elements.
    We label the first three and store the rest under ``_extra``.
    """
    result: dict[str, Any] = {}
    for i, name in enumerate(_KEY_NAMES):
        if i < len(key_tuple):
            result[name] = key_tuple[i]
    if len(key_tuple) > len(_KEY_NAMES):
        result["_extra"] = [str(v) for v in key_tuple[len(_KEY_NAMES) :]]
    return result


def _find_lora_ops_module() -> ModuleType | None:
    """Locate the *runtime* ``lora_ops`` module in ``sys.modules``.

    The HF ``kernels`` package loads ``scattermoe_lora`` via
    ``import_from_path`` which registers it in ``sys.modules`` under a
    hash-suffixed name (e.g. ``scattermoe_lora_a1b2c3d4``).  A normal
    import (``from axolotl.integrations.kernels...``) would create a
    *separate* module instance whose kernel objects have empty
    ``.cache`` dicts because autotuning ran on the runtime copy.

    We search ``sys.modules`` for any module whose name contains
    ``lora_ops`` and that has the ``_scatter2scatter_lora`` kernel
    attribute — that is the runtime copy with populated caches.
    """
    for name, module in list(sys.modules.items()):
        if (
            module is not None
            and "lora_ops" in name
            and hasattr(module, "_scatter2scatter_lora")
        ):
            return module
    return None


def collect_autotune_configs() -> list[dict[str, Any]]:
    """Read autotune caches from the four scattermoe-lora kernels.

    Returns a (possibly empty) list of dicts, each containing:

    * ``kernel`` – human-readable kernel name
    * ``key``    – dict with the ``M``/``N``/``K`` problem dimensions
    * ``config`` – dict with the selected tile sizes, ``num_warps``,
      and ``num_stages``

    Returns ``[]`` if the kernel module cannot be found or if no
    autotune cache entries exist yet.
    """
    lora_ops = _find_lora_ops_module()
    if lora_ops is None:
        LOG.debug(
            "lora_ops module not found in sys.modules; skipping autotune collection"
        )
        return []

    results: list[dict[str, Any]] = []

    for friendly_name, attr_name in _KERNEL_REGISTRY:
        kernel_fn = getattr(lora_ops, attr_name, None)
        if kernel_fn is None:
            continue

        cache = getattr(kernel_fn, "cache", None)
        if not cache:
            continue

        for key_tuple, config in cache.items():
            config_dict = dict(config.kwargs)
            config_dict["num_warps"] = config.num_warps
            config_dict["num_stages"] = config.num_stages
            if getattr(config, "num_ctas", None) is not None:
                config_dict["num_ctas"] = config.num_ctas

            results.append(
                {
                    "kernel": friendly_name,
                    "key": _parse_key_tuple(key_tuple),
                    "config": config_dict,
                }
            )

    return results


================================================
FILE: src/axolotl/integrations/kernels/constants.py
================================================
"""
Supported MoE block mappings for kernel integrations.

Maps model_type to the SparseMoeBlock class name(s) in transformers.
Used by both ScatterMoE and SonicMoE kernel paths.

Values can be a single class name (str) or a list of class names for models
with multiple MoE block types (e.g. qwen3_omni_moe has Thinker + Talker).
"""

import importlib

SPARSE_MOE_BLOCK = {
    # softmax -> topk routing
    "qwen2_moe": "Qwen2MoeSparseMoeBlock",
    "qwen3_moe": "Qwen3MoeSparseMoeBlock",
    "qwen3_5_moe": "Qwen3_5MoeSparseMoeBlock",
    "qwen3_5_moe_text": "Qwen3_5MoeSparseMoeBlock",
    "qwen3_next": "Qwen3NextSparseMoeBlock",
    "qwen3_vl_moe": "Qwen3VLMoeTextSparseMoeBlock",
    # qwen3_omni_moe: Thinker (standard) + Talker (shared experts + shared_expert_gate)
    "qwen3_omni_moe": [
        "Qwen3OmniMoeThinkerTextSparseMoeBlock",
        "Qwen3OmniMoeTalkerTextSparseMoeBlock",
    ],
    "olmoe": "OlmoeSparseMoeBlock",
    "mixtral": "MixtralSparseMoeBlock",
    "minimax": "MiniMaxSparseMoeBlock",
    # softmax -> topk routing (with group-based expert selection)
    "mistral4": "Mistral4MoE",
    # sigmoid -> topk routing (with group-based expert selection)
    "glm_moe_dsa": "GlmMoeDsaMoE",
    "deepseek_v3": "DeepseekV3MoE",
    "glm4_moe": "Glm4MoeMoE",
    "glm4_moe_lite": "Glm4MoeLiteMoE",
    "glm4v_moe": "Glm4vMoeTextMoE",
    # sigmoid -> topk routing (no group selection)
    "minimax_m2": "MiniMaxM2SparseMoeBlock",
    # Models below need custom routing (not yet implemented):
    # "ernie4_5_moe": "Ernie4_5_MoeSparseMoeBlock",  # softmax->topk, e_score_correction_bias between softmax and topk
    # "deepseek_v2": "DeepseekV2Moe",  # softmax->topk, group_limited_greedy, different attr names (num_group)
    # "hunyuan_v1_moe": "HunYuanMoEV1Moe",  # softmax->topk, gate.wg (not gate.weight), scatter routing
    # "gpt_oss": "GptOssMLP",  # topk->softmax, transposed layout [E,H,2*I], custom GLU, expert biases
}


def resolve_moe_block_classes(model_type: str):
    """Resolve all MoE block classes from transformers for the given model type.

    Returns a list of classes (one for most models, multiple for models with
    distinct MoE block types like qwen3_omni_moe).
    """
    entry = SPARSE_MOE_BLOCK.get(model_type)
    if entry is None:
        raise ValueError(
            f"Unsupported MoE model type '{model_type}'. "
            f"Supported types: {list(SPARSE_MOE_BLOCK.keys())}"
        )

    cls_names = entry if isinstance(entry, list) else [entry]
    module_path = f"transformers.models.{model_type}.modeling_{model_type}"
    try:
        module = importlib.import_module(module_path)
    except ModuleNotFoundError:
        # Text sub-model types (e.g. qwen3_5_moe_text) share the parent module
        if model_type.endswith("_text"):
            parent_type = model_type.removesuffix("_text")
            module_path = f"transformers.models.{parent_type}.modeling_{parent_type}"
            module = importlib.import_module(module_path)
        else:
            raise

    classes = []
    for cls_name in cls_names:
        moe_cls = getattr(module, cls_name, None)
        if moe_cls is None:
            raise ValueError(f"Could not find class '{cls_name}' in '{module_path}'")
        classes.append(moe_cls)

    return classes


================================================
FILE: src/axolotl/integrations/kernels/libs/__init__.py
================================================


================================================
FILE: src/axolotl/integrations/kernels/libs/scattermoe_lora/__init__.py
================================================
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Axolotl AI
# Licensed under the Apache License, Version 2.0

from . import layers
from .lora_ops import ParallelExperts
from .parallel_experts import flatten_sort_count, parallel_linear
from .parallel_linear_lora import ScatterMoELoRA, parallel_linear_lora

__all__ = [
    "layers",
    "ParallelExperts",
    "flatten_sort_count",
    "parallel_linear",
    "ScatterMoELoRA",
    "parallel_linear_lora",
    "lora_ops",
]


================================================
FILE: src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/__init__.py
================================================
# SPDX-License-Identifier: Apache-2.0
#
# Original work Copyright (c) Shawn Tan and ScatterMoE Contributors
# Adapted from https://github.com/shawntan/scattermoe
# See https://github.com/shawntan/scattermoe/blob/main/LICENSE
#
# Modifications and LoRA adaptation Copyright (c) Axolotl AI
# Licensed under the Apache License, Version 2.0

from . import lora_ops, ops

__all__ = ["ops", "lora_ops"]


================================================
FILE: src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/lora_ops.py
================================================
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Axolotl AI
# Licensed under the Apache License, Version 2.0

"""
Fused ScatterMoE + LoRA Triton Kernels
=======================================

Provides fused forward and backward kernels for ScatterMoE with LoRA adapters.

Forward: Y = X @ W + scaling * (X @ A^T) @ B^T
Backward (LoRA training, W frozen):
  - dX = dY @ W^T + scaling * (dY @ B) @ A    (input gradient)
  - dA = scaling * (dY @ B)^T @ X              (LoRA A gradient)
  - dB = scaling * dY^T @ (X @ A^T)            (LoRA B gradient)

LoRA weight layout (from PEFT ParamWrapper):
  - A: [r*E, K]  -- for expert e, rows [e*r : (e+1)*r] give A_e of shape [r, K]
  - B: [N, r*E]  -- for expert e, cols [e*r : (e+1)*r] give B_e of shape [N, r]

Key design decisions:
  - The forward kernel fuses X@W and X@A^T in the same K-loop for data reuse on X,
    then computes (X@A^T) @ B^T in the epilogue.
  - The backward dA/dB kernel operates on grouped (expert-contiguous) data and
    iterates over tokens per expert, accumulating gradients in registers.
  - R (LoRA rank) is a tl.constexpr, allowing tl.arange(0, R). We pad R to a
    power-of-2 for Triton tile compatibility; typical ranks (4, 8, 16, 32, 64)
    already satisfy this.
"""

from itertools import product
from typing import Optional

import torch
import triton
import triton.language as tl

# =============================================================================
# Configuration
# =============================================================================

BLOCK_M = 128
ALLOW_TF32 = True


def _next_power_of_2(n: int) -> int:
    """Round up to next power of 2."""
    n -= 1
    n |= n >> 1
    n |= n >> 2
    n |= n >> 4
    n |= n >> 8
    n |= n >> 16
    return n + 1


# Triton tl.dot requires minimum tile dimensions of 16 on modern GPUs.
MIN_TRITON_DOT_SIZE = 16


def _block_r_for_rank(r: int) -> int:
    """Compute BLOCK_R: next power-of-2 >= max(r, MIN_TRITON_DOT_SIZE)."""
    return _next_power_of_2(max(r, MIN_TRITON_DOT_SIZE))


# =============================================================================
# Token Rounding: pad expert counts to BLOCK_M multiples
# =============================================================================


def round_expert_counts(
    sorted_expert_idxs: torch.Tensor,
    sorted_scattered_idxs: torch.Tensor,
    expert_offsets: torch.Tensor,
    E: int,
    block_m: int = BLOCK_M,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Pad each expert's token count to a multiple of block_m to eliminate
    partial-tile waste in the backward kernel.

    Padding is done by duplicating the last valid token index for each expert.
    The kernel's M_mask = M_idx < real_end_idx masks these padding entries, so
    correctness is preserved (they contribute 0 to the accumulation via other=0.0).

    This only helps the backward dA/dB kernel where per-expert iteration is
    explicit. The forward scatter2scatter kernel handles partial tiles via masking.

    Args:
        sorted_expert_idxs: Expert assignments sorted [M*k]
        sorted_scattered_idxs: Original indices sorted [M*k]
        expert_offsets: Cumulative token counts per expert [E]
        E: Number of experts
        block_m: Block size for token dimension (default: BLOCK_M)

    Returns:
        padded_expert_idxs: [M_padded] expert assignments with padding
        padded_scattered_idxs: [M_padded] original indices with padding
        padded_offsets: [E] cumulative padded counts (for kernel iteration range)
        real_offsets: [E] original cumulative counts (for M_mask in kernel)
    """
    device = sorted_expert_idxs.device

    # Compute per-expert counts
    counts = torch.zeros(E, dtype=torch.int64, device=device)
    prev = 0
    for e in range(E):
        curr = expert_offsets[e].item()
        counts[e] = curr - prev
        prev = curr

    # Round up each count to multiple of block_m
    padded_counts = ((counts + block_m - 1) // block_m) * block_m
    # Experts with 0 tokens stay at 0
    padded_counts = torch.where(
        counts > 0, padded_counts, torch.zeros_like(padded_counts)
    )
    total_padded = padded_counts.sum().item()

    padded_expert_idxs = torch.empty(
        total_padded, dtype=sorted_expert_idxs.dtype, device=device
    )
    padded_scattered_idxs = torch.empty(
        total_padded, dtype=sorted_scattered_idxs.dtype, device=device
    )

    src_offset = 0
    dst_offset = 0
    for e in range(E):
        count = counts[e].item()
        padded_count = padded_counts[e].item()

        if count > 0:
            # Copy original tokens
            padded_expert_idxs[dst_offset : dst_offset + count] = sorted_expert_idxs[
                src_offset : src_offset + count
            ]
            padded_scattered_idxs[dst_offset : dst_offset + count] = (
                sorted_scattered_idxs[src_offset : src_offset + count]
            )

            # Pad with last valid token (masked out by kernel via M_mask)
            if padded_count > count:
                padded_expert_idxs[dst_offset + count : dst_offset + padded_count] = (
                    sorted_expert_idxs[src_offset + count - 1]
                )
                padded_scattered_idxs[
                    dst_offset + count : dst_offset + padded_count
                ] = sorted_scattered_idxs[src_offset + count - 1]

        src_offset += count
        dst_offset += padded_count

    # Padded offsets: cumulative padded counts (for iteration range in kernel)
    padded_offsets = padded_counts.cumsum(-1).to(expert_offsets.dtype)
    # Real offsets: original cumulative counts (for M_mask in kernel)
    real_offsets = expert_offsets.clone()

    return padded_expert_idxs, padded_scattered_idxs, padded_offsets, real_offsets


# =============================================================================
# Autotuning: SMEM estimation and config pruning
# =============================================================================

_SMEM_CAPACITY: int | None = None


def _get_smem_capacity() -> int:
    """Get device shared memory capacity (bytes). Cached after first call."""
    global _SMEM_CAPACITY
    if _SMEM_CAPACITY is None:
        props = triton.runtime.driver.active.utils.get_device_properties(
            torch.cuda.current_device()
        )
        _SMEM_CAPACITY = props["max_shared_mem"]
    return _SMEM_CAPACITY


def _estimate_smem_usage(
    num_stages: int, BLOCK_M: int, BLOCK_N: int, BLOCK_K: int, dtype_bytes: int = 2
) -> int:
    """Estimate shared memory in bytes for a GEMM-style tile.

    Formula: stages * BLOCK_K * (BLOCK_M + BLOCK_N) + BLOCK_M * BLOCK_N
    Multiply by dtype_bytes (2 for fp16/bf16).
    """
    return (
        num_stages * BLOCK_K * (BLOCK_M + BLOCK_N) + BLOCK_M * BLOCK_N
    ) * dtype_bytes


# Conservative margin (bytes) subtracted from SMEM capacity to account for
# estimation inaccuracies and kernel overhead (registers spilled to SMEM, etc.)
_SMEM_SLACK = 10_000


def _estimate_register_pressure(
    num_warps: int,
    *tile_sizes: tuple[int, int],
) -> float:
    """Rough estimate of per-thread register footprint from live tile sizes.

    This is a heuristic, NOT an accurate register count.  Triton uses tensor
    core MMA fragments that pack multiple elements per register, and can spill
    to local memory when the hardware limit (255 regs/thread) is exceeded.

    The estimate is used to prune only truly extreme configs that would cause
    excessive spilling or compilation failures.  The threshold is set high
    (``_MAX_REGS_SOFT_LIMIT``) because the heuristic overestimates — it
    doesn't account for MMA fragment packing.  Configs like M=64,N=64,K=64
    (est ~520) work fine in practice via spilling.

    Returns estimated registers per thread.
    """
    # Each thread in a warp holds ~1/32 of the tile elements
    tile_regs = sum(r * c for r, c in tile_sizes) / 32
    scalar_overhead = 40
    return tile_regs + scalar_overhead


# Soft limit for register pressure pruning.  Only prune configs with extreme
# tile products (e.g. M=128,K=256,N=256) that reliably crash on Blackwell.
# Moderate configs (M=64,N=64,K=64, est ~520) work via register spilling.
_MAX_REGS_SOFT_LIMIT = 1024


# =============================================================================
# Forward Kernel: scatter2scatter with fused LoRA
# =============================================================================


@triton.jit
def _compute_expert_block_lora(
    E_idx,
    E_mask,
    M_in_idx,
    N_block,
    N_mask,
    # Base weight
    X_ptr,
    stride_xm,
    stride_xk,
    W_ptr,
    stride_we,
    stride_wk,
    stride_wn,
    # LoRA weights
    A_ptr,
    stride_ar,
    stride_ak,  # A: [r*E, K], stride_ar = stride for r*E dim, stride_ak = stride for K dim
    B_ptr,
    stride_bn,
    stride_br,  # B: [N, r*E], stride_bn = stride for N dim, stride_br = stride for r*E dim
    # Dimensions
    K,
    ACTUAL_R: tl.constexpr,  # True LoRA rank (for indexing into weight arrays)
    acc,
    no_k_mask,
    BLOCK_M: tl.constexpr,
    BLOCK_K: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_R: tl.constexpr,  # Padded tile size >= max(ACTUAL_R, 16)
    scaling,
    allow_tf32: tl.constexpr,
):
    """
    Compute Y_block = X_block @ W_e + scaling * (X_block @ A_e^T) @ B_e^T

    for tokens in this M-block assigned to expert E_idx.

    ACTUAL_R is the true LoRA rank used for indexing into A[e*r:(e+1)*r, :].
    BLOCK_R >= ACTUAL_R is the padded tile dimension (must be >= 16 for tl.dot).
    When BLOCK_R > ACTUAL_R, loads are masked on the R dimension.
    """
    K_block = tl.arange(0, BLOCK_K)
    R_block = tl.arange(0, BLOCK_R)
    R_mask = R_block < ACTUAL_R  # Mask for padding when BLOCK_R > ACTUAL_R

    # Base weight pointers: W[E_idx, :, :] is [K, N], load [BLOCK_K, BLOCK_N]
    X_blk_ptrs = X_ptr + M_in_idx[:, None] * stride_xm + K_block[None, :] * stride_xk
    W_blk_ptrs = (
        W_ptr
        + E_idx * stride_we
        + K_block[:, None] * stride_wk
        + N_block[None, :] * stride_wn
    )

    # LoRA A pointers: A[e*ACTUAL_R:(e+1)*ACTUAL_R, :] for expert e, shape [r, K]
    A_expert_offset = E_idx * ACTUAL_R
    A_blk_ptrs = (
        A_ptr
        + (A_expert_offset + R_block)[:, None] * stride_ar
        + K_block[None, :] * stride_ak
    )

    iters = tl.cdiv(K, BLOCK_K)

    # Accumulator for X @ A^T: [BLOCK_M, BLOCK_R]
    xa_acc = tl.zeros((BLOCK_M, BLOCK_R), dtype=tl.float32)

    # Determine the input element type for consistent casting.
    # Masked tl.load with other=0.0 can upcast bf16->fp32 in some Triton versions,
    # causing dtype mismatches in tl.dot.  We cast all tiles to the same type.
    INPUT_DTYPE = X_ptr.dtype.element_ty

    for i in range(iters):
        if no_k_mask:
            x = tl.load(X_blk_ptrs, mask=E_mask[:, None], other=0.0).to(INPUT_DTYPE)
            w = tl.load(W_blk_ptrs, mask=N_mask[None, :], other=0.0).to(INPUT_DTYPE)
            a = tl.load(A_blk_ptrs, mask=R_mask[:, None], other=0.0).to(INPUT_DTYPE)
        else:
            K_mask = (i * BLOCK_K + K_block) < K
            x = tl.load(
                X_blk_ptrs, mask=E_mask[:, None] & K_mask[None, :], other=0.0
            ).to(INPUT_DTYPE)
            w = tl.load(
                W_blk_ptrs, mask=K_mask[:, None] & N_mask[None, :], other=0.0
            ).to(INPUT_DTYPE)
            a = tl.load(
                A_blk_ptrs, mask=R_mask[:, None] & K_mask[None, :], other=0.0
            ).to(INPUT_DTYPE)

        # Base: acc += X @ W  ([M, K] @ [K, N] -> [M, N])
        acc += tl.dot(x, w, allow_tf32=allow_tf32).to(tl.float32)

        # LoRA: xa_acc += X @ A^T  ([M, K] @ [K, R] -> [M, R])
        xa_acc += tl.dot(x, tl.trans(a), allow_tf32=allow_tf32).to(tl.float32)

        X_blk_ptrs += BLOCK_K * stride_xk
        W_blk_ptrs += BLOCK_K * stride_wk
        A_blk_ptrs += BLOCK_K * stride_ak

    # Epilogue: load B[e] and compute (X @ A^T) @ B^T
    # B[e] is B[:, e*ACTUAL_R:(e+1)*ACTUAL_R], shape [N, r]. Load [BLOCK_N, BLOCK_R].
    B_expert_offset = E_idx * ACTUAL_R
    B_blk_ptrs = (
        B_ptr
        + N_block[:, None] * stride_bn
        + (B_expert_offset + R_block)[None, :] * stride_br
    )
    b = tl.load(
        B_blk_ptrs, mask=N_mask[:, None] & R_mask[None, :], other=0.0
    )  # [BLOCK_N, BLOCK_R]

    # tl.dot requires non-float32 inputs (tensor cores); cast back to input dtype
    b_inp = b.to(INPUT_DTYPE)

    # (X @ A^T) @ B^T: [M, R] @ [R, N] -> [M, N]
    lora_out = tl.dot(xa_acc.to(INPUT_DTYPE), tl.trans(b_inp), allow_tf32=allow_tf32)

    acc += scaling * lora_out
    return acc


def _scatter2scatter_lora_configs():
    """Generate forward kernel autotune configs.

    Search space includes BLOCK_M to allow trading token-tile size for
    larger BLOCK_K/BLOCK_N tiles.  On GPUs with ~99KB SMEM, BLOCK_M=128
    forces BLOCK_K=32 and BLOCK_N=32; BLOCK_M=64 allows BLOCK_K=128
    (4× fewer inner-loop iterations).

    Search space:
      BLOCK_M:    {32, 64, 128}
      BLOCK_N:    {32, 64, 128, 256}
      BLOCK_K:    {32, 64, 128}
      num_warps:  {4, 8}
      num_stages: {3, 4, 5}
    """
    configs = []
    for block_m, block_n, block_k, warps, stages in product(
        [32, 64, 128],  # BLOCK_M
        [32, 64, 128, 256],  # BLOCK_N
        [32, 64, 128],  # BLOCK_K
        [4, 8],  # num_warps
        [3, 4, 5],  # num_stages
    ):
        configs.append(
            triton.Config(
                {"BLOCK_M": block_m, "BLOCK_N": block_n, "BLOCK_K": block_k},
                num_stages=stages,
                num_warps=warps,
            )
        )
    return configs


def _prune_fwd_configs(configs, named_args, **kwargs):
    """Prune forward configs based on SMEM capacity and register pressure.

    The forward kernel inner loop loads three tiles per pipeline stage:
      X[BLOCK_M, BLOCK_K], W[BLOCK_K, BLOCK_N], A[BLOCK_R, BLOCK_K].
    The base estimate only accounts for X and W. We add:
      - A tile [BLOCK_R, BLOCK_K] per pipeline stage (loaded in the inner loop)
      - B tile [BLOCK_N, BLOCK_R] loaded once in the epilogue
      - Extra headroom for compiler overhead (register spills, metadata)
    """
    smem_cap = _get_smem_capacity()

    # Get BLOCK_R from named_args if available, else assume worst case
    block_r = named_args.get("BLOCK_R", 64)

    scored = []
    for config in configs:
        block_m = config.kwargs["BLOCK_M"]
        block_n = config.kwargs["BLOCK_N"]
        block_k = config.kwargs["BLOCK_K"]
        # Base: stages * BLOCK_K * (BLOCK_M + BLOCK_N) + BLOCK_M * BLOCK_N
        smem_base = _estimate_smem_usage(config.num_stages, block_m, block_n, block_k)
        # A tile [BLOCK_R, BLOCK_K] loaded per stage in the inner loop
        smem_lora_loop = config.num_stages * block_r * block_k * 2
        # B tile [BLOCK_N, BLOCK_R] loaded once in epilogue
        smem_lora_epilogue = block_n * block_r * 2
        smem = smem_base + smem_lora_loop + smem_lora_epilogue

        # Register pressure: live tiles are acc[M,N], xa_acc[M,R],
        # x[M,K], w[K,N], a[R,K], plus epilogue b[N,R]
        est_regs = _estimate_register_pressure(
            config.num_warps,
            (block_m, block_n),  # acc
            (block_m, block_r),  # xa_acc
            (block_m, block_k),  # x tile
            (block_k, block_n),  # w tile
            (block_r, block_k),  # a tile
            (block_n, block_r),  # b tile (epilogue)
        )
        if est_regs > _MAX_REGS_SOFT_LIMIT:
            continue

        scored.append((smem, config))

    pruned = [c for s, c in scored if s <= smem_cap - _SMEM_SLACK]
    if pruned:
        return pruned
    if scored:
        # All surviving configs exceed SMEM — return the one with smallest usage
        scored.sort(key=lambda x: x[0])
        return [scored[0][1]]
    # All configs pruned by register pressure — fall back to smallest tiles
    return [
        min(
            configs,
            key=lambda c: (
                c.kwargs["BLOCK_M"] * c.kwargs["BLOCK_N"] * c.kwargs["BLOCK_K"]
            ),
        )
    ]


@triton.autotune(
    configs=_scatter2scatter_lora_configs(),
    key=["M", "N", "K"],
    prune_configs_by={"early_config_prune": _prune_fwd_configs},
)
@triton.heuristics(
    {
        "NO_K_MASK": lambda args: (args["K"] % args["BLOCK_K"]) == 0,
        "NO_N_MASK": lambda args: (args["N"] % args["BLOCK_N"]) == 0,
    }
)
@triton.jit
def _scatter2scatter_lora(
    # Input/Output
    X_ptr,
    stride_xm: tl.constexpr,
    stride_xk: tl.constexpr,
    W_ptr,
    stride_we,
    stride_wk: tl.constexpr,
    stride_wn: tl.constexpr,
    Y_ptr,
    stride_ym: tl.constexpr,
    stride_yn: tl.constexpr,
    # Bias
    Bias_ptr,
    stride_bias_e: tl.constexpr,
    stride_bias_n: tl.constexpr,
    # LoRA weights
    LA_ptr,
    stride_la_r,
    stride_la_k,  # A: [r*E, K]
    LB_ptr,
    stride_lb_n,
    stride_lb_r,  # B: [N, r*E]
    # Routing
    grouped_idx_ptr,
    expert_idxs_ptr,
    # Dimensions
    FAN_OUT: tl.constexpr,
    M,
    K: tl.constexpr,
    N: tl.constexpr,
    E: tl.constexpr,
    ACTUAL_R: tl.constexpr,  # True LoRA rank (for weight indexing)
    # Block sizes
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
    BLOCK_R: tl.constexpr,  # Padded tile size >= max(ACTUAL_R, 16)
    # Config
    ACC_TYPE: tl.constexpr,
    scaling,
    allow_tf32: tl.constexpr,
    x_grouped: tl.constexpr,
    y_grouped: tl.constexpr,
    NO_K_MASK: tl.constexpr,
    NO_N_MASK: tl.constexpr,
):
    """
    Fused scatter2scatter with LoRA: Y = X @ W + scaling * (X @ A^T) @ B^T + bias
    """
    pid = tl.program_id(axis=0)

    N_BLOCK_COUNT = tl.cdiv(N, BLOCK_N)
    M_block_id = pid // N_BLOCK_COUNT
    N_block_id = pid % N_BLOCK_COUNT

    M_block = M_block_id * BLOCK_M + tl.arange(0, BLOCK_M)
    N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
    N_mask = N_block < N
    M_boundary_mask = M_block < (FAN_OUT * M)

    E_idxs = tl.load(expert_idxs_ptr + M_block, mask=M_boundary_mask, other=E)

    no_k_mask = NO_K_MASK

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)

    E_first_idx = tl.min(E_idxs)
    E_last_idx = tl.minimum(tl.max(E_idxs), E - 1)
    M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32)

    for E_idx in range(E_first_idx, E_last_idx + 1):
        E_mask = E_idxs == E_idx
        if x_grouped:
            M_in_idx = M_block
        else:
            M_in_idx = M_idx // FAN_OUT

        acc = _compute_expert_block_lora(
            E_idx,
            E_mask,
            M_in_idx,
            N_block,
            N_mask,
            X_ptr,
            stride_xm,
            stride_xk,
            W_ptr,
            stride_we,
            stride_wk,
            stride_wn,
            LA_ptr,
            stride_la_r,
            stride_la_k,
            LB_ptr,
            stride_lb_n,
            stride_lb_r,
            K,
            ACTUAL_R,
            acc,
            no_k_mask,
            BLOCK_M,
            BLOCK_K,
            BLOCK_N,
            BLOCK_R,
            scaling,
            allow_tf32=allow_tf32,
        )

    # Add bias if present
    if Bias_ptr is not None:
        B_blk_ptrs = (
            Bias_ptr
            + E_idxs[:, None] * stride_bias_e
            + N_block[None, :] * stride_bias_n
        )
        acc += tl.load(B_blk_ptrs, mask=M_boundary_mask[:, None] & N_mask[None, :])

    # Store output
    if y_grouped:
        M_out_idx = M_block
    else:
        M_out_idx = M_idx
    Y_blk_ptrs = Y_ptr + (M_out_idx[:, None] * stride_ym + N_block[None, :] * stride_yn)
    tl.store(Y_blk_ptrs, acc, mask=M_boundary_mask[:, None] & N_mask[None, :])


def _scatter2scatter_lora_split(
    X: torch.Tensor,
    W: torch.Tensor,
    sorted_expert_idxs: torch.Tensor,
    sorted_scattered_idxs: torch.Tensor,
    k: int,
    lora_A: torch.Tensor,
    lora_B: torch.Tensor,
    scaling: float,
    b: Optional[torch.Tensor] = None,
    x_grouped: bool = False,
    y_grouped: bool = False,
    out: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """Split base+LoRA forward: 3 scatter2scatter calls, no fused LoRA kernel.

    Faster for models with few large experts (e.g. Mixtral E=8, I=14336)
    because the base kernel runs at full speed without LoRA SMEM overhead,
    and the LoRA matmuls (R=16) are tiny separate passes.

    Y = scatter(X, W) + scaling * scatter(scatter(X, A^T), B^T)
    """
    from axolotl.integrations.kernels.libs.scattermoe_lora.kernels.ops import (
        scatter2scatter,
    )

    E = W.size(0)
    R = lora_A.size(0) // E
    K = W.size(1)
    N = W.size(2)

    # 1. Base: Y_base = X @ W  (uses base kernel with optimal tile sizes)
    output = scatter2scatter(
        X=X,
        W=W,
        b=b,
        sorted_expert_idxs=sorted_expert_idxs,
        sorted_scattered_idxs=sorted_scattered_idxs,
        k=k,
        x_grouped=x_grouped,
        y_grouped=y_grouped,
        out=out,
    )

    # 2. XA = X @ A^T  (tiny: output is [M*k, R])
    # Reshape A: [R*E, K] → [E, K, R] (expert weights for scatter2scatter)
    W_A = lora_A.reshape(E, R, K).permute(0, 2, 1).contiguous()
    XA = scatter2scatter(
        X=X,
        W=W_A,
        sorted_expert_idxs=sorted_expert_idxs,
        sorted_scattered_idxs=sorted_scattered_idxs,
        k=k,
        x_grouped=x_grouped,
        y_grouped=True,
    )

    # 3. Y_lora = XA @ B^T  (R is tiny, so this is very fast)
    # Reshape B: [N, R*E] → [E, R, N]
    W_B = lora_B.T.reshape(E, R, N).contiguous()
    Y_lora = scatter2scatter(
        X=XA,
        W=W_B,
        sorted_expert_idxs=sorted_expert_idxs,
        sorted_scattered_idxs=sorted_scattered_idxs,
        k=1,
        x_grouped=True,
        y_grouped=y_grouped,
    )

    # 4. Y = Y_base + scaling * Y_lora
    output.add_(Y_lora, alpha=scaling)
    return output


# Threshold for switching from fused to split LoRA forward.
# Split wins when per-expert matmul is large (bandwidth-bound LoRA tile
# loads dominate in the fused kernel's inner loop).
# Empirically: split wins for E<=32 with K*N > 20M (e.g. Mixtral, Phi-MoE).
_SPLIT_LORA_FWD_THRESHOLD = 20_000_000  # per-expert K*N
_SPLIT_LORA_FWD_MAX_EXPERTS = 32


def scatter2scatter_lora(
    X: torch.Tensor,
    W: torch.Tensor,
    sorted_expert_idxs: torch.Tensor,
    sorted_scattered_idxs: torch.Tensor,
    k: int,
    lora_A: torch.Tensor,
    lora_B: torch.Tensor,
    scaling: float,
    b: Optional[torch.Tensor] = None,
    x_grouped: bool = False,
    y_grouped: bool = False,
    out: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """
    Scatter2scatter with LoRA: Y[i] = X[i] @ W[e] + scaling * (X[i] @ A[e]^T) @ B[e]^T + b[e]

    Automatically selects between:
    - Fused kernel: single Triton kernel with LoRA in the inner loop.
      Best for many small experts (E>=64, small K*N).
    - Split dispatch: 3 separate scatter2scatter calls (base + XA + lora).
      Best for few large experts (E<=32, large K*N like Mixtral).

    Args:
        X: Input [M, K] or [M*k, K] if x_grouped
        W: Expert weights [E, K, N]
        sorted_expert_idxs: Expert assignments sorted [M*k]
        sorted_scattered_idxs: Original indices sorted [M*k]
        k: Fan-out (top-k)
        lora_A: LoRA A weights [r*E, K]
        lora_B: LoRA B weights [N, r*E]
        scaling: LoRA scaling factor (alpha/r)
        b: Optional bias [E, N]
        x_grouped: Input pre-grouped by expert
        y_grouped: Keep output grouped
        out: Optional pre-allocated output buffer

    Returns:
        Y: Output [M*k, N]
    """
    E = W.size(0)
    K = W.size(1)
    N = W.size(2)

    # Dispatch: split for few large experts, fused for many small experts
    if E <= _SPLIT_LORA_FWD_MAX_EXPERTS and K * N >= _SPLIT_LORA_FWD_THRESHOLD:
        return _scatter2scatter_lora_split(
            X,
            W,
            sorted_expert_idxs,
            sorted_scattered_idxs,
            k,
            lora_A,
            lora_B,
            scaling,
            b,
            x_grouped,
            y_grouped,
            out,
        )

    assert sorted_scattered_idxs.size(0) == sorted_expert_idxs.size(0)
    assert sorted_scattered_idxs.size(0) == X.size(0) * k

    R = lora_A.size(0) // E

    # Pad R to power of 2 for Triton tile size
    BLOCK_R = _block_r_for_rank(R)

    L_scattered = sorted_expert_idxs.size(0)

    if out is None:
        output = torch.empty((L_scattered, N), device=X.device, dtype=X.dtype)
    else:
        assert out.size(0) == L_scattered and out.size(1) == N
        output = out

    def grid(META):
        return (
            triton.cdiv(L_scattered, META["BLOCK_M"]) * triton.cdiv(N, META["BLOCK_N"]),
        )

    if b is None:
        stride_be = stride_bn = 0
        b_ptr = None
    else:
        stride_be, stride_bn = b.stride()
        b_ptr = b

    _scatter2scatter_lora[grid](
        X,
        X.stride(0),
        X.stride(1),
        W,
        W.stride(0),
        W.stride(1),
        W.stride(2),
        output,
        output.stride(0),
        output.stride(1),
        b_ptr,
        stride_be,
        stride_bn,
        lora_A,
        lora_A.stride(0),
        lora_A.stride(1),
        lora_B,
        lora_B.stride(0),
        lora_B.stride(1),
        sorted_scattered_idxs,
        sorted_expert_idxs,
        FAN_OUT=k,
        M=X.size(0),
        K=K,
        N=N,
        E=E,
        ACTUAL_R=R,
        BLOCK_R=BLOCK_R,
        ACC_TYPE=tl.float32,
        scaling=scaling,
        allow_tf32=ALLOW_TF32,
        x_grouped=x_grouped,
        y_grouped=y_grouped,
    )

    return output


# =============================================================================
# Backward Kernel: Fused dX = dY @ W^T + scaling * (dY @ B) @ A
# =============================================================================


@triton.jit
def _compute_expert_block_lora_dX(
    E_idx,
    E_mask,
    M_in_idx,
    K_block,
    K_mask,
    # Input: DY (gradient w.r.t. output)
    DY_ptr,
    stride_dym,
    stride_dyn,
    # Base weight W^T: we load W[e] as [K, N] and index as W^T[e] = [N, K]
    W_ptr,
    stride_we,
    stride_wk,
    stride_wn,
    # LoRA weights
    A_ptr,
    stride_ar,
    stride_ak,  # A: [r*E, K]
    B_ptr,
    stride_bn,
    stride_br,  # B: [N, r*E]
    # Dimensions
    N,
    ACTUAL_R: tl.constexpr,
    acc,
    no_n_mask,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
    BLOCK_R: tl.constexpr,
    scaling,
    allow_tf32: tl.constexpr,
):
    """
    Compute dX_block = DY_block @ W_e^T + scaling * (DY_block @ B_e) @ A_e

    for tokens in this M-block assigned to expert E_idx.

    Inner loop over N dimension (reduction dim for dY @ W^T and dY @ B).
    Output dimension is K.
    Epilogue computes (dY @ B) @ A.

    Transpose mapping from forward:
      Forward: X@W (K-loop), X@A^T (K-loop), (X@A^T)@B^T (epilogue)
      Backward: DY@W^T (N-loop), DY@B (N-loop), (DY@B)@A (epilogue)
    """
    N_block = tl.arange(0, BLOCK_N)
    R_block = tl.arange(0, BLOCK_R)
    R_mask = R_block < ACTUAL_R

    # DY pointers: DY is [M_total, N], load [BLOCK_M, BLOCK_N]
    DY_blk_ptrs = (
        DY_ptr + M_in_idx[:, None] * stride_dym + N_block[None, :] * stride_dyn
    )

    # W^T pointers: W[e] is [K, N], W^T[e] is [N, K]. We load W^T as [BLOCK_N, BLOCK_K].
    # W stored as [E, K, N], so W^T[e][n, k] = W[e][k, n] = W_ptr + e*stride_we + k*stride_wk + n*stride_wn
    # As [BLOCK_N, BLOCK_K] tile: row=n, col=k
    WT_blk_ptrs = (
        W_ptr
        + E_idx * stride_we
        + N_block[:, None] * stride_wn  # row = n dimension
        + K_block[None, :] * stride_wk
    )  # col = k dimension

    # B pointers: B[e] is B[:, e*R:(e+1)*R], shape [N, R]. Load [BLOCK_N, BLOCK_R].
    B_expert_offset = E_idx * ACTUAL_R
    B_blk_ptrs = (
        B_ptr
        + N_block[:, None] * stride_bn
        + (B_expert_offset + R_block)[None, :] * stride_br
    )

    iters = tl.cdiv(N, BLOCK_N)

    # Accumulator for DY @ B: [BLOCK_M, BLOCK_R]
    dy_b_acc = tl.zeros((BLOCK_M, BLOCK_R), dtype=tl.float32)

    # Determine the input element type for consistent casting.
    INPUT_DTYPE = DY_ptr.dtype.element_ty

    for i in range(iters):
        if no_n_mask:
            dy = tl.load(DY_blk_ptrs, mask=E_mask[:, None], other=0.0).to(INPUT_DTYPE)
            wt = tl.load(WT_blk_ptrs, mask=K_mask[None, :], other=0.0).to(INPUT_DTYPE)
            b = tl.load(B_blk_ptrs, mask=R_mask[None, :], other=0.0).to(INPUT_DTYPE)
        else:
            N_mask_iter = (i * BLOCK_N + N_block) < N
            dy = tl.load(
                DY_blk_ptrs, mask=E_mask[:, None] & N_mask_iter[None, :], other=0.0
            ).to(INPUT_DTYPE)
            wt = tl.load(
                WT_blk_ptrs, mask=N_mask_iter[:, None] & K_mask[None, :], other=0.0
            ).to(INPUT_DTYPE)
            b = tl.load(
                B_blk_ptrs, mask=N_mask_iter[:, None] & R_mask[None, :], other=0.0
            ).to(INPUT_DTYPE)

        # Base: acc += DY @ W^T  ([M, N] @ [N, K] -> [M, K])
        acc += tl.dot(dy, wt, allow_tf32=allow_tf32).to(tl.float32)

        # LoRA: dy_b_acc += DY @ B  ([M, N] @ [N, R] -> [M, R])
        dy_b_acc += tl.dot(dy, b, allow_tf32=allow_tf32).to(tl.float32)

        DY_blk_ptrs += BLOCK_N * stride_dyn
        WT_blk_ptrs += BLOCK_N * stride_wn
        B_blk_ptrs += BLOCK_N * stride_bn

    # Epilogue: load A[e] and compute (DY @ B) @ A
    # A[e] is A[e*R:(e+1)*R, :], shape [R, K]. Load [BLOCK_R, BLOCK_K].
    A_expert_offset = E_idx * ACTUAL_R
    A_blk_ptrs = (
        A_ptr
        + (A_expert_offset + R_block)[:, None] * stride_ar
        + K_block[None, :] * stride_ak
    )
    a_e = tl.load(A_blk_ptrs, mask=R_mask[:, None] & K_mask[None, :], other=0.0).to(
        INPUT_DTYPE
    )

    # (DY @ B) @ A: [M, R] @ [R, K] -> [M, K]
    # tl.dot requires non-float32 inputs (tensor cores); cast accumulator back to input dtype
    lora_dx = tl.dot(dy_b_acc.to(INPUT_DTYPE), a_e, allow_tf32=allow_tf32)

    acc += scaling * lora_dx
    return acc


def _scatter2scatter_lora_dX_configs():
    """Generate backward dX kernel autotune configs.

    The inner loop is over N (not K as in forward). The output dimension is K.
    So BLOCK_K tiles the output and BLOCK_N tiles the reduction.

    BLOCK_M is now autotunable (was fixed at 128).

    Search space:
      BLOCK_M:    {32, 64, 128}        (token tile)
      BLOCK_K:    {32, 64, 128, 256}   (output tile)
      BLOCK_N:    {32, 64, 128, 256}   (reduction tile)
      num_warps:  {4, 8}
      num_stages: {3, 4, 5}
    """
    configs = []
    for block_m, block_k, block_n, warps, stages in product(
        [32, 64, 128],  # BLOCK_M
        [32, 64, 128, 256],  # BLOCK_K (output dimension)
        [32, 64, 128, 256],  # BLOCK_N (reduction dimension)
        [4, 8],  # num_warps
        [3, 4, 5],  # num_stages
    ):
        configs.append(
            triton.Config(
                {"BLOCK_M": block_m, "BLOCK_K": block_k, "BLOCK_N": block_n},
                num_stages=stages,
                num_warps=warps,
            )
        )
    return configs


def _prune_dX_configs(configs, named_args, **kwargs):
    """Prune backward dX configs based on SMEM capacity and register pressure.

    The dX kernel inner loop loads three tiles per pipeline stage:
      DY[BLOCK_M, BLOCK_N], W^T[BLOCK_N, BLOCK_K], B[BLOCK_N, BLOCK_R].
    The base estimate only accounts for DY and W^T. We add:
      - B tile [BLOCK_N, BLOCK_R] per pipeline stage (loaded in the inner loop)
      - A tile [BLOCK_R, BLOCK_K] loaded once in the epilogue
      - Extra headroom for compiler overhead (register spills, metadata)
    """
    smem_cap = _get_smem_capacity()

    # Get BLOCK_R from named_args if available, else assume worst case
    block_r = named_args.get("BLOCK_R", 64)

    scored = []
    for config in configs:
        block_m = config.kwargs["BLOCK_M"]
        block_k = config.kwargs["BLOCK_K"]
        block_n = config.kwargs["BLOCK_N"]
        # Base: stages * BLOCK_N * (BLOCK_M + BLOCK_K) + BLOCK_M * BLOCK_K
        smem_base = _estimate_smem_usage(config.num_stages, block_m, block_k, block_n)
        # B tile [BLOCK_N, BLOCK_R] loaded per stage in the inner loop
        smem_lora_loop = config.num_stages * block_n * block_r * 2
        # A tile [BLOCK_R, BLOCK_K] loaded once in epilogue
        smem_lora_epilogue = block_r * block_k * 2
        smem = smem_base + smem_lora_loop + smem_lora_epilogue

        # Register pressure: live tiles are acc[M,K], dy_b_acc[M,R],
        # dy[M,N], wt[N,K], b[N,R], plus epilogue a[R,K]
        est_regs = _estimate_register_pressure(
            config.num_warps,
            (block_m, block_k),  # acc
            (block_m, block_r),  # dy_b_acc
            (block_m, block_n),  # dy tile
            (block_n, block_k),  # wt tile
            (block_n, block_r),  # b tile
            (block_r, block_k),  # a tile (epilogue)
        )
        if est_regs > _MAX_REGS_SOFT_LIMIT:
            continue

        scored.append((smem, config))

    pruned = [c for s, c in scored if s <= smem_cap - _SMEM_SLACK]
    if pruned:
        return pruned
    if scored:
        # All surviving configs exceed SMEM — return the one with smallest usage
        scored.sort(key=lambda x: x[0])
        return [scored[0][1]]
    # All configs pruned by register pressure — fall back to smallest tiles
    return [
        min(
            configs,
            key=lambda c: (
                c.kwargs["BLOCK_M"] * c.kwargs["BLOCK_K"] * c.kwargs["BLOCK_N"]
            ),
        )
    ]


@triton.autotune(
    configs=_scatter2scatter_lora_dX_configs(),
    key=["M", "N", "K"],
    prune_configs_by={"early_config_prune": _prune_dX_configs},
)
@triton.heuristics(
    {
        "NO_K_MASK": lambda args: (args["K"] % args["BLOCK_K"]) == 0,
        "NO_N_MASK": lambda args: (args["N"] % args["BLOCK_N"]) == 0,
    }
)
@triton.jit
def _scatter2scatter_lora_dX(
    # Input: DY (gradient w.r.t. output, grouped)
    DY_ptr,
    stride_dym: tl.constexpr,
    stride_dyn: tl.constexpr,
    # Base weight: W [E, K, N] (we compute DY @ W^T)
    W_ptr,
    stride_we,
    stride_wk: tl.constexpr,
    stride_wn: tl.constexpr,
    # Output: dX
    DX_ptr,
    stride_dxm: tl.constexpr,
    stride_dxk: tl.constexpr,
    # LoRA weights
    LA_ptr,
    stride_la_r,
    stride_la_k,  # A: [r*E, K]
    LB_ptr,
    stride_lb_n,
    stride_lb_r,  # B: [N, r*E]
    # Routing
    grouped_idx_ptr,
    expert_idxs_ptr,
    # Dimensions
    FAN_OUT: tl.constexpr,
    M,
    K: tl.constexpr,
    N: tl.constexpr,
    E: tl.constexpr,
    ACTUAL_R: tl.constexpr,
    # Block sizes
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
    BLOCK_R: tl.constexpr,
    # Config
    ACC_TYPE: tl.constexpr,
    scaling,
    allow_tf32: tl.constexpr,
    dy_grouped: tl.constexpr,
    dx_grouped: tl.constexpr,
    NO_K_MASK: tl.constexpr,
    NO_N_MASK: tl.constexpr,
):
    """
    Fused backward dX = DY @ W^T + scaling * (DY @ B) @ A

    DY is in expert-grouped order (x_grouped=True).
    dX is output in ungrouped or grouped order based on dx_grouped.

    Grid: (cdiv(M_total, BLOCK_M) * cdiv(K, BLOCK_K),)
    """
    pid = tl.program_id(axis=0)

    K_BLOCK_COUNT = tl.cdiv(K, BLOCK_K)
    M_block_id = pid // K_BLOCK_COUNT
    K_block_id = pid % K_BLOCK_COUNT

    M_block = M_block_id * BLOCK_M + tl.arange(0, BLOCK_M)
    K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
    K_mask = K_block < K
    M_boundary_mask = M_block < (FAN_OUT * M)

    E_idxs = tl.load(expert_idxs_ptr + M_block, mask=M_boundary_mask, other=E)

    no_n_mask = NO_N_MASK

    acc = tl.zeros((BLOCK_M, BLOCK_K), dtype=ACC_TYPE)

    E_first_idx = tl.min(E_idxs)
    E_last_idx = tl.minimum(tl.max(E_idxs), E - 1)
    M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32)

    for E_idx in range(E_first_idx, E_last_idx + 1):
        E_mask = E_idxs == E_idx
        if dy_grouped:
            M_in_idx = M_block
        else:
            M_in_idx = M_idx // FAN_OUT

        acc = _compute_expert_block_lora_dX(
            E_idx,
            E_mask,
            M_in_idx,
            K_block,
            K_mask,
            DY_ptr,
            stride_dym,
            stride_dyn,
            W_ptr,
            stride_we,
            stride_wk,
            stride_wn,
            LA_ptr,
            stride_la_r,
            stride_la_k,
            LB_ptr,
            stride_lb_n,
            stride_lb_r,
            N,
            ACTUAL_R,
            acc,
            no_n_mask,
            BLOCK_M,
            BLOCK_N,
            BLOCK_K,
            BLOCK_R,
            scaling,
            allow_tf32=allow_tf32,
        )

    # Store output
    if dx_grouped:
        M_out_idx = M_block
    else:
        M_out_idx = M_idx
    DX_blk_ptrs = DX_ptr + (
        M_out_idx[:, None] * stride_dxm + K_block[None, :] * stride_dxk
    )
    tl.store(DX_blk_ptrs, acc, mask=M_boundary_mask[:, None] & K_mask[None, :])


def scatter2scatter_lora_dX(
    DY: torch.Tensor,
    W: torch.Tensor,
    sorted_expert_idxs: torch.Tensor,
    sorted_scattered_idxs: torch.Tensor,
    k: int,
    lora_A: torch.Tensor,
    lora_B: torch.Tensor,
    scaling: float,
    dy_grouped: bool = True,
    dx_grouped: bool = False,
    out: Optional[torch.Tensor] = None,
) -> torch.Tensor:
    """
    Fused backward dX = DY @ W^T + scaling * (DY @ B) @ A

    Replaces the separate:
      1. base_ops.scatter2scatter(DY, W^T, x_grouped=True, ...)
      2. _compute_lora_input_grad(DY, A, B, ...)

    Args:
        DY: Gradient w.r.t. output [M*k, N] (grouped by expert)
        W: Expert weights [E, K, N] (NOT transposed — kernel handles W^T internally)
        sorted_expert_idxs: Expert assignments sorted [M*k]
        sorted_scattered_idxs: Original indices sorted [M*k]
        k: Fan-out (top-k)
        lora_A: LoRA A weights [r*E, K]
        lora_B: LoRA B weights [N, r*E]
        scaling: LoRA scaling factor
        dy_grouped: Whether DY is in grouped (expert-sorted) order (default True)
        dx_grouped: Whether to output dX in grouped order (default False)
        out: Optional pre-allocated output buffer

    Returns:
        dX: Input gradient [M*k, K]
    """
    assert sorted_scattered_idxs.size(0) == sorted_expert_idxs.size(0)

    E = W.size(0)
    K = W.size(1)
    N = W.size(2)
    R = lora_A.size(0) // E

    BLOCK_R = _block_r_for_rank(R)

    L_scattered = sorted_expert_idxs.size(0)

    # M for the kernel is DY.size(0) when dy_grouped, else the original M
    if dy_grouped:
        M = DY.size(0)
        fan_out = 1  # DY is already expanded
    else:
        M = DY.size(0)
        fan_out = k

    if out is None:
        output = torch.empty((L_scattered, K), device=DY.device, dtype=DY.dtype)
    else:
        assert out.size(0) == L_scattered and out.size(1) == K
        output = out

    def grid(META):
        return (
            triton.cdiv(L_scattered, META["BLOCK_M"]) * triton.cdiv(K, META["BLOCK_K"]),
        )

    _scatter2scatter_lora_dX[grid](
        DY,
        DY.stride(0),
        DY.stride(1),
        W,
        W.stride(0),
        W.stride(1),
        W.stride(2),
        output,
        output.stride(0),
        output.stride(1),
        lora_A,
        lora_A.stride(0),
        lora_A.stride(1),
        lora_B,
        lora_B.stride(0),
        lora_B.stride(1),
        sorted_scattered_idxs,
        sorted_expert_idxs,
        FAN_OUT=fan_out,
        M=M,
        K=K,
        N=N,
        E=E,
        ACTUAL_R=R,
        # BLOCK_M is autotuned (injected by triton.autotune from Config kwargs)
        BLOCK_R=BLOCK_R,
        ACC_TYPE=tl.float32,
        scaling=scaling,
        allow_tf32=ALLOW_TF32,
        dy_grouped=dy_grouped,
        dx_grouped=dx_grouped,
    )

    return output


# =============================================================================
# Backward Kernel: LoRA gradient computation (dA, dB)
# =============================================================================


def _group_bwd_lora_configs():
    """Generate backward (dA/dB) kernel autotune configs.

    Search space includes smaller tile sizes and fewer pipeline stages to
    support GPUs with limited shared memory (e.g. ~99KB on some GPUs).

    Search space:
      BLOCK_M:    {32, 64, 128, 256}   (token-loop tile)
      BLOCK_K:    {32, 64, 128, 256}
      BLOCK_N:    {32, 64, 128, 256}
      num_warps:  {4, 8}
      num_stages: {3, 4, 5}

    The backward kernel also uses BLOCK_R (from LoRA rank), but that is
    determined by the rank and not autotunable.
    """
    configs = []
    for block_m, block_k, block_n, warps, stages in product(
        [32, 64, 128, 256],  # BLOCK_M
        [32, 64, 128, 256],  # BLOCK_K
        [32, 64, 128, 256],  # BLOCK_N
        [4, 8],  # num_warps
        [3, 4, 5],  # num_stages
    ):
        configs.append(
            triton.Config(
                {"BLOCK_M": block_m, "BLOCK_K": block_k, "BLOCK_N": block_n},
                num_stages=stages,
                num_warps=warps,
            )
        )
    return configs


def _prune_bwd_lora_configs(configs, named_args, **kwargs):
    """Prune backward configs based on SMEM capacity and register pressure.

    The backward kernel loads X[BLOCK_M, BLOCK_K] and DY[BLOCK_M, BLOCK_N]
    in the inner loop, plus holds A[BLOCK_R, BLOCK_K] and B[BLOCK_N, BLOCK_R]
    for the full expert. We estimate SMEM based on the dominant terms.
    """
    smem_cap = _get_smem_capacity()
    block_r = named_args.get("BLOCK_R", 64)

    scored = []
    for config in configs:
        block_m = config.kwargs["BLOCK_M"]
        block_k = config.kwargs["BLOCK_K"]
        block_n = config.kwargs["BLOCK_N"]
        # Inner loop loads X[M,K] and DY[M,N], pipeline over M iterations
        smem_base = _estimate_smem_usage(config.num_stages, block_m, block_n, block_k)
        # A[BLOCK_R, BLOCK_K] and B[BLOCK_N, BLOCK_R] held for the full expert
        smem_lora = (block_r * block_k + block_n * block_r) * 2
        smem = smem_base + smem_lora

        # Register pressure: dA_acc[R,K], dB_acc[N,R], x[M,K], dy[M,N],
        # a[R,K], b[N,R], xa[M,R], dy_b[M,R]
        est_regs = _estimate_register_pressure(
            config.num_warps,
            (block_r, block_k),  # dA_acc
            (block_n, block_r),  # dB_acc
            (block_m, block_k),  # x tile
            (block_m, block_n),  # dy tile
            (block_r, block_k),  # a tile
            (block_n, block_r),  # b tile
            (block_m, block_r),  # xa intermediate
        )
        if est_regs > _MAX_REGS_SOFT_LIMIT:
            continue

        scored.append((smem, config))

    pruned = [c for s, c in scored if s <= smem_cap - _SMEM_SLACK]
    if pruned:
        return pruned
    if scored:
        # All surviving configs exceed SMEM — return the one with smallest usage
        scored.sort(key=lambda x: x[0])
        return [scored[0][1]]
    # All configs pruned by register pressure — fall back to smallest tiles
    return [
        min(
            configs,
            key=lambda c: (
                c.kwargs["BLOCK_M"] * c.kwargs["BLOCK_K"] * c.kwargs["BLOCK_N"]
            ),
        )
    ]


@triton.autotune(
    configs=_group_bwd_lora_configs(),
    key=["M", "N", "K"],
    prune_configs_by={"early_config_prune": _prune_bwd_lora_configs},
    reset_to_zero=["DLA_ptr", "DLB_ptr"],
)
@triton.heuristics(
    {
        "NO_K_MASK": lambda args: (args["K"] % args["BLOCK_K"]) == 0,
        "NO_N_MASK": lambda args: (args["N"] % args["BLOCK_N"]) == 0,
    }
)
@triton.jit
def _group_bwd_lora(
    # Inputs
    DY_ptr,
    stride_dym,
    stride_dyn,
    X_ptr,
    stride_xm,
    stride_xk,
    # LoRA weights (needed for cross-terms)
    LA_ptr,
    stride_la_r,
    stride_la_k,  # A: [r*E, K]
    LB_ptr,
    stride_lb_n,
    stride_lb_r,  # B: [N, r*E]
    # Gradient outputs
    DLA_ptr,
    stride_dla_r,
    stride_dla_k,
    DLB_ptr,
    stride_dlb_n,
    stride_dlb_r,
    # Expert offsets
    expert_offsets_ptr,
    # Dimensions
    M,
    K: tl.constexpr,
    N: tl.constexpr,
    ACTUAL_R: tl.constexpr,  # True LoRA rank (for weight indexing)
    BLOCK_R: tl.constexpr,  # Padded tile size >= max(ACTUAL_R, 16)
    scaling,
    # Block sizes
    BLOCK_M: tl.constexpr,
    BLOCK_K: tl.constexpr,
    BLOCK_N: tl.constexpr,
    ACC_TYPE: tl.constexpr,
    allow_tf32: tl.constexpr,
    NO_K_MASK: tl.constexpr,
    NO_N_MASK: tl.constexpr,
):
    """
    Compute LoRA gradients for each expert on grouped data.

    Grid: (E * cdiv(K, BLOCK_K), cdiv(N, BLOCK_N))

    For expert e:
      dA[e] = scaling * (dY @ B[e])^T @ X   -> [r, K], accumulate over M tokens
      dB[e] = scaling * dY^T @ (X @ A[e]^T)  -> [N, r], accumulate over M tokens

    ACTUAL_R is the true LoRA rank. BLOCK_R >= ACTUAL_R is padded for tl.dot min size.
    """
    pid0 = tl.program_id(axis=0)
    pid1 = tl.program_id(axis=1)

    K_BLOCK_COUNT = tl.cdiv(K, BLOCK_K)
    E_idx = pid0 // K_BLOCK_COUNT
    K_block_id = pid0 % K_BLOCK_COUNT
    N_block_id = pid1

    # Get expert's token range from cumulative offsets
    if E_idx == 0:
        start_idx = 0
    else:
        start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
    end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)
    num_tokens = end_idx - start_idx

    if num_tokens > 0:
        M_block = tl.arange(0, BLOCK_M)
        K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
        K_mask = K_block < K
        N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
        N_mask = N_block < N
        R_block = tl.arange(0, BLOCK_R)
        R_mask = R_block < ACTUAL_R  # Mask for padding

        lora_offset = E_idx * ACTUAL_R

        # Determine input element type for consistent casting.
        INPUT_DTYPE = X_ptr.dtype.element_ty

        # Load B[e]: [BLOCK_N, BLOCK_R] (masked on R and N, other=0 for padding)
        B_blk_ptrs = (
            LB_ptr
            + N_block[:, None] * stride_lb_n
            + (lora_offset + R_block)[None, :] * stride_lb_r
        )
        b_e = tl.load(B_blk_ptrs, mask=N_mask[:, None] & R_mask[None, :], other=0.0).to(
            INPUT_DTYPE
        )

        # Load A[e]: [BLOCK_R, BLOCK_K] (masked on R and K, other=0 for padding)
        A_blk_ptrs = (
            LA_ptr
            + (lora_offset + R_block)[:, None] * stride_la_r
            + K_block[None, :] * stride_la_k
        )
        a_e = tl.load(A_blk_ptrs, mask=R_mask[:, None] & K_mask[None, :], other=0.0).to(
            INPUT_DTYPE
        )

        # Accumulators
        dA_acc = tl.zeros((BLOCK_R, BLOCK_K), dtype=ACC_TYPE)
        dB_acc = tl.zeros((BLOCK_N, BLOCK_R), dtype=ACC_TYPE)

        iters = tl.cdiv(num_tokens, BLOCK_M)
        for i in range(iters):
            M_idx = start_idx + i * BLOCK_M + M_block
            M_mask = M_idx < end_idx

            # Load X: [BLOCK_M, BLOCK_K]
            X_blk_ptrs = (
                X_ptr + M_idx[:, None] * stride_xm + K_block[None, :] * stride_xk
            )
            x = tl.load(
                X_blk_ptrs, mask=M_mask[:, None] & K_mask[None, :], other=0.0
            ).to(INPUT_DTYPE)

            # Load dY: [BLOCK_M, BLOCK_N]
            DY_blk_ptrs = (
                DY_ptr + M_idx[:, None] * stride_dym + N_block[None, :] * stride_dyn
            )
            dy = tl.load(
                DY_blk_ptrs, mask=M_mask[:, None] & N_mask[None, :], other=0.0
            ).to(INPUT_DTYPE)

            # X @ A[e]^T: [M, K] @ [K, R] -> [M, R]
            xa = tl.dot(x, tl.trans(a_e), allow_tf32=allow_tf32)

            # dY @ B[e]: [M, N] @ [N, R] -> [M, R]
            dy_b = tl.dot(dy, b_e, allow_tf32=allow_tf32)

            # Cast intermediates to input dtype for subsequent tl.dot calls
            # (tl.dot requires both operands to have the same dtype)
            dy_b_cast = dy_b.to(INPUT_DTYPE)
            xa_cast = xa.to(INPUT_DTYPE)

            # dA += (dY @ B)^T @ X: [R, M] @ [M, K] -> [R, K]
            dA_acc += tl.dot(tl.trans(dy_b_cast), x, allow_tf32=allow_tf32)

            # dB += dY^T @ (X @ A^T): [N, M] @ [M, R] -> [N, R]
            dB_acc += tl.dot(tl.trans(dy), xa_cast, allow_tf32=allow_tf32)

        # Store dA with scaling (atomic add since multiple N_blocks contribute)
        # Only store the actual R rows, not the padded ones
        DLA_blk_ptrs = (
            DLA_ptr
            + (lora_offset + R_block)[:, None] * stride_dla_r
            + K_block[None, :] * stride_dla_k
        )
        tl.atomic_add(
            DLA_blk_ptrs,
            (dA_acc * scaling).to(DLA_ptr.dtype.element_ty),
            mask=R_mask[:, None] & K_mask[None, :],
        )

        # Store dB with scaling (atomic add since multiple K_blocks contribute)
        DLB_blk_ptrs = (
            DLB_ptr
            + N_block[:, None] * stride_dlb_n
            + (lora_offset + R_block)[None, :] * stride_dlb_r
        )
        tl.atomic_add(
            DLB_blk_ptrs,
            (dB_acc * scaling).to(DLB_ptr.dtype.element_ty),
            mask=N_mask[:, None] & R_mask[None, :],
        )


def _group_bwd_split_configs():
    """Autotune configs for split dA/dB kernels."""
    configs = []
    for block_m, block_dim, warps, stages in product(
        [32, 64, 128],  # BLOCK_M (token tile)
        [32, 64, 128, 256],  # BLOCK_DIM (K for dA, N for dB — output tile)
        [4, 8],  # num_warps
        [3, 4, 5],  # num_stages
    ):
        configs.append(
            triton.Config(
                {"BLOCK_M": block_m, "BLOCK_DIM": block_dim},
                num_stages=stages,
                num_warps=warps,
            )
        )
    return configs


def _prune_split_configs(configs, named_args, **kwargs):
    """Prune split kernel configs based on SMEM capacity and register pressure."""
    smem_cap = _get_smem_capacity()
    block_r = named_args.get("BLOCK_R", 64)

    # Fixed inner tile for reduction dimension
    BLOCK_INNER = 64

    pruned = []
    for config in configs:
        block_m = config.kwargs["BLOCK_M"]
        block_dim = config.kwargs["BLOCK_DIM"]
        # Inner loop loads: input[M, INNER] and other[M, INNER_or_DIM]
        smem = config.num_stages * BLOCK_INNER * (block_m + block_dim) * 2
        # LoRA weights held in registers: [INNER, R] or [R, DIM]
        smem += (block_r * max(block_dim, BLOCK_INNER)) * 2

        # Register pressure check
        est_regs = _estimate_register_pressure(
            config.num_warps,
            (block_r, block_dim),  # acc
            (block_m, BLOCK_INNER),  # input tile
            (block_m, block_dim),  # other tile
            (block_r, BLOCK_INNER),  # lora weight
        )
        if est_regs > _MAX_REGS_SOFT_LIMIT:
            continue

        if smem <= smem_cap - _SMEM_SLACK:
            pruned.append(config)

    if pruned:
        return pruned
    configs.sort(key=lambda c: c.kwargs["BLOCK_M"] * c.kwargs["BLOCK_DIM"])
    return [configs[0]]


@triton.autotune(
    configs=_group_bwd_split_configs(),
    key=["M", "K", "N"],
    prune_configs_by={"early_config_prune": _prune_split_configs},
)
@triton.heuristics(
    {
        "NO_DIM_MASK": lambda args: (
            (args["K"] % args["BLOCK_DIM"]) == 0
            if args["COMPUTE_DA"]
            else (args["N"] % args["BLOCK_DIM"]) == 0
        ),
    }
)
@triton.jit
def _group_bwd_lora_split(
    # Data tensors (DY and X are always present)
    DY_ptr,
    stride_dym,
    stride_dyn,
    X_ptr,
    stride_xm,
    stride_xk,
    # LoRA weight for the inner reduction (B for dA, A for dB)
    LW_ptr,
    stride_lw0,
    stride_lw1,
    # Output gradient tensor (dA or dB)
    OUT_ptr,
    stride_out0,
    stride_out1,
    # Expert offsets
    expert_offsets_ptr,
    # Dimensions
    M,
    K: tl.constexpr,
    N: tl.constexpr,
    ACTUAL_R: tl.constexpr,
    BLOCK_R: tl.constexpr,
    INNER_DIM: tl.constexpr,  # reduction dimension (N for dA, K for dB)
    scaling,
    # Mode flag
    COMPUTE_DA: tl.constexpr,  # True = compute dA, False = compute dB
    # Tile sizes
    BLOCK_M: tl.constexpr,
    BLOCK_DIM: tl.constexpr,
    ACC_TYPE: tl.constexpr,
    allow_tf32: tl.constexpr,
    NO_DIM_MASK: tl.constexpr,
):
    """
    Unified split kernel for LoRA gradient computation.

    When COMPUTE_DA=True:
      dA[e] = scaling * (dY @ B[e])^T @ X  →  [R, K]
      Grid: (E, cdiv(K, BLOCK_DIM))
      - outer_ptr/stride = X (read [M, K_block])
      - inner reduction over N using DY and B
      - output shape [BLOCK_R, BLOCK_DIM]

    When COMPUTE_DA=False:
      dB[e] = scaling * dY^T @ (X @ A[e]^T)  →  [N, R]
      Grid: (E, cdiv(N, BLOCK_DIM))
      - outer_ptr/stride = DY (read [M, N_block])
      - inner reduction over K using X and A
      - output shape [BLOCK_DIM, BLOCK_R]

    No atomic adds — each (E, dim_block) pair is written by exactly one block.
    """
    E_idx = tl.program_id(0)
    dim_block_id = tl.program_id(1)

    if E_idx == 0:
        start_idx = 0
    else:
        start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
    end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)
    num_tokens = end_idx - start_idx

    # Output dimension tile (K for dA, N for dB)
    if COMPUTE_DA:
        OUT_DIM: tl.constexpr = K  # type: ignore[no-redef]
    else:
        OUT_DIM: tl.constexpr = N  # type: ignore[no-redef]
    dim_block = dim_block_id * BLOCK_DIM + tl.arange(0, BLOCK_DIM)
    dim_mask = dim_block < OUT_DIM
    R_block = tl.arange(0, BLOCK_R)
    R_mask = R_block < ACTUAL_R
    lora_offset = E_idx * ACTUAL_R

    # Output pointers — layout differs: dA is [R, K], dB is [N, R]
    if COMPUTE_DA:
        out_blk_ptrs = (
            OUT_ptr
            + (lora_offset + R_block)[:, None] * stride_out0
            + dim_block[None, :] * stride_out1
        )
        out_mask = R_mask[:, None] & dim_mask[None, :]
    else:
        out_blk_ptrs = (
            OUT_ptr
            + dim_block[:, None] * stride_out0
            + (lora_offset + R_block)[None, :] * stride_out1
        )
        out_mask = dim_mask[:, None] & R_mask[None, :]

    if num_tokens > 0:
        M_block = tl.arange(0, BLOCK_M)
        INPUT_DTYPE = X_ptr.dtype.element_ty
        BLOCK_INNER: tl.constexpr = 64
        inner_iters = tl.cdiv(INNER_DIM, BLOCK_INNER)

        if COMPUTE_DA:
            acc = tl.zeros((BLOCK_R, BLOCK_DIM), dtype=ACC_TYPE)
        else:
            acc = tl.zeros((BLOCK_DIM, BLOCK_R), dtype=ACC_TYPE)

        M_iters = tl.cdiv(num_tokens, BLOCK_M)
        for i in range(M_iters):
            M_idx = start_idx + i * BLOCK_M + M_block
            M_mask = M_idx < end_idx

            if COMPUTE_DA:
                # Load X[M, K_block] (the "outer" tensor for dA)
                outer = tl.load(
                    X_ptr + M_idx[:, None] * stride_xm + dim_block[None, :] * stride_xk,
                    mask=M_mask[:, None] & dim_mask[None, :],
                    other=0.0,
                ).to(INPUT_DTYPE)

                # Reduce DY[M, :] @ B[e][:, R] over N → [M, R]
                reduced = tl.zeros((BLOCK_M, BLOCK_R), dtype=ACC_TYPE)
                inner_range = tl.arange(0, BLOCK_INNER)
                for j in range(inner_iters):
                    inn_off = j * BLOCK_INNER + inner_range
                    inn_mask = inn_off < N

                    dy_tile = tl.load(
                        DY_ptr
                        + M_idx[:, None] * stride_dym
                        + inn_off[None, :] * stride_dyn,
                        mask=M_mask[:, None] & inn_mask[None, :],
                        other=0.0,
                    ).to(INPUT_DTYPE)
                    # B layout: [N, r*E] → stride_lw0=N stride, stride_lw1=r*E stride
                    lw_tile = tl.load(
                        LW_ptr
                        + inn_off[:, None] * stride_lw0
                        + (lora_offset + R_block)[None, :] * stride_lw1,
                        mask=inn_mask[:, None] & R_mask[None, :],
                        other=0.0,
                    ).to(INPUT_DTYPE)
                    reduced += tl.dot(dy_tile, lw_tile, allow_tf32=allow_tf32)

                # dA += (DY@B)^T @ X: [R, M] @ [M, K_block] → [R, K_block]
                acc += tl.dot(
                    tl.trans(reduced.to(INPUT_DTYPE)), outer, allow_tf32=allow_tf32
                )
            else:
                # Load DY[M, N_block] (the "outer" tensor for dB)
                outer = tl.load(
                    DY_ptr
                    + M_idx[:, None] * stride_dym
                    + dim_block[None, :] * stride_dyn,
                    mask=M_mask[:, None] & dim_mask[None, :],
                    other=0.0,
                ).to(INPUT_DTYPE)

                # Reduce X[M, :] @ A[e][:, :].T over K → [M, R]
                reduced = tl.zeros((BLOCK_M, BLOCK_R), dtype=ACC_TYPE)
                inner_range = tl.arange(0, BLOCK_INNER)
                for j in range(inner_iters):
                    inn_off = j * BLOCK_INNER + inner_range
                    inn_mask = inn_off < K

                    x_tile = tl.load(
                        X_ptr
                        + M_idx[:, None] * stride_xm
                        + inn_off[None, :] * stride_xk,
                        mask=M_mask[:, None] & inn_mask[None, :],
                        other=0.0,
                    ).to(INPUT_DTYPE)
                    # A layout: [r*E, K] → stride_lw0=r*E stride, stride_lw1=K stride
                    # We want A[e]^T: [K, R], so load as [K_inner, R]
                    lw_tile = tl.load(
                        LW_ptr
                        + (lora_offset + R_block)[None, :] * stride_lw0
                        + inn_off[:, None] * stride_lw1,
                        mask=inn_mask[:, None] & R_mask[None, :],
                        other=0.0,
                    ).to(INPUT_DTYPE)
                    reduced += tl.dot(x_tile, lw_tile, allow_tf32=allow_tf32)

                # dB += DY^T @ (X@A^T): [N_block, M] @ [M, R] → [N_block, R]
                acc += tl.dot(
                    tl.trans(outer), reduced.to(INPUT_DTYPE), allow_tf32=allow_tf32
                )

        tl.store(
            out_blk_ptrs, (acc * scaling).to(OUT_ptr.dtype.element_ty), mask=out_mask
        )
    else:
        # Zero out this expert's slice — needed because output uses empty_like
        if COMPUTE_DA:
            tl.store(
                out_blk_ptrs,
                tl.zeros((BLOCK_R, BLOCK_DIM), dtype=OUT_ptr.dtype.element_ty),
                mask=out_mask,
            )
        else:
            tl.store(
                out_blk_ptrs,
                tl.zeros((BLOCK_DIM, BLOCK_R), dtype=OUT_ptr.dtype.element_ty),
                mask=out_mask,
            )


def group_bwd_lora(
    DY: torch.Tensor,
    X: torch.Tensor,
    lora_A: torch.Tensor,
    lora_B: torch.Tensor,
    expert_offsets: torch.Tensor,
    E: int,
    scaling: float,
    sorted_scattered_idxs: Optional[torch.Tensor] = None,
    k: int = 1,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Compute LoRA gradients for A and B on expert-grouped data.

    Uses split dA/dB kernels that eliminate atomic adds by giving each
    (expert, output_block) pair its own thread block.

    Args:
        DY: Gradient w.r.t. output [M_total, N] (grouped by expert)
        X: Input [M_total, K] (grouped by expert)
        lora_A: LoRA A weights [r*E, K]
        lora_B: LoRA B weights [N, r*E]
        expert_offsets: Cumulative token counts per expert [E]
        E: Number of experts
        scaling: LoRA scaling factor

    Returns:
        dA: Gradient for A [r*E, K]
        dB: Gradient for B [N, r*E]
    """
    R = lora_A.size(0) // E
    K = X.size(1)
    N = DY.size(1)

    # No zero-init needed: the split kernels write zeros for experts with
    # zero routed tokens directly in the kernel (else branch).
    dA = torch.empty_like(lora_A)
    dB = torch.empty_like(lora_B)

    BLOCK_R = _block_r_for_rank(R)

    def grid_dA(META):
        return (E, triton.cdiv(K, META["BLOCK_DIM"]))

    _group_bwd_lora_split[grid_dA](
        DY,
        DY.stride(0),
        DY.stride(1),
        X,
        X.stride(0),
        X.stride(1),
        lora_B,
        lora_B.stride(0),
        lora_B.stride(1),
        dA,
        dA.stride(0),
        dA.stride(1),
        expert_offsets,
        M=DY.size(0),
        K=K,
        N=N,
        ACTUAL_R=R,
        BLOCK_R=BLOCK_R,
        INNER_DIM=N,
        scaling=scaling,
        COMPUTE_DA=True,
        ACC_TYPE=tl.float32,
        allow_tf32=ALLOW_TF32,
    )

    def grid_dB(META):
        return (E, triton.cdiv(N, META["BLOCK_DIM"]))

    _group_bwd_lora_split[grid_dB](
        DY,
        DY.stride(0),
        DY.stride(1),
        X,
        X.stride(0),
        X.stride(1),
        lora_A,
        lora_A.stride(0),
        lora_A.stride(1),
        dB,
        dB.stride(0),
        dB.stride(1),
        expert_offsets,
        M=DY.size(0),
        K=K,
        N=N,
        ACTUAL_R=R,
        BLOCK_R=BLOCK_R,
        INNER_DIM=K,
        scaling=scaling,
        COMPUTE_DA=False,
        ACC_TYPE=tl.float32,
        allow_tf32=ALLOW_TF32,
    )

    return dA, dB


# =============================================================================
# Backward Kernel: Fused gather + LoRA gradient (dA, dB) — eliminates group()
# =============================================================================


@triton.autotune(
    configs=_group_bwd_lora_configs(),
    key=["M", "N", "K"],
    prune_configs_by={"early_config_prune": _prune_bwd_lora_configs},
    reset_to_zero=["DLA_ptr", "DLB_ptr"],
)
@triton.heuristics(
    {
        "NO_K_MASK": lambda args: (args["K"] % args["BLOCK_K"]) == 0,
        "NO_N_MASK": lambda args: (args["N"] % args["BLOCK_N"]) == 0,
    }
)
@triton.jit
def _group_bwd_lora_fused(
    # Inputs (ungrouped or grouped)
    DY_ptr,
    stride_dym,
    stride_dyn,
    X_ptr,
    stride_xm,
    stride_xk,
    # Scatter indices for gather-on-load
    sorted_scattered_idxs_ptr,
    FAN_OUT: tl.constexpr,
    # LoRA weights (needed for cross-terms)
    LA_ptr,
    stride_la_r,
    stride_la_k,  # A: [r*E, K]
    LB_ptr,
    stride_lb_n,
    stride_lb_r,  # B: [N, r*E]
    # Gradient outputs
    DLA_ptr,
    stride_dla_r,
    stride_dla_k,
    DLB_ptr,
    stride_dlb_n,
    stride_dlb_r,
    # Expert offsets
    expert_offsets_ptr,
    # Real expert offsets (for M_mask when using token rounding, else same as expert_offsets_ptr)
    real_expert_offsets_ptr,
    # Dimensions
    M,
    K: tl.constexpr,
    N: tl.constexpr,
    ACTUAL_R: tl.constexpr,
    BLOCK_R: tl.constexpr,
    scaling,
    # Block sizes
    BLOCK_M: tl.constexpr,
    BLOCK_K: tl.constexpr,
    BLOCK_N: tl.constexpr,
    ACC_TYPE: tl.constexpr,
    allow_tf32: tl.constexpr,
    NO_K_MASK: tl.constexpr,
    NO_N_MASK: tl.constexpr,
    # Whether DY is already in grouped (expert-sorted) order
    dy_grouped: tl.constexpr = False,
):
    """
    Fused gather + LoRA gradient computation. Same as _group_bwd_lora but
    reads X from ungrouped buffers using sorted_scattered_idxs for indirect
    indexing, eliminating the need for a separate group(X) call.

    When dy_grouped=False (default): both X and DY are read via indirect
    indexing through sorted_scattered_idxs.  This eliminates both group()
    calls entirely.

    When dy_grouped=True: DY is already in grouped order (e.g. gate_up_proj
    backward where grouped_out=True) and is read directly.  Only X uses
    indirect indexing.  This avoids the group(X) allocation while
    still supporting the grouped DY case.

    Grid: (E * cdiv(K, BLOCK_K), cdiv(N, BLOCK_N))

    For expert e:
      dA[e] = scaling * (dY @ B[e])^T @ X   -> [r, K]
      dB[e] = scaling * dY^T @ (X @ A[e]^T)  -> [N, r]

    Supports token rounding: expert_offsets_ptr gives the iteration range
    (padded to BLOCK_M multiples), real_expert_offsets_ptr gives the real
    token count for M_mask (to exclude padding tokens).
    """
    pid0 = tl.program_id(axis=0)
    pid1 = tl.program_id(axis=1)

    K_BLOCK_COUNT = tl.cdiv(K, BLOCK_K)
    E_idx = pid0 // K_BLOCK_COUNT
    K_block_id = pid0 % K_BLOCK_COUNT
    N_block_id = pid1

    # Get expert's token range from cumulative offsets
    # start_idx/end_idx from expert_offsets_ptr: iteration range (possibly padded)
    # real_end_idx from real_expert_offsets_ptr: for M_mask (real token count)
    if E_idx == 0:
        start_idx = 0
        real_start_idx = 0
    else:
        start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
        real_start_idx = tl.load(real_expert_offsets_ptr + E_idx - 1).to(tl.int32)
    end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)
    real_end_idx = tl.load(real_expert_offsets_ptr + E_idx).to(tl.int32)
    num_tokens = end_idx - start_idx

    if num_tokens > 0:
        M_block = tl.arange(0, BLOCK_M)
        K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
        K_mask = K_block < K
        N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
        N_mask = N_block < N
        R_block = tl.arange(0, BLOCK_R)
        R_mask = R_block < ACTUAL_R

        lora_offset = E_idx * ACTUAL_R

        # Determine input element type for consistent casting.
        INPUT_DTYPE = X_ptr.dtype.element_ty

        # Load B[e] and A[e] — same as non-fused kernel
        B_blk_ptrs = (
            LB_ptr
            + N_block[:, None] * stride_lb_n
            + (lora_offset + R_block)[None, :] * stride_lb_r
        )
        b_e = tl.load(B_blk_ptrs, mask=N_mask[:, None] & R_mask[None, :], other=0.0).to(
            INPUT_DTYPE
        )

        A_blk_ptrs = (
            LA_ptr
            + (lora_offset + R_block)[:, None] * stride_la_r
            + K_block[None, :] * stride_la_k
        )
        a_e = tl.load(A_blk_ptrs, mask=R_mask[:, None] & K_mask[None, :], other=0.0).to(
            INPUT_DTYPE
        )

        # Accumulators
        dA_acc = tl.zeros((BLOCK_R, BLOCK_K), dtype=ACC_TYPE)
        dB_acc = tl.zeros((BLOCK_N, BLOCK_R), dtype=ACC_TYPE)

        real_num_tokens = real_end_idx - real_start_idx
        iters = tl.cdiv(num_tokens, BLOCK_M)
        for i in range(iters):
            M_idx = start_idx + i * BLOCK_M + M_block
            # Use real token count for masking (excludes padding tokens)
            M_local = i * BLOCK_M + M_block
            M_mask = M_local < real_num_tokens

            # Fused gather: load scatter indices for indirect X access
            scatter_idx = tl.load(
                sorted_scattered_idxs_ptr + M_idx, mask=M_mask, other=0
            ).to(tl.int32)
            X_token_idx = scatter_idx // FAN_OUT  # X is [M, K], not expanded by k

            # Load X via indirect index: [BLOCK_M, BLOCK_K]
            X_blk_ptrs = (
                X_ptr + X_token_idx[:, None] * stride_xm + K_block[None, :] * stride_xk
            )
            x = tl.load(
                X_blk_ptrs, mask=M_mask[:, None] & K_mask[None, :], other=0.0
            ).to(INPUT_DTYPE)

            # Load DY: indirect via scatter_idx when ungrouped, direct via M_idx when grouped
            if dy_grouped:
                DY_blk_ptrs = (
                    DY_ptr + M_idx[:, None] * stride_dym + N_block[None, :] * stride_dyn
                )
            else:
                DY_blk_ptrs = (
                    DY_ptr
                    + scatter_idx[:, None] * stride_dym
                    + N_block[None, :] * stride_dyn
                )
            dy = tl.load(
                DY_blk_ptrs, mask=M_mask[:, None] & N_mask[None, :], other=0.0
            ).to(INPUT_DTYPE)

            # X @ A[e]^T: [M, K] @ [K, R] -> [M, R]
            xa = tl.dot(x, tl.trans(a_e), allow_tf32=allow_tf32)

            # dY @ B[e]: [M, N] @ [N, R] -> [M, R]
            dy_b = tl.dot(dy, b_e, allow_tf32=allow_tf32)

            dy_b_cast = dy_b.to(INPUT_DTYPE)
            xa_cast = xa.to(INPUT_DTYPE)

            # dA += (dY @ B)^T @ X: [R, M] @ [M, K] -> [R, K]
            dA_acc += tl.dot(tl.trans(dy_b_cast), x, allow_tf32=allow_tf32)

            # dB += dY^T @ (X @ A^T): [N, M] @ [M, R] -> [N, R]
            dB_acc += tl.dot(tl.trans(dy), xa_cast, allow_tf32=allow_tf32)

        # Store dA with scaling (atomic add since multiple N_blocks contribute)
        DLA_blk_ptrs = (
            DLA_ptr
            + (lora_offset + R_block)[:, None] * stride_dla_r
            + K_block[None, :] * stride_dla_k
        )
        tl.atomic_add(
            DLA_blk_ptrs,
            (dA_acc * scaling).to(DLA_ptr.dtype.element_ty),
            mask=R_mask[:, None] & K_mask[None, :],
        )

        # Store dB with scaling (atomic add since multiple K_blocks contribute)
        DLB_blk_ptrs = (
            DLB_ptr
            + N_block[:, None] * stride_dlb_n
            + (lora_offset + R_block)[None, :] * stride_dlb_r
        )
        tl.atomic_add(
            DLB_blk_ptrs,
            (dB_acc * scaling).to(DLB_ptr.dtype.element_ty),
            mask=N_mask[:, None] & R_mask[None, :],
        )


def group_bwd_lora_fused(
    DY: torch.Tensor,
    X: torch.Tensor,
    lora_A: torch.Tensor,
    lora_B: torch.Tensor,
    expert_offsets: torch.Tensor,
    sorted_scattered_idxs: torch.Tensor,
    E: int,
    k: int,
    scaling: float,
    real_expert_offsets: Optional[torch.Tensor] = None,
    dy_grouped: bool = False,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Fused gather + LoRA gradient computation. Same result as
    group(X) + group(DY) + group_bwd_lora(DY, X, ...) but without
    the intermediate grouped buffers.

    Args:
        DY: Gradient w.r.t. output [M*k, N].
            If dy_grouped=False: ungrouped (original token order), read via
            indirect indexing through sorted_scattered_idxs.
            If dy_grouped=True: already in grouped (expert-sorted) order,
            read directly.
        X: Input [M, K] (ungrouped, original token order).  Always read via
            indirect indexing through sorted_scattered_idxs.
        lora_A: LoRA A weights [r*E, K]
        lora_B: LoRA B weights [N, r*E]
        expert_offsets: Cumulative token counts per expert [E]
            (or padded offsets if using token rounding)
        sorted_scattered_idxs: Maps grouped position -> original position [M*k]
            (or padded version if using token rounding)
        E: Number of experts
        k: Fan-out (top-k)
        scaling: LoRA scaling factor
        real_expert_offsets: Original cumulative counts for M_mask when using
            token rounding. If None, expert_offsets is used for both.
        dy_grouped: Whether DY is already in grouped order (default False).
            When True, avoids indirect indexing for DY, used for gate_up_proj
            backward where grouped_out=True.

    Returns:
        dA: Gradient for A [r*E, K]
        dB: Gradient for B [N, r*E]
    """
    R = lora_A.size(0) // E
    K = X.size(1)
    N = DY.size(1)

    # Zero-init for atomic accumulation
    dA = torch.zeros_like(lora_A)
    dB = torch.zeros_like(lora_B)

    BLOCK_R = _block_r_for_rank(R)

    if real_expert_offsets is None:
        real_expert_offsets = expert_offsets

    def grid(META):
        return (
            E * triton.cdiv(K, META["BLOCK_K"]),
            triton.cdiv(N, META["BLOCK_N"]),
        )

    _group_bwd_lora_fused[grid](
        DY,
        DY.stride(0),
        DY.stride(1),
        X,
        X.stride(0),
        X.stride(1),
        sorted_scattered_idxs,
        FAN_OUT=k,
        LA_ptr=lora_A,
        stride_la_r=lora_A.stride(0),
        stride_la_k=lora_A.stride(1),
        LB_ptr=lora_B,
        stride_lb_n=lora_B.stride(0),
        stride_lb_r=lora_B.stride(1),
        DLA_ptr=dA,
        stride_dla_r=dA.stride(0),
        stride_dla_k=dA.stride(1),
        DLB_ptr=dB,
        stride_dlb_n=dB.stride(0),
        stride_dlb_r=dB.stride(1),
        expert_offsets_ptr=expert_offsets,
        real_expert_offsets_ptr=real_expert_offsets,
        M=sorted_scattered_idxs.size(0),
        K=K,
        N=N,
        ACTUAL_R=R,
        BLOCK_R=BLOCK_R,
        scaling=scaling,
        ACC_TYPE=tl.float32,
        allow_tf32=ALLOW_TF32,
        dy_grouped=dy_grouped,
    )

    return dA, dB


================================================
FILE: src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/ops.py
================================================
# SPDX-License-Identifier: Apache-2.0
# Adapted from https://github.com/shawntan/scattermoe
# Copyright (c) Shawn Tan and ScatterMoE Contributors
# Licensed under the Apache License, Version 2.0
# See https://github.com/shawntan/scattermoe/blob/main/LICENSE

from typing import Optional

import torch
import triton
import triton.language as tl

BLOCK_M = 128
ALLOW_TF32 = True


@triton.jit
def _compute_expert_block(
    E_idx,
    E_mask,
    M_in_idx,
    N_block,
    N_mask,
    X_ptr,
    stride_xm,
    stride_xk,
    W_ptr,
    stride_we,
    stride_wk,
    stride_wn,
    K,
    acc,
    no_k_mask,
    BLOCK_K,
    allow_tf32=True,
):
    K_block = tl.arange(0, BLOCK_K)
    X_blk_ptrs = X_ptr + M_in_idx[:, None] * stride_xm + K_block[None, :] * stride_xk
    W_blk_ptrs = (
        W_ptr
        + K_block[:, None] * stride_wk
        + N_block[None, :] * stride_wn
        + E_idx * stride_we
    )
    iters = tl.cdiv(K, BLOCK_K)

    for K_block_id in range(iters):
        if no_k_mask:
            x = tl.load(X_blk_ptrs, mask=E_mask[:, None])
            w = tl.load(W_blk_ptrs, mask=N_mask[None, :])
        else:
            K_mask = (K_block_id * BLOCK_K + K_block) < K
            x = tl.load(X_blk_ptrs, mask=E_mask[:, None] & K_mask[None, :])
            w = tl.load(W_blk_ptrs, mask=K_mask[:, None] & N_mask[None, :])

        X_blk_ptrs += BLOCK_K * stride_xk
        W_blk_ptrs += BLOCK_K * stride_wk
        acc = tl.dot(x, w, acc, allow_tf32=allow_tf32)
    return acc


def _scatter2scatter_configs():
    return [
        triton.Config({"BLOCK_N": 128, "BLOCK_K": 32}, num_stages=4, num_warps=4),
    ]


@triton.autotune(
    configs=_scatter2scatter_configs(),
    key=["M", "N", "K"],
)
@triton.heuristics(
    {
        "NO_K_MASK": lambda args: (args["K"] % args["BLOCK_K"]) == 0,
        "NO_N_MASK": lambda args: (args["N"] % args["BLOCK_N"]) == 0,
    }
)
@triton.jit
def _scatter2scatter(
    X_ptr,
    stride_xm: tl.constexpr,
    stride_xk: tl.constexpr,
    W_ptr,
    stride_we,
    stride_wk: tl.constexpr,
    stride_wn: tl.constexpr,
    Y_ptr,
    stride_ym: tl.constexpr,
    stride_yn: tl.constexpr,
    B_ptr,
    stride_be: tl.constexpr,
    stride_bn: tl.constexpr,
    grouped_idx_ptr,
    expert_idxs_ptr,
    # block_start_idx_ptr,
    FAN_OUT: tl.constexpr,
    M,
    K: tl.constexpr,
    N: tl.constexpr,
    E: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
    ACC_TYPE: tl.constexpr,
    # OUT_M,
    allow_tf32: tl.constexpr,
    x_grouped: tl.constexpr,
    y_grouped: tl.constexpr,
    NO_K_MASK: tl.constexpr,
    NO_N_MASK: tl.constexpr,
):
    pid = tl.program_id(axis=0)

    N_BLOCK_COUNT = tl.cdiv(N, BLOCK_N)
    M_block_id = pid // N_BLOCK_COUNT
    N_block_id = pid % N_BLOCK_COUNT

    M_block = M_block_id * BLOCK_M + tl.arange(0, BLOCK_M)
    N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
    N_mask = N_block < N
    M_boundary_mask = M_block < (FAN_OUT * M)
    E_idxs = tl.load(expert_idxs_ptr + M_block, mask=M_boundary_mask, other=E)

    no_k_mask = K % BLOCK_K == 0

    acc = tl.zeros((BLOCK_M, BLOCK_N), dtype=ACC_TYPE)
    E_first_idx = tl.min(E_idxs)
    E_last_idx = tl.minimum(tl.max(E_idxs), E - 1)
    M_idx = tl.load(grouped_idx_ptr + M_block, mask=M_boundary_mask).to(tl.int32)
    for E_idx in range(E_first_idx, E_last_idx + 1):
        E_mask = E_idxs == E_idx
        E_M_idx = M_idx
        if x_grouped:
            M_in_idx = M_block
        else:
            M_in_idx = E_M_idx // FAN_OUT
        acc = _compute_expert_block(
            E_idx,
            E_mask,
            M_in_idx,
            N_block,
            N_mask,
            X_ptr,
            stride_xm,
            stride_xk,
            W_ptr,
            stride_we,
            stride_wk,
            stride_wn,
            K,
            acc,
            no_k_mask,
            BLOCK_K,
            allow_tf32=allow_tf32,
        )

    if B_ptr is not None:
        B_blk_ptrs = B_ptr + E_idxs[:, None] * stride_be + N_block[None, :] * stride_bn
        acc += tl.load(B_blk_ptrs, mask=M_boundary_mask[:, None] & N_mask[None, :])

    if y_grouped:
        M_out_idx = M_block
    else:
        M_out_idx = M_idx
    Y_blk_ptrs = Y_ptr + (M_out_idx[:, None] * stride_ym + N_block[None, :] * stride_yn)
    tl.store(Y_blk_ptrs, acc, mask=M_boundary_mask[:, None] & N_mask[None, :])


def scatter2scatter(
    X,
    W,
    sorted_expert_idxs,
    sorted_scattered_idxs,
    k,
    b=None,
    x_grouped=False,
    y_grouped=False,
    out=None,
):
    assert sorted_scattered_idxs.size(0) == sorted_expert_idxs.size(0)
    assert sorted_scattered_idxs.size(0) == X.size(0) * k
    # Pre-kernel setup
    y_dim = W.size(-1)
    L_scattered = sorted_expert_idxs.size(0)
    if out is None:
        output = torch.empty((L_scattered, y_dim), device=X.device, dtype=X.dtype)
    else:
        assert out.size(0) == L_scattered and out.size(1) == y_dim
        output = out

    scatter2scatter_compileable(
        output,
        W,
        X,
        k,
        sorted_expert_idxs,
        sorted_scattered_idxs,
        b,
        x_grouped,
        y_grouped,
    )
    return output


@torch.library.custom_op("scattermoe::scatter2scatter", mutates_args={"output"})
def scatter2scatter_compileable(
    output: torch.Tensor,
    W: torch.Tensor,
    X: torch.Tensor,
    k: int,
    sorted_expert_idxs: torch.Tensor,
    sorted_scattered_idxs: torch.Tensor,
    b: Optional[torch.Tensor],
    x_grouped: bool,
    y_grouped: bool,
) -> None:
    def grid(META):
        grid_num = (
            triton.cdiv(sorted_expert_idxs.size(0), META["BLOCK_M"])
            * triton.cdiv(META["N"], META["BLOCK_N"]),
        )
        return grid_num

    if b is None:
        b = None
        stride_be = stride_bn = 0
    else:
        stride_be, stride_bn = b.stride()

    _scatter2scatter[grid](
        # X_ptr, stride_xm, stride_xk,
        X,
        X.stride(0),
        X.stride(1),
        # W_ptr, stride_we, stride_wk, stride_wn,
        W,
        W.stride(0),
        W.stride(1),
        W.stride(2),
        # Y_ptr, stride_ym, stride_yn,
        output,
        output.stride(0),
        output.stride(1),
        # B_ptr, stride_be, stride_bn
        b,
        stride_be,
        stride_bn,
        grouped_idx_ptr=sorted_scattered_idxs,
        expert_idxs_ptr=sorted_expert_idxs,
        # block_start_idx_ptr=padded_block_idxs,
        FAN_OUT=k,
        M=X.size(0),
        K=X.size(1),
        N=output.size(1),
        E=W.size(0),
        BLOCK_M=BLOCK_M,
        ACC_TYPE=tl.float32,
        allow_tf32=ALLOW_TF32,
        x_grouped=x_grouped,
        y_grouped=y_grouped,
    )


def _config_XtY():
    return [
        triton.Config(
            {"BLOCK_N": 128, "BLOCK_K": 128, "BLOCK_M": 32}, num_stages=4, num_warps=4
        ),
    ]


def group_bwd_W(DY, X, expert_offsets, E, has_bias=False):
    DWt = torch.zeros((E, DY.size(-1), X.size(-1)), device=DY.device, dtype=DY.dtype)
    DW = DWt.permute(0, 2, 1)
    if has_bias:
        Db = torch.zeros((E, DY.size(-1)), device=DY.device, dtype=DY.dtype)
    else:
        Db = None
    groupXtY_compileable(E, DW, Db, DY, X, expert_offsets)
    return DW, Db


@torch.library.custom_op("scattermoe::groupXtY", mutates_args={"DW", "Db"})
def groupXtY_compileable(
    E: int,
    DW: torch.Tensor,
    Db: Optional[torch.Tensor],
    DY: torch.Tensor,
    X: torch.Tensor,
    expert_offsets: torch.Tensor,
) -> None:
    def grid(META):
        grid = (
            E * triton.cdiv(META["K"], META["BLOCK_K"]),
            triton.cdiv(META["N"], META["BLOCK_N"]),
        )
        return grid

    if Db is None:
        stride_dbe = 0
        stride_dbn = 0
    else:
        stride_dbe, stride_dbn = Db.stride()

    _groupXtY[grid](
        # DY_ptr, stride_dym, stride_dyk,
        DY,
        DY.stride(0),
        DY.stride(1),
        # X_ptr, stride_xm, stride_xn,
        X,
        X.stride(0),
        X.stride(1),
        # DW_ptr, stride_dwe, stride_dwk, stride_dwn,
        DW,
        DW.stride(0),
        DW.stride(1),
        DW.stride(2),
        # Db_ptr, stride_dwe, stride_dbn,
        Db,
        stride_dbe,
        stride_dbn,
        # expert_offsets_ptr,
        expert_offsets,
        # K: tl.constexpr, N: tl.constexpr,
        M=DY.size(0),
        N=DY.size(-1),
        K=X.size(-1),
        # ACC_TYPE: tl.constexpr,
        ACC_TYPE=tl.float32,
        allow_tf32=ALLOW_TF32,
    )


@triton.autotune(
    configs=_config_XtY(),
    key=["M", "N", "K"],
)
@triton.heuristics(
    {
        "NO_K_MASK": lambda args: (args["K"] % args["BLOCK_K"]) == 0,
        "NO_N_MASK": lambda args: (args["N"] % args["BLOCK_N"]) == 0,
    }
)
@triton.jit
def _groupXtY(
    DY_ptr,
    stride_dym,
    stride_dyk,
    X_ptr,
    stride_xm,
    stride_xn,
    DW_ptr,
    stride_dwe,
    stride_dwk,
    stride_dwn,
    Db_ptr,
    stride_dbe,
    stride_dbn,
    expert_offsets_ptr,
    M,
    K: tl.constexpr,
    N: tl.constexpr,
    BLOCK_M: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
    ACC_TYPE: tl.constexpr,
    allow_tf32: tl.constexpr,
    NO_K_MASK: tl.constexpr,
    NO_N_MASK: tl.constexpr,
):
    pid0 = tl.program_id(axis=0)
    pid1 = tl.program_id(axis=1)
    num0 = tl.num_programs(0)
    num1 = tl.num_programs(1)
    # pid1, pid0 = tl.swizzle2d(pid1, pid0, num1, num0, 128)
    pid0, pid1 = tl.swizzle2d(pid0, pid1, num0, num1, 4)

    K_BLOCK_COUNT = tl.cdiv(K, BLOCK_K)
    E_idx = pid0 // K_BLOCK_COUNT
    K_block_id = pid0 % K_BLOCK_COUNT
    N_block_id = pid1

    if E_idx == 0:
        start_idx = 0
    else:
        start_idx = tl.load(expert_offsets_ptr + E_idx - 1).to(tl.int32)
    end_idx = tl.load(expert_offsets_ptr + E_idx).to(tl.int32)

    if end_idx > start_idx:
        M_block = tl.max_contiguous(start_idx + tl.arange(0, BLOCK_M), BLOCK_M)

        K_block = K_block_id * BLOCK_K + tl.arange(0, BLOCK_K)
        K_mask = K_block < K
        K_block = tl.max_contiguous(tl.multiple_of(K_block % K, BLOCK_K), BLOCK_K)

        N_block = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
        N_mask = N_block < N
        N_block = tl.max_contiguous(tl.multiple_of(N_block % N, BLOCK_N), BLOCK_N)

        M_idxs = M_block
        xt_blk_ptrs = X_ptr + K_block[:, None] * stride_xn + M_idxs[None, :] * stride_xm
        dy_blk_ptrs = (
            DY_ptr + M_idxs[:, None] * stride_dym + N_block[None, :] * stride_dyk
        )
        if (Db_ptr is not None) and (K_block_id == 0):
            _xty_and_bias(
                E_idx,
                start_idx,
                end_idx,
                M_block,
                K_block,
                K_mask,
                N_block,
                N_mask,
                dy_blk_ptrs,
                stride_dym,
                xt_blk_ptrs,
                stride_xm,
                DW_ptr,
                stride_dwe,
                stride_dwk,
                stride_dwn,
                Db_ptr,
                stride_dbe,
                stride_dbn,
                BLOCK_M,
                BLOCK_N,
                BLOCK_K,
                ACC_TYPE,
                allow_tf32,
                NO_K_MASK,
                NO_N_MASK,
                compute_bias=True,
            )
        else:
            _xty_and_bias(
                E_idx,
                start_idx,
                end_idx,
                M_block,
                K_block,
                K_mask,
                N_block,
                N_mask,
                dy_blk_ptrs,
                stride_dym,
                xt_blk_ptrs,
                stride_xm,
                DW_ptr,
                stride_dwe,
                stride_dwk,
                stride_dwn,
                Db_ptr,
                stride_dbe,
                stride_dbn,
                BLOCK_M,
                BLOCK_N,
                BLOCK_K,
                ACC_TYPE,
                allow_tf32,
                NO_K_MASK,
                NO_N_MASK,
                compute_bias=False,
            )


@triton.jit
def _xty_and_bias(
    E_idx,
    start_idx,
    end_idx,
    M_block,
    K_block,
    K_mask,
    N_block,
    N_mask,
    dy_blk_ptrs,
    stride_dym,
    xt_blk_ptrs,
    stride_xm,
    DW_ptr,
    stride_dwe,
    stride_dwk,
    stride_dwn,
    Db_ptr,
    stride_dbe,
    stride_dbn,
    BLOCK_M,
    BLOCK_N,
    BLOCK_K,
    ACC_TYPE,
    allow_tf32,
    NO_K_MASK,
    NO_N_MASK,
    compute_bias: tl.constexpr,
):
    if compute_bias:
        db_acc = tl.zeros((BLOCK_N,), dtype=ACC_TYPE)
    else:
        db_acc = None

    acc = tl.zeros((BLOCK_K, BLOCK_N), dtype=ACC_TYPE)
    iters = tl.cdiv(end_idx - start_idx, BLOCK_M)
    for i in range(0, iters):
        M_mask = (i * BLOCK_M + M_block) < end_idx
        if NO_K_MASK:
            xt = tl.load(xt_blk_ptrs, mask=M_mask[None, :])
        else:
            xt = tl.load(xt_blk_ptrs, mask=K_mask[:, None] & M_mask[None, :])
        if NO_N_MASK:
            dy = tl.load(dy_blk_ptrs, mask=M_mask[:, None])
        else:
            dy = tl.load(dy_blk_ptrs, mask=M_mask[:, None] & N_mask[None, :])

        acc += tl.dot(xt, dy, out_dtype=ACC_TYPE, allow_tf32=allow_tf32)

        xt_blk_ptrs += BLOCK_M * stride_xm
        dy_blk_ptrs += BLOCK_M * stride_dym

        if compute_bias:
            db_acc += tl.sum(dy, axis=0)

    DW_blk_ptrs = (
        DW_ptr
        + E_idx * stride_dwe
        + K_block[:, None] * stride_dwk
        + N_block[None, :] * stride_dwn
    )
    acc = acc.to(DW_blk_ptrs.dtype.element_ty)
    tl.store(DW_blk_ptrs, acc, mask=K_mask[:, None] & N_mask[None, :])
    if compute_bias:
        Db_blk_ptrs = Db_ptr + E_idx * stride_dbe + N_block * stride_dbn
        tl.store(Db_blk_ptrs, db_acc, mask=N_mask)


def _config_grouping():
    return [
        triton.Config({"BLOCK_N": 256, "BLOCK_K": 128}, num_stages=4, num_warps=4),
        # triton.Config({'BLOCK_N': 128, 'BLOCK_K': 64}, num_stages=4, num_warps=4),
        # triton.Config({'BLOCK_N': 64, 'BLOCK_K': 32}, num_stages=4, num_warps=4),
    ]


def group(A, sorted_expert_idxs, coeff=None, fan_out=1, out=None):
    N = sorted_expert_idxs.size(0)
    K = A.size(1)
    assert A.size(0) * fan_out == N
    if out is not None:
        Y = out
    else:
        Y = torch.empty((N, K), dtype=A.dtype, device=A.device)
    group_compileable(A, K, N, Y, coeff, coeff is not None, fan_out, sorted_expert_idxs)
    return Y


@torch.library.custom_op("scattermoe::group", mutates_args={"Y"})
def group_compileable(
    A: torch.Tensor,
    K: int,
    N: int,
    Y: torch.Tensor,
    coeff: Optional[torch.Tensor],
    has_coeff: bool,
    fan_out: int,
    sorted_expert_idxs: torch.Tensor,
) -> None:
    def grid(META):
        grid_num = (triton.cdiv(META["N"], META["BLOCK_N"]),)
        return grid_num

    _group[grid](
        # A_ptr, stride_an, stride_ai,
        A,
        A.stride(0),
        A.stride(1),
        has_coeff,
        coeff,
        fan_out,
        # Y_ptr, stride_yn, stride_yk,
        Y,
        Y.stride(0),
        Y.stride(1),
        # grouped_idx_ptr,
        sorted_expert_idxs,
        # N: tl.constexpr, K: tl.constexpr,
        N,
        K,
    )


@triton.autotune(configs=_config_grouping(), key=["K"])
@triton.heuristics({"NO_K_MASK": lambda args: (args["K"] % args["BLOCK_K"]) == 0})
@triton.jit
def _group(
    src_ptr,
    stride_sn,
    stride_sk,
    has_coeff: tl.constexpr,
    coeff_ptr,
    FAN_OUT: tl.constexpr,
    tgt_ptr,
    stride_tn,
    stride_ti,
    grouped_idx_ptr,
    N,
    K: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
    NO_K_MASK: tl.constexpr,
):
    pid = tl.program_id(axis=0)

    N_block_id = pid
    N_blk = N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)
    N_mask = N_blk < N
    N_blk = tl.max_contiguous(tl.multiple_of(N_blk % N, BLOCK_N), BLOCK_N)
    N_idx = tl.load(grouped_idx_ptr + N_blk, mask=N_mask, other=0)

    K_blk = tl.arange(0, BLOCK_K)
    src_blk_ptrs = (
        src_ptr + (N_idx // FAN_OUT)[:, None] * stride_sn + K_blk[None, :] * stride_sk
    )
    tgt_blk_ptrs = tgt_ptr + N_blk[:, None] * stride_tn + K_blk[None, :] * stride_ti

    if has_coeff:
        c = tl.load(coeff_ptr + N_idx, mask=N_mask)[:, None]

    iters = tl.cdiv(K, BLOCK_K)
    for i in range(0, iters):
        if NO_K_MASK or i < iters - 1:
            block = tl.load(src_blk_ptrs, mask=N_mask[:, None])
            if has_coeff:
                block *= c
            tl.store(tgt_blk_ptrs, block, mask=N_mask[:, None])

        else:
            K_mask = (i * BLOCK_K + K_blk) < K
            mask = N_mask[:, None] & K_mask[None, :]
            block = tl.load(src_blk_ptrs, mask=mask)
            if has_coeff:
                block *= c
            tl.store(tgt_blk_ptrs, block, mask=mask)
        src_blk_ptrs += BLOCK_K * stride_sk
        tgt_blk_ptrs += BLOCK_K * stride_ti


================================================
FILE: src/axolotl/integrations/kernels/libs/scattermoe_lora/kernels/single.py
================================================
# SPDX-License-Identifier: Apache-2.0
# Adapted from https://github.com/shawntan/scattermoe
# Copyright (c) Shawn Tan and ScatterMoE Contributors
# Licensed under the Apache License, Version 2.0
# See https://github.com/shawntan/scattermoe/blob/main/LICENSE

import torch
import triton
import triton.language as tl


@triton.jit
def _single2scatter(
    X_ptr,
    stride_xm,
    stride_xk,
    W_ptr,
    stride_we,
    stride_wk,
    stride_wn,
    Y_ptr,
    stride_ym,
    stride_yn,
    expert_idxs_ptr,
    FAN_OUT: tl.constexpr,
    K: tl.constexpr,
    N: tl.constexpr,
    E: tl.constexpr,
    BLOCK_N: tl.constexpr,
    BLOCK_K: tl.constexpr,
    ACC_TYPE: tl.constexpr,
):
    pid0 = tl.program_id(axis=0)
    pid1 = tl.program_id(axis=1)

    N_block_id = pid0
    if FAN_OUT == 1:
        in_idx = pid1
    else:
        in_idx = 0
    out_idx = pid1

    K_block = tl.arange(0, BLOCK_K)
    N_block = tl.max_contiguous(
        tl.multiple_of((N_block_id * BLOCK_N + tl.arange(0, BLOCK_N)) % N, BLOCK_N),
        BLOCK_N,
    )
    E_idx = tl.load(expert_idxs_ptr + pid1)
    X_blk_ptrs = X_ptr + in_idx * stride_xm + K_block[:, None] * stride_xk
    W_blk_ptrs = (
        W_ptr
        + E_idx * stride_we
        + K_block[:, None] * stride_wk
        + N_block[None, :] * stride_wn
    )
    N_mask = N_block < N
    acc = tl.zeros((1, BLOCK_N), dtype=ACC_TYPE)
    for _K_block_id in range(0, tl.cdiv(K, BLOCK_K)):
        K_mask = K_block < K
        x = tl.load(X_blk_ptrs, mask=K_mask[:, None], other=0.0)
        w = tl.load(W_blk_ptrs, mask=K_mask[:, None] & N_mask[None, :], other=0.0)
        acc += tl.sum(x * w, axis=0)[None, :]
        X_blk_ptrs += BLOCK_K * stride_xk
        W_blk_ptrs += BLOCK_K * stride_wk
        K_block += BLOCK_K
    Y_blk_ptrs = Y_ptr + out_idx * stride_ym + N_block[None, :] * stride_yn
    tl.store(Y_blk_ptrs, acc, mask=N_mask[None, :])


def single2scatter(X, W, expert_idxs):
    E, xdim, ydim = W.size()
    k = expert_idxs.size(1)
    assert X.size(0) == k or X.size(0) == 1
    Y = torch.empty((k, ydim), device=X.device, dtype=X.dtype)
    BLOCK_N = 128
    BLOCK_K = 128
    grid = triton.cdiv(ydim, BLOCK_N), k
    _single2scatter[grid](
        X,
        X.stride(0),
        X.stride(1),
        W,
        W.stride(0),
        W.stride(1),
        W.stride(2),
        Y,
        Y.stride(0),
        Y.stride(1),
        expert_idxs,
        FAN_OUT=Y.size(0) // X.size(0),
        K=xdim,
        N=ydim,
        E=E,
        BLOCK_N=BLOCK_N,
        BLOCK_K=BLOCK_K,
        ACC_TYPE=tl.float32,
    )
    return Y


================================================
FILE: src/axolotl/integrations/kernels/libs/scattermoe_lora/layers.py
================================================
# SPDX-License-Identifier: Apache-2.0
#
# Original work Copyright (c) Shawn Tan and ScatterMoE Contributors
# Adapted from https://github.com/shawntan/scattermoe
# See https://github.com/shawntan/scattermoe/blob/main/LICENSE
#
# Modifications and LoRA adaptation Copyright (c) Axolotl AI
# Licensed under the Apache License, Version 2.0

"""
ScatterMoE layer replacements for HuggingFace MoE architectures.

Provides drop-in forward replacements that use ScatterMoE kernels for
acceleration. When used via the HF ``kernels`` library
(``replace_kernel_forward_from_hub``), these classes replace the forward
method of the original MoE block.

LoRA support
------------
When peft wraps parameters via ``target_parameters``, the ``self.experts``
submodule becomes a chain of ``ParamWrapper`` objects and the ``self.gate``
router may also become a ``ParamWrapper``.  The ``HFScatterMoEGatedMLP``
forward detects this and automatically:

1. Unwraps ``self.gate`` to the base router, applying gate LoRA delta
2. Unwraps ``self.experts`` to the base ``OlmoeExperts`` module
3. Extracts LoRA A/B weights and scaling from each wrapper
4. Converts B layout from peft rank-major to scattermoe expert-major
5. Routes to ``parallel_linear_lora`` for fused LoRA computation
6. Passes through ``self.shared_expert`` / ``self.shared_expert_gate``
   (peft wraps their linear layers with standard LoRA, no special handling)
"""

import torch
from torch import nn
from torch.nn import functional as F

from .parallel_experts import flatten_sort_count, parallel_linear
from .parallel_linear_lora import get_lora_params_from_wrapper, parallel_linear_lora

# =============================================================================
# LoRA layout conversion utilities (peft <-> scattermoe)
# =============================================================================


def peft_lora_B_to_scattermoe(peft_B, num_experts, rank):
    """Convert peft rank-major lora_B ``[out, E*r]`` to scattermoe
    expert-major ``[N, r*E]``.

    peft reshapes B to ``[out, r, E]`` (rank-major).
    scattermoe slices B as ``[:, e*r:(e+1)*r]`` (expert-major).
    """
    N = peft_B.shape[0]
    return (
        peft_B.reshape(N, rank, num_experts)
        .permute(0, 2, 1)
        .contiguous()
        .reshape(N, num_experts * rank)
    )


def peft_lora_to_scattermoe(peft_A, peft_B, num_experts, rank):
    """Convert peft LoRA weights to scattermoe layout (with A<->B swap).

    peft operates on the parameter in its native storage layout ``[E, dim1, dim2]``
    where ``in_features=dim1, out_features=dim2``.  ScatterMoE transposes the
    parameter (``W = param.transpose(2, 1)``) giving ``[E, dim2, dim1]`` with
    ``K=dim2, N=dim1``.  Because of this transposition, peft's A and B roles
    are swapped relative to scattermoe's convention.

    peft gives:
        lora_A ``[r*E, dim1]``, lora_B ``[dim2, r*E]``

    scattermoe needs:
        lora_A ``[r*E, K=dim2]``, lora_B ``[N=dim1, r*E]``

    This function swaps A<->B and converts B from rank-major to expert-major.
    Uses vectorized tensor operations (no Python loop over experts).

    Works for **both** gate_up_proj and down_proj since the transposition
    issue is the same for any parameter.
    """
    peft_B_em = peft_lora_B_to_scattermoe(peft_B, num_experts, rank)

    dim1 = peft_A.shape[1]  # peft in_features -> scattermoe N
    dim2 = peft_B_em.shape[0]  # peft out_features -> scattermoe K

    # smoe_A: per expert, transpose B_e [dim2, r] -> [r, dim2]
    # [dim2, E*r] -> [dim2, E, r] -> [E, r, dim2] -> [E*r, dim2]
    smoe_A = (
        peft_B_em.reshape(dim2, num_experts, rank)
        .permute(1, 2, 0)
        .contiguous()
        .reshape(rank * num_experts, dim2)
    )

    # smoe_B: per expert, transpose A_e [r, dim1] -> [dim1, r]
    # [E*r, dim1] -> [E, r, dim1] -> [dim1, E, r] -> [dim1, E*r]
    smoe_B = (
        peft_A.reshape(num_experts, rank, dim1)
        .permute(2, 0, 1)
        .contiguous()
        .reshape(dim1, num_experts * rank)
    )

    return smoe_A, smoe_B


def peft_down_proj_lora_to_scattermoe(peft_A, peft_B, num_experts, rank):
    """Deprecated alias for :func:`peft_lora_to_scattermoe`."""
    return peft_lora_to_scattermoe(peft_A, peft_B, num_experts, rank)


# =============================================================================
# ParamWrapper unwrapping
# =============================================================================


def _unwrap_gate_lora(gate_module):
    """Unwrap peft ``ParamWrapper`` on the router gate.

    When peft targets ``gate.weight``, ``self.gate`` becomes::

        ParamWrapper(weight)
          -> base_layer: OlmoeTopKRouter (the real module)

    This function detects the wrapping and returns the base router, its
    weight tensor, and an optional LoRA delta tensor.

    Returns:
        (base_gate, gate_weight, gate_lora_delta_or_None)

        ``base_gate`` is the original router module (with ``.top_k``,
        ``.num_experts``, ``.norm_topk_prob``).
        ``gate_weight`` is the base router weight (may be a DTensor under FSDP).
        ``gate_lora_delta_or_None`` is the LoRA delta tensor if LoRA is active,
        else ``None``.  Kept separate to avoid mixing DTensor + Tensor in an add.
    """
    if hasattr(gate_module, "base_layer") and hasattr(gate_module, "lora_A"):
        base_gate = gate_module.base_layer
        lora_A, lora_B, scaling = get_lora_params_from_wrapper(gate_module)
        if lora_A is not None:
            # gate weight: [num_experts, hidden_size]
            # lora_A: [r, hidden_size], lora_B: [num_experts, r]
            # delta = scaling * B @ A = [num_experts, hidden_size]
            delta = scaling * (lora_B @ lora_A)
            return base_gate, base_gate.weight, delta
        else:
            return base_gate, base_gate.weight, None
    else:
        # No wrapping — gate is the original module
        return gate_module, gate_module.weight, None


def _convert_smoe_lora(lora_A, lora_B, num_experts, rank, scaling):
    """Convert peft LoRA weights to scattermoe layout."""
    smoe_A, smoe_B = peft_lora_to_scattermoe(lora_A, lora_B, num_experts, rank)
    return (smoe_A, smoe_B, scaling)


def _unwrap_experts_lora(experts_module):
    """Walk a peft ``ParamWrapper`` chain on ``self.experts``.

    When peft targets ``experts.gate_up_proj`` and ``experts.down_proj`` via
    ``target_parameters``, ``self.experts`` becomes a nested chain::

        ParamWrapper(down_proj)
          -> base_layer: ParamWrapper(gate_up_proj)
              -> base_layer: OlmoeExperts (the real module)

    This function walks the chain, collects LoRA params keyed by
    ``parameter_name``, and returns the base experts module.

    Returns:
        (base_experts, gup_lora, down_lora)

        Each ``*_lora`` is either ``(smoe_A, smoe_B, scaling)`` or ``None``.
        A/B are already in scattermoe layout.
    """
    # Collect ParamWrapper layers by their parameter_name
    wrappers = {}
    module = experts_module
    while hasattr(module, "base_layer") and hasattr(module, "lora_A"):
        param_name = getattr(module, "parameter_name", None)
        if param_name is not None:
            wrappers[param_name] = module
        module = module.base_layer

    base_experts = module

    if not wrappers:
        return base_experts, None, None

    # Determine num_experts from base module
    num_experts = getattr(base_experts, "num_experts", None)
    if num_experts is None:
        # Fallback: infer from parameter shape
        gup = getattr(base_experts, "gate_up_proj", None)
        if gup is not None:
            num_experts = gup.shape[0]

    # Extract gate_up_proj LoRA (needs A<->B swap due to transposition)
    gup_lora = None
    gup_wrapper = wrappers.get("gate_up_proj")
    if gup_wrapper is not None:
        lora_A, lora_B, scaling = get_lora_params_from_wrapper(gup_wrapper)
        if lora_A is not None:
            rank = lora_A.shape[0] // num_experts
            gup_lora = _convert_smoe_lora(lora_A, lora_B, num_experts, rank, scaling)

    # Extract down_proj LoRA (needs A<->B swap due to transposition)
    down_lora = None
    down_wrapper = wrappers.get("down_proj")
    if down_wrapper is not None:
        lora_A, lora_B, scaling = get_lora_params_from_wrapper(down_wrapper)
        if lora_A is not None:
            rank = lora_A.shape[0] // num_experts
            down_lora = _convert_smoe_lora(lora_A, lora_B, num_experts, rank, scaling)

    return base_experts, gup_lora, down_lora


# =============================================================================
# Routing helpers
# =============================================================================


def _softmax_topk_route(
    moe_block, base_gate, hidden_states, gate_weight, gate_lora_delta
):
    """Softmax→topk routing (Qwen, OLMoE, Mixtral, MiniMax).

    Returns:
        (routing_weights [T, K], selected_experts [T, K], top_k, num_experts)
    """
    router_logits = F.linear(hidden_states, gate_weight)
    if gate_lora_delta is not None:
        router_logits = router_logits + F.linear(hidden_states, gate_lora_delta)
    routing_weights = F.softmax(router_logits, dim=-1, dtype=torch.float32)

    top_k = base_gate.top_k
    num_experts = base_gate.num_experts
    routing_weights, selected_experts = torch.topk(routing_weights, top_k, dim=-1)

    if getattr(base_gate, "norm_topk_prob", True):
        routing_weights = routing_weights / routing_weights.sum(dim=-1, keepdim=True)

    return routing_weights, selected_experts, top_k, num_experts


def _sigmoid_topk_route(
    moe_block, base_gate, hidden_states, gate_weight, gate_lora_delta
):
    """Sigmoid→topk routing (GLM, DeepSeek V3, MiniMax M2).

    Supports:
    - ``e_score_correction_bias`` on gate or moe_block
    - Group-based expert selection when ``n_group > 1``
    - ``routed_scaling_factor`` applied to final weights
    - Final weights gathered from original sigmoid probs (not bias-corrected)

    Returns:
        (routing_weights [T, K], selected_experts [T, K], top_k, num_experts)
    """
    router_logits = F.linear(hidden_states.float(), gate_weight.float())
    if gate_lora_delta is not None:
        router_logits = router_logits + F.linear(
            hidden_states.float(), gate_lora_delta.float()
        )
    router_probs = router_logits.sigmoid()  # [T, E]

    top_k = getattr(moe_block, "top_k", getattr(base_gate, "top_k", None))
    num_experts = getattr(moe_block, "n_routed_experts", gate_weight.shape[0])

    # Bias-corrected scores for expert selection (not used for final weights).
    # glm_moe_dsa/deepseek_v3 store the bias on gate; minimax_m2 on the block.
    e_score_correction_bias = getattr(base_gate, "e_score_correction_bias", None)
    if e_score_correction_bias is None:
        e_score_correction_bias = getattr(moe_block, "e_score_correction_bias", None)
    if e_score_correction_bias is not None:
        scores_for_choice = router_probs + e_score_correction_bias
    else:
        scores_for_choice = router_probs

    # Group-based selection: pick top groups, mask the rest
    n_group = getattr(moe_block, "n_group", 1)
    if n_group > 1:
        group_scores = (
            scores_for_choice.view(-1, n_group, num_experts // n_group)
            .topk(2, dim=-1)[0]
            .sum(dim=-1)
        )  # [T, n_group]
        topk_group = getattr(moe_block, "topk_group", n_group)
        group_idx = torch.topk(group_scores, k=topk_group, dim=-1, sorted=False)[1]
        group_mask = torch.zeros_like(group_scores)
        group_mask.scatter_(1, group_idx, 1)
        score_mask = (
            group_mask.unsqueeze(-1)
            .expand(-1, n_group, num_experts // n_group)
            .reshape(-1, num_experts)
        )
        scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)

    # Final topk from (possibly masked) scores
    topk_indices = torch.topk(scores_for_choice, k=top_k, dim=-1, sorted=False)[1]

    # Gather weights from original sigmoid scores (not bias-corrected)
    topk_weights = router_probs.gather(1, topk_indices)

    # Optional renormalization + scaling
    if getattr(moe_block, "norm_topk_prob", True):
        topk_weights = topk_weights / (topk_weights.sum(dim=-1, keepdim=True) + 1e-20)
    routed_scaling_factor = getattr(moe_block, "routed_scaling_factor", 1.0)
    topk_weights = topk_weights * routed_scaling_factor

    return topk_weights, topk_indices, top_k, num_experts


def _route(moe_block, base_gate, hidden_states, gate_weight, gate_lora_delta):
    """Dispatch to the correct routing strategy based on block attributes.

    Detects sigmoid routing by the presence of ``e_score_correction_bias``
    on either the gate or the moe_block.
    """
    has_sigmoid = (
        getattr(base_gate, "e_score_correction_bias", None) is not None
        or getattr(moe_block, "e_score_correction_bias", None) is not None
    )
    if has_sigmoid:
        return _sigmoid_topk_route(
            moe_block, base_gate, hidden_states, gate_weight, gate_lora_delta
        )
    return _softmax_topk_route(
        moe_block, base_gate, hidden_states, gate_weight, gate_lora_delta
    )


# =============================================================================
# Shared expert helpers
# =============================================================================


def _compute_shared_expert(moe_block, hidden_states_flat):
    """Compute shared expert output if the block has one.

    Handles singular (qwen2_moe: ``shared_expert``), plural
    (glm_moe_dsa/deepseek_v3: ``shared_experts``), and MLP
    (hunyuan_v1_moe: ``shared_mlp``) attribute names.

    peft wraps individual linear layers inside the shared expert with
    standard LoRA — calling forward() handles this transparently.
    """
    shared_expert = (
        getattr(moe_block, "shared_expert", None)
        or getattr(moe_block, "shared_experts", None)
        or getattr(moe_block, "shared_mlp", None)
    )
    if shared_expert is None:
        return None

    shared_expert_output = shared_expert(hidden_states_flat)

    # Optional sigmoid gate (Qwen2MoE pattern).
    # shared_expert_gate may also be peft-wrapped (standard LoRA
    # on nn.Linear), its forward() applies LoRA automatically.
    shared_expert_gate = getattr(moe_block, "shared_expert_gate", None)
    if shared_expert_gate is not None:
        shared_expert_output = (
            F.sigmoid(shared_expert_gate(hidden_states_flat)) * shared_expert_output
        )

    return shared_expert_output


# =============================================================================
# Layer classes
# =============================================================================


class ScatterMoEGatedMLP(nn.Module):
    def forward(self, layer_input):
        """
        Forward pass of the mixture of experts layer.

        Args:
            layer_input (Tensor):
                Input tensor.

        Returns:
            Tensor:
                Output tensor.
        """
        bsz, length, emb_size = layer_input.size()
        layer_input = layer_input.reshape(-1, emb_size)
        # compute the top_k routing decision
        router_logits = self.router.layer(layer_input)
        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(
            routing_weights, self.router.top_k, dim=-1
        )
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        routing_weights = routing_weights.to(layer_input.dtype)
        sorted_expert_idxs, sorted_scattered_idxs, expert_offsets = flatten_sort_count(
            selected_experts, num_experts=self.router.num_experts
        )

        # compute experts
        gates, h = parallel_linear(
            layer_input,
            self.input_linear.weight.transpose(2, 1),
            self.router.top_k,
            sorted_expert_idxs,
            sorted_scattered_idxs,
            expert_offsets,
            grouped_in=False,
            grouped_out=True,
        ).chunk(2, dim=-1)
        h = self.activation(gates) * h
        layer_output = parallel_linear(
            h,
            self.output_linear.weight.transpose(2, 1),
            1,
            sorted_expert_idxs,
            sorted_scattered_idxs,
            expert_offsets,
            grouped_in=True,
            grouped_out=False,
            gates=routing_weights,
        )
        layer_output = layer_output.view(bsz, length, emb_size)
        return layer_output


class HFScatterMoEGatedMLP(nn.Module):
    """
    ScatterMoE-accelerated forward pass for HF MoEs.

    Used as a kernel layer via the HF ``kernels`` library.  The ``forward``
    method replaces the original SparseMoeBlock.forward.

    Supports:

    * **Softmax→topk routing**: OLMoE, Qwen2/3MoE, Mixtral, MiniMax
    * **Sigmoid→topk routing**: GLM, DeepSeek V3, MiniMax M2
    * **Full-parameter training**: uses ``parallel_linear`` (base ScatterMoE)
    * **LoRA fine-tuning**: detects peft ``ParamWrapper`` on ``self.experts``,
      extracts adapter weights, and uses ``parallel_linear_lora`` (fused kernel)
    """

    @staticmethod
    def forward(self: nn.Module, layer_input: torch.Tensor):
        """
        Forward pass using ScatterMoE kernels.

        Args:
            self: The MoeSparseMoeBlock module containing:
                - self.gate: Router (or peft ParamWrapper wrapping it)
                - self.experts: Experts module (or peft ParamWrapper chain)
                - self.shared_expert(s): Optional shared expert
                - self.shared_expert_gate: Optional shared expert gate
            layer_input: Input tensor [batch_size, seq_len, hidden_size]

        Returns:
            Tensor: [batch_size, seq_len, hidden_size]
        """
        batch_size, sequence_length, hidden_dim = layer_input.shape
        hidden_states_flat = layer_input.view(-1, hidden_dim)

        # ====================================================================
        # Shared Expert (if present, e.g. Qwen2MoE, DeepSeek V3)
        # ====================================================================
        shared_expert_output = _compute_shared_expert(self, hidden_states_flat)

        # ====================================================================
        # Router Computation (with optional gate LoRA)
        # ====================================================================
        base_gate, gate_weight, gate_lora_delta = _unwrap_gate_lora(self.gate)
        routing_weights, selected_experts, top_k, num_experts = _route(
            self, base_gate, hidden_states_flat, gate_weight, gate_lora_delta
        )
        routing_weights = routing_weights.to(hidden_states_flat.dtype)

        sorted_expert_idxs, sorted_scattered_idxs, expert_offsets = flatten_sort_count(
            selected_experts, num_experts=num_experts
        )

        # ====================================================================
        # Detect LoRA (peft ParamWrapper) and extract adapter weights
        # ====================================================================
        experts, gup_lora, down_lora = _unwrap_experts_lora(self.experts)

        # ====================================================================
        # Selective expert weight dequantization
        # ====================================================================
        # When experts are BnB-quantized (quantize_moe_experts), dequantize
        # only the active experts instead of all E. This saves ~97% memory
        # for the transient dequant buffer when few experts are active.
        use_selective = (
            getattr(self, "_use_selective_dequant", False)
            and hasattr(experts, "parametrizations")
            and "gate_up_proj" in experts.parametrizations
        )

        if use_selective:
            from axolotl.integrations.kernels.libs.scattermoe_lora.selective_dequant import (
                get_active_experts,
                remap_expert_indices,
                selective_expert_weights,
                selective_lora_weights,
            )

            active_experts = get_active_experts(sorted_expert_idxs, num_experts)
            remapped_expert_idxs, compact_offsets = remap_expert_indices(
                sorted_expert_idxs,
                expert_offsets,
                active_experts,
                num_experts,
            )
            # Dequantize only active experts' weights
            gate_up_W = selective_expert_weights(
                experts,
                "gate_up_proj",
                active_experts,
            ).transpose(2, 1)  # [num_active, hidden, 2*inter]

            # Remap LoRA weights to match compact expert indices
            if gup_lora is not None:
                gup_A, gup_B, gup_scaling = gup_lora
                gup_A, gup_B = selective_lora_weights(
                    gup_A,
                    gup_B,
                    active_experts,
                    num_experts,
                )
                gup_lora = (gup_A, gup_B, gup_scaling)

            # Use remapped indices for ScatterMoE kernels
            sei_gup = remapped_expert_idxs
            eo_gup = compact_offsets
        else:
            gate_up_W = experts.gate_up_proj.transpose(2, 1)  # [E, hidden, 2*inter]
            sei_gup = sorted_expert_idxs
            eo_gup = expert_offsets

        # ====================================================================
        # Gate + Up projection
        # ====================================================================
        if gup_lora is not None:
            gup_A, gup_B, gup_scaling = gup_lora
            gup = parallel_linear_lora(
                hidden_states_flat,
                gate_up_W,
                top_k,
                sei_gup,
                sorted_scattered_idxs,
                eo_gup,
                lora_A=gup_A,
                lora_B=gup_B,
                scaling=gup_scaling,
                grouped_in=False,
                grouped_out=True,
                use_fused_dX=True,
                use_fused_gather=True,
            )
        else:
            gup = parallel_linear(
                hidden_states_flat,
                gate_up_W,
                top_k,
                sei_gup,
                sorted_scattered_idxs,
                eo_gup,
                grouped_in=False,
                grouped_out=True,
            )

        gates, h = gup.chunk(2, dim=-1)
        h = experts.act_fn(gates) * h

        # ====================================================================
        # Down projection
        # ====================================================================
        if use_selective:
            down_W = selective_expert_weights(
                experts,
                "down_proj",
                active_experts,
            ).transpose(2, 1)  # [num_active, inter, hidden]

            if down_lora is not None:
                down_A, down_B, down_scaling = down_lora
                down_A, down_B = selective_lora_weights(
                    down_A,
                    down_B,
                    active_experts,
                    num_experts,
                )
                down_lora = (down_A, down_B, down_scaling)

            sei_down = remapped_expert_idxs
            eo_down = compact_offsets
        else:
            down_W = experts.down_proj.transpose(2, 1)  # [E, inter, hidden]
            sei_down = sorted_expert_idxs
            eo_down = expert_offsets

        if down_lora is not None:
            down_A, down_B, down_scaling = down_lora
            expert_output = parallel_linear_lora(
                h,
                down_W,
                1,
                sei_down,
                sorted_scattered_idxs,
                eo_down,
                lora_A=down_A,
                lora_B=down_B,
                scaling=down_scaling,
                gates=routing_weights,
                grouped_in=True,
                grouped_out=False,
                use_fused_dX=True,
                use_fused_gather=True,
            )
        else:
            expert_output = parallel_linear(
                h,
                down_W,
                1,
                sei_down,
                sorted_scattered_idxs,
                eo_down,
                grouped_in=True,
                grouped_out=False,
                gates=routing_weights,
            )

        # ====================================================================
        # Combine with shared expert and reshape
        # ====================================================================
        if shared_expert_output is not None:
            expert_output = expert_output + shared_expert_output

        expert_output = expert_output.view(batch_size, sequence_length, hidden_dim)
        return expert_output


================================================
FILE: src/axolotl/integrations/kernels/libs/scattermoe_lora/lora_ops.py
================================================
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Axolotl AI
# Licensed under the Apache License, Version 2.0

"""
ParallelExperts module with LoRA support.

Provides a drop-in replacement for ScatterMoE's ParallelExperts that
uses the fused LoRA kernel when adapter weights are attached.
"""

from typing import Optional

import torch
import torch.nn as nn

from .parallel_linear_lora import parallel_linear_lora


class ParallelExperts(nn.Module):
    """
    Parallel Experts with fused LoRA support.

    Drop-in replacement for the original ParallelExperts. When LoRA parameters
    are attached via set_lora(), the forward pass uses a fused kernel:
        Y = X @ W + scaling * (X @ A^T) @ B^T
    """

    def __init__(
        self,
        num_experts: int,
        input_size: int,
        output_size: int,
        bias: bool = False,
    ) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.empty(num_experts, output_size, input_size))
        if bias:
            self.bias = nn.Parameter(torch.empty(num_experts, output_size))
        else:
            self.bias = None
        self.num_experts = num_experts
        self.input_size = input_size
        self.output_size = output_size
        self._lora_A: torch.Tensor | None = None
        self._lora_B: torch.Tensor | None = None
        self._lora_scaling: float | None = None
        self.reset_parameters()

    def reset_parameters(self) -> None:
        nn.init.normal_(self.weight, std=0.02)
        if self.bias is not None:
            nn.init.zeros_(self.bias)

    def extra_repr(self) -> str:
        return (
            f"num_experts={self.num_experts}, "
            f"input_size={self.input_size}, "
            f"output_size={self.output_size}"
        )

    def set_lora(self, lora_A: torch.Tensor, lora_B: torch.Tensor, scaling: float):
        """Attach LoRA parameters for fused computation."""
        self._lora_A = lora_A
        self._lora_B = lora_B
        self._lora_scaling = scaling

    def clear_lora(self):
        """Remove LoRA parameters."""
        self._lora_A = None
        self._lora_B = None
        self._lora_scaling = None

    def forward(
        self,
        inputs: torch.Tensor,
        k: int,
        sorted_expert_idxs: torch.Tensor,
        sorted_scattered_idxs: torch.Tensor,
        expert_offsets: torch.Tensor,
        gates: Optional[torch.Tensor] = None,
        grouped_in: bool = False,
        grouped_out: bool = False,
    ) -> torch.Tensor:
        return parallel_linear_lora(
            inputs,
            self.weight.permute(0, 2, 1),  # [E, input, output]
            k,
            sorted_expert_idxs,
            sorted_scattered_idxs,
            expert_offsets,
            lora_A=self._lora_A,
            lora_B=self._lora_B,
            scaling=self._lora_scaling if self._lora_scaling is not None else 1.0,
            expert_biases=self.bias,
            gates=gates,
            grouped_in=grouped_in,
            grouped_out=grouped_out,
        )


================================================
FILE: src/axolotl/integrations/kernels/libs/scattermoe_lora/parallel_experts.py
================================================
# SPDX-License-Identifier: Apache-2.0
# Adapted from https://github.com/shawntan/scattermoe
# Copyright (c) Shawn Tan and ScatterMoE Contributors
# Licensed under the Apache License, Version 2.0
# See https://github.com/shawntan/scattermoe/blob/main/LICENSE

from typing import Optional

import torch
import torch.nn as nn

from . import kernels


@torch.library.custom_op("scattermoe::bincount", mutates_args={})
def compileable_bincount(x: torch.Tensor, minlength: int) -> torch.Tensor:
    return x.bincount(minlength=minlength)


@compileable_bincount.register_fake
def _(x: torch.Tensor, minlength: int) -> torch.Tensor:
    return torch.empty(minlength, dtype=torch.long, device=x.device)


@torch.compile
def flatten_sort_count(expert_idxs: torch.Tensor, num_experts: int):
    with torch.no_grad():
        flattened_expert_idxs = expert_idxs.flatten()
        sorted_expert_idxs, sorted_scattered_idxs = torch.sort(flattened_expert_idxs)
        expert_counts = compileable_bincount(
            flattened_expert_idxs, minlength=num_experts
        )
        expert_offsets = expert_counts.cumsum(-1)
        return sorted_expert_idxs, sorted_scattered_idxs, expert_offsets


class ParallelLinear(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        x: torch.Tensor,
        expert_weights: torch.Tensor,
        k: int,
        sorted_expert_idxs: torch.Tensor,
        sorted_scattered_idxs: torch.Tensor,
        expert_offsets: torch.Tensor,
        expert_biases: Optional[torch.Tensor] = None,
        gates: Optional[torch.Tensor] = None,
        grouped_in: bool = False,
        grouped_out: bool = False,
    ):
        with torch.device(x.device):
            output = kernels.ops.scatter2scatter(
                X=x,
                W=expert_weights,
                b=expert_biases,
                k=k,
                sorted_expert_idxs=sorted_expert_idxs,
                sorted_scattered_idxs=sorted_scattered_idxs,
                x_grouped=grouped_in,
                y_grouped=grouped_out,
            )
            if gates is not None:
                output_expanded = output.view(
                    gates.size(0), gates.size(1), output.size(-1)
                )
                output = (gates.unsqueeze(1) @ output_expanded).squeeze(1)
            else:
                output_expanded = None

            ctx.save_for_backward(
                x,
                expert_weights,
                expert_biases,
                sorted_expert_idxs,
                sorted_scattered_idxs,
                expert_offsets,
                gates,
                output_expanded,
            )
            ctx.grouped_in = grouped_in
            ctx.grouped_out = grouped_out
            ctx.k = k
        return output

    @staticmethod
    def backward(ctx, grad_out: torch.Tensor):
        with torch.device(grad_out.device):
            (
                x,
                expert_weights,
                expert_biases,
                sorted_expert_idxs,
                sorted_scattered_idxs,
                expert_offsets,
                gates,
                output_expanded,
            ) = ctx.saved_tensors
            k = ctx.k
            grouped_in = ctx.grouped_in
            grouped_out = ctx.grouped_out

            if gates is not None:
                # calculate gates gradient
                # d_gates = torch.bmm(output_expanded, grad_out[:, :, None]).squeeze(-1)
                d_gates = (output_expanded @ grad_out.unsqueeze(-1)).squeeze(-1)
                gates_flat = gates.flatten()
                gate_fan = gates.size(1)
                grouped_grad_out = output_expanded.flatten(
                    0, 1
                )  # reuse expanded buffer later
            else:
                d_gates = None
                gates_flat = None
                gate_fan = 1
                grouped_grad_out = None

            if grouped_out:
                grouped_grad_out = grad_out
            else:
                grouped_grad_out = kernels.ops.group(
                    grad_out,
                    sorted_scattered_idxs,
                    fan_out=gate_fan,
                    coeff=gates_flat,
                    out=grouped_grad_out,
                )
            if grouped_in:
                grouped_x = x
                d_expanded_input = None
            else:
                grouped_x = kernels.ops.group(x, sorted_scattered_idxs, fan_out=k)
                d_expanded_input = grouped_x

            d_weights, d_biases = kernels.ops.group_bwd_W(
                DY=grouped_grad_out,
                X=grouped_x,
                expert_offsets=expert_offsets,
                E=expert_weights.size(0),
                has_bias=expert_biases is not None,
            )

            d_expanded_input = kernels.ops.scatter2scatter(
                X=grouped_grad_out,
                x_grouped=True,
                W=expert_weights.permute(0, 2, 1),
                sorted_expert_idxs=sorted_expert_idxs,
                sorted_scattered_idxs=sorted_scattered_idxs,
                k=1,
                y_grouped=grouped_in,
                out=d_expanded_input,  # Reuse grouped_x buffer
            )

            if k == 1:
                d_input = d_expanded_input
            else:
                d_input = d_expanded_input.view(
                    x.size(0), k, d_expanded_input.size(-1)
                ).sum(-2)
        return (
            # x, expert_weights,
            d_input,
            d_weights,
            # k, sorted_expert_idxs, sorted_scattered_idxs, expert_offsets,
            None,
            None,
            None,
            None,
            # bias, gates
            d_biases,
            d_gates,
            # grouped_in, grouped_out,
            None,
            None,
        )


def parallel_linear(
    inputs,
    expert_weights,
    k,
    sorted_expert_idxs,
    sorted_scattered_idxs,
    expert_offsets,
    expert_biases=None,
    gates=None,
    grouped_in=False,
    grouped_out=False,
):
    results = ParallelLinear.apply(
        inputs,
        expert_weights,
        k,
        sorted_expert_idxs,
        sorted_scattered_idxs,
        expert_offsets,
        expert_biases,
        gates,
        grouped_in,
        grouped_out,
    )
    return results


class ParallelExperts(nn.Module):
    def __init__(self, num_experts, input_size, output_size, bias=False) -> None:
        super().__init__()
        self.weight = nn.Parameter(torch.empty(num_experts, output_size, input_size))

        if bias:
            self.bias = nn.Parameter(torch.empty(num_experts, output_size))
        else:
            self.bias = None

        self.num_experts = num_experts
        self.input_size = input_size
        self.output_size = output_size
        self.reset_parameters()

    def extra_repr(self):
        return "num_experts={}, input_size={}, output_size={}".format(
            self.num_experts, self.input_size, self.output_size
        )

    def reset_parameters(self) -> None:
        nn.init.normal_(self.weight, std=0.02)
        if self.bias is not None:
            nn.init.zeros_(self.bias)

    def forward(
        self,
        inputs,
        k,
        sorted_expert_idxs,
        sorted_scattered_idxs,
        expert_offsets,
        gates=None,
        grouped_in=False,
        grouped_out=False,
    ):
        results = parallel_linear(
            inputs,
            self.weight.permute(0, 2, 1),
            k,
            sorted_expert_idxs,
            sorted_scattered_idxs,
            expert_offsets,
            expert_biases=self.bias,
            gates=gates,
            grouped_in=grouped_in,
            grouped_out=grouped_out,
        )
        return results


================================================
FILE: src/axolotl/integrations/kernels/libs/scattermoe_lora/parallel_linear_lora.py
================================================
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Axolotl AI
# Licensed under the Apache License, Version 2.0

"""
ScatterMoE + LoRA Autograd Function
====================================

Provides the autograd function and Python interface for fused ScatterMoE + LoRA.

Key design for LoRA training:
  - Expert weights W are FROZEN (no gradient computed for W).
  - Only LoRA adapter weights (A, B) receive gradients.
  - The input gradient dX is still computed (needed for upstream layers).
  - This avoids the expensive group_bwd_W computation entirely.

Forward:
  Y = X @ W + scaling * (X @ A^T) @ B^T

Backward (W frozen):
  dX = dY @ W^T + scaling * (dY @ B) @ A          (via scatter2scatter for base, separate for LoRA)
  dA = scaling * (dY @ B)^T @ X                     (per-expert, on grouped data)
  dB = scaling * dY^T @ (X @ A^T)                   (per-expert, on grouped data)
"""

from typing import Optional

import torch

from .kernels import ops as base_ops
from .kernels.lora_ops import (
    group_bwd_lora,
    group_bwd_lora_fused,
    scatter2scatter_lora,
    scatter2scatter_lora_dX,
)


class ScatterMoELoRA(torch.autograd.Function):
    """
    Autograd function for fused ScatterMoE + LoRA with frozen expert weights.

    This function is optimized for the LoRA fine-tuning scenario where:
    - Expert weights W are frozen (requires_grad=False)
    - Only LoRA A and B matrices receive gradients
    - Input gradients are computed for upstream layer backprop
    """

    @staticmethod
    def forward(
        ctx,
        x: torch.Tensor,
        expert_weights: torch.Tensor,
        k: int,
        sorted_expert_idxs: torch.Tensor,
        sorted_scattered_idxs: torch.Tensor,
        expert_offsets: torch.Tensor,
        lora_A: torch.Tensor,
        lora_B: torch.Tensor,
        scaling: float,
        expert_biases: Optional[torch.Tensor] = None,
        gates: Optional[torch.Tensor] = None,
        grouped_in: bool = False,
        grouped_out: bool = False,
        use_fused_dX: bool = False,
        use_fused_gather: bool = False,
    ):
        with torch.device(x.device):
            # Fused forward: Y = X @ W + scaling * (X @ A^T) @ B^T
            output = scatter2scatter_lora(
                X=x,
                W=expert_weights,
                sorted_expert_idxs=sorted_expert_idxs,
                sorted_scattered_idxs=sorted_scattered_idxs,
                k=k,
                lora_A=lora_A,
                lora_B=lora_B,
                scaling=scaling,
                b=expert_biases,
                x_grouped=grouped_in,
                y_grouped=grouped_out,
            )

            # Handle gating (weighted combination of top-k expert outputs)
            if gates is not None:
                output_expanded = output.view(
                    gates.size(0), gates.size(1), output.size(-1)
                )
                output = (gates.unsqueeze(1) @ output_expanded).squeeze(1)
            else:
                output_expanded = None

            ctx.save_for_backward(
                x,
                lora_A,
                lora_B,
                sorted_expert_idxs,
                sorted_scattered_idxs,
                expert_offsets,
                gates,
                output_expanded,
            )
            # Store frozen weights as plain Python attributes instead of
            # save_for_backward.  This avoids:
            # 1. Version-check conflicts with FSDP unshard/reshard
            # 2. Pinning all-gathered parameters via saved_tensors hooks
            # 3. Interfering with activation offloading pack/unpack hooks
            # Safe because expert_weights are frozen (requires_grad=False).
            ctx.expert_weights = expert_weights
            ctx.expert_biases = expert_biases
            ctx.grouped_in = grouped_in
            ctx.grouped_out = grouped_out
            ctx.k = k
            ctx.scaling = scaling
            ctx.use_fused_dX = use_fused_dX
            ctx.use_fused_gather = use_fused_gather

        return output

    @staticmethod
    def backward(ctx, grad_out: torch.Tensor):
        with torch.device(grad_out.device):
            (
                x,
                lora_A,
                lora_B,
                sorted_expert_idxs,
                sorted_scattered_idxs,
                expert_offsets,
                gates,
                output_expanded,
            ) = ctx.saved_tensors
            expert_weights = ctx.expert_weights

            k = ctx.k
            scaling = ctx.scaling
            grouped_in = ctx.grouped_in
            grouped_out = ctx.grouped_out
            E = expert_weights.size(0)

            # ------------------------------------------------------------------
            # Gate gradients (if using top-k gating with routing weights)
            # ------------------------------------------------------------------
            if gates is not None:
                # d_gates[t, j] = output_expanded[t, j, :] . grad_out[t, :]
                d_gates = (output_expanded @ grad_out.unsqueeze(-1)).squeeze(-1)
                gates_flat = gates.flatten()
                gate_fan = gates.size(1)
                # Reuse output_expanded buffer for grouped_grad_out
                grouped_grad_out = output_expanded.flatten(0, 1)
            else:
                d_gates = None
                gates_flat = None
                gate_fan = 1
                grouped_grad_out = None

            # ------------------------------------------------------------------
            # LoRA gradients (dA, dB) and setup for dX
            # ------------------------------------------------------------------
            # Fused gather uses sorted_scattered_idxs for indirect X access
            # in the Triton kernel, avoiding the group(x) allocation.
            #
            # can_fuse_gather: X is ungrouped and not too large for scatter loads
            #   - When gates is None and grouped_out=False: both DY and X ungrouped
            #   - When grouped_out=True (gate_up_proj): DY already grouped, X ungrouped
            #     -> use dy_grouped=True in the fused kernel
            M_total = sorted_scattered_idxs.size(0)
            K_dim = x.size(-1)
            N_dim = expert_weights.size(-1)
            fuse_gather_workload = M_total * max(K_dim, N_dim)
            _FUSE_GATHER_THRESHOLD = 2**24  # ~16M elements

            can_fuse_gather = (
                ctx.use_fused_gather
                and not grouped_in  # X must be ungrouped for scatter access
                and gates is None  # gate coeff requires multiplicative gather
                and fuse_gather_workload < _FUSE_GATHER_THRESHOLD
            )

            if can_fuse_gather:
                # ------------------------------------------------------------------
                # Fused path: skip group(x) entirely
                # ------------------------------------------------------------------
                d_expanded_input = None

                d_lora_A, d_lora_B = group_bwd_lora_fused(
                    DY=grad_out,
                    X=x,
                    lora_A=lora_A,
                    lora_B=lora_B,
                    expert_offsets=expert_offsets,
                    sorted_scattered_idxs=sorted_scattered_idxs,
                    E=E,
                    k=k,
                    scaling=scaling,
                    dy_grouped=grouped_out,
                )

                # Prepare grouped_grad_out for the dX path (needed by both
                # the fused dX kernel when grouped_out=True, and the non-fused path)
                if grouped_out:
                    grouped_grad_out = grad_out
                elif not ctx.use_fused_dX:
                    grouped_grad_out = base_ops.group(
                        grad_out,
                        sorted_scattered_idxs,
                        fan_out=gate_fan,
                        coeff=gates_flat,
                        out=grouped_grad_out,
                    )
            else:
                # ------------------------------------------------------------------
                # Original path: explicit group() calls
                # ------------------------------------------------------------------
                if grouped_out:
                    grouped_grad_out = grad_out
                else:
                    grouped_grad_out = base_ops.group(
                        grad_out,
                        sorted_scattered_idxs,
                        fan_out=gate_fan,
                        coeff=gates_flat,
                        out=grouped_grad_out,
                    )

                if grouped_in:
                    grouped_x = x
                    d_expanded_input = None
                else:
                    grouped_x = base_ops.group(x, sorted_scattered_idxs, fan_out=k)
                    d_expanded_input = grouped_x  # Will be overwritten; reuse buffer

                d_lora_A, d_lora_B = group_bwd_lora(
                    DY=grouped_grad_out,
                    X=grouped_x,
                    lora_A=lora_A,
                    lora_B=lora_B,
                    expert_offsets=expert_offsets,
                    E=E,
                    scaling=scaling,
                )

            # ------------------------------------------------------------------
            # Input gradient: dX = dY @ W^T + scaling * (dY @ B) @ A
            # ------------------------------------------------------------------
            if ctx.use_fused_dX:
                if can_fuse_gather and not grouped_out:
                    # Fully fused: read ungrouped DY via scatter pattern
                    d_expanded_input = scatter2scatter_lora_dX(
                        DY=grad_out,
                        W=expert_weights,
                        sorted_expert_idxs=sorted_expert_idxs,
                        sorted_scattered_idxs=sorted_scattered_idxs,
                        k=1,
                        lora_A=lora_A,
                        lora_B=lora_B,
                        scaling=scaling,
                        dy_grouped=False,
                        dx_grouped=grouped_in,
                        out=d_expanded_input,
                    )
                else:
                    # Fused dX only: read from pre-grouped DY
                    d_expanded_input = scatter2scatter_lora_dX(
                        DY=grouped_grad_out,
                        W=expert_weights,
                        sorted_expert_idxs=sorted_expert_idxs,
                        sorted_scattered_idxs=sorted_scattered_idxs,
                        k=1,
                        lora_A=lora_A,
                        lora_B=lora_B,
                        scaling=scaling,
                        dy_grouped=True,
                        dx_grouped=grouped_in,
                        out=d_expanded_input,
                    )
            else:
                # Original path: separate base scatter2scatter + LoRA Python loop
                d_expanded_input = base_ops.scatter2scatter(
                    X=grouped_grad_out,
                    x_grouped=True,
                    W=expert_weights.permute(0, 2, 1),  # [E, N, K]
                    sorted_expert_idxs=sorted_expert_idxs,
                    sorted_scattered_idxs=sorted_scattered_idxs,
                    k=1,
                    y_grouped=grouped_in,
                    out=d_expanded_input,
                )

                # LoRA part: dX_lora = scaling * (dY @ B) @ A
                if scaling != 0.0:
                    d_input_lora_grouped = _compute_lora_input_grad(
                        grouped_grad_out,
                        lora_A,
                        lora_B,
                        expert_offsets,
                        E,
                        scaling,
                    )
                    if grouped_in:
                        d_expanded_input.add_(d_input_lora_grouped)
                    else:
                        # Scatter-add LoRA gradient directly into d_expanded_input.
                        # Avoids allocating a zeros_like + add result
                        d_expanded_input[sorted_scattered_idxs] += d_input_lora_grouped

            # Reduce over top-k if k > 1
            if k == 1:
                d_input = d_expanded_input
            else:
                d_input = d_expanded_input.view(
                    x.size(0), k, d_expanded_input.size(-1)
                ).sum(-2)

            # W is frozen during LoRA training -- skip weight gradient
            d_weights = (
                torch.zeros_like(expert_weights)
                if expert_weights.requires_grad
                else None
            )
            d_biases = None

        return (
            d_input,
            d_weights,
            None,
            None,
            None,
            None,  # k, sorted indices, offsets
            d_lora_A,
            d_lora_B,
            None,  # lora_A, lora_B, scaling
            d_biases,
            d_gates,
            None,
            None,  # grouped_in, grouped_out
            None,  # use_fused_dX
            None,  # use_fused_gather
        )


def _compute_lora_input_grad(
    grouped_grad_out: torch.Tensor,
    lora_A: torch.Tensor,
    lora_B: torch.Tensor,
    expert_offsets: torch.Tensor,
    E: int,
    scaling: float,
) -> torch.Tensor:
    """
    Compute the LoRA contribution to the input gradient:
      dX_lora = scaling * (dY @ B) @ A

    Uses PyTorch ops on expert-grouped data.
    Each expert e: dX_e = scaling * (dY_e @ B_e) @ A_e
    """
    R = lora_A.size(0) // E
    K = lora_A.size(1)
    M_total = grouped_grad_out.size(0)

    d_input_lora = torch.zeros(
        (M_total, K), device=grouped_grad_out.device, dtype=grouped_grad_out.dtype
    )

    compute_dtype = grouped_grad_out.dtype

    prev_offset = 0
    for e in range(E):
        curr_offset = expert_offsets[e].item()
        if curr_offset > prev_offset:
            dy_e = grouped_grad_out[prev_offset:curr_offset]  # [M_e, N]
            a_e = lora_A[e * R : (e + 1) * R, :].to(compute_dtype)  # [r, K]
            b_e = lora_B[:, e * R : (e + 1) * R].to(compute_dtype)  # [N, r]

            # dX_e = scaling * (dY_e @ B_e) @ A_e
            dy_b = dy_e @ b_e  # [M_e, r]
            dx_e = scaling * (dy_b @ a_e)  # [M_e, K]
            d_input_lora[prev_offset:curr_offset] = dx_e

        prev_offset = curr_offset

    return d_input_lora


# =============================================================================
# Helper: Extract LoRA params from PEFT ParamWrapper
# =============================================================================


def get_lora_params_from_wrapper(module) -> tuple:
    """
    Extract LoRA parameters from a PEFT ParamWrapper.

    Returns:
        (lora_A, lora_B, scaling) if LoRA is active, else (None, None, None)
    """
    if not hasattr(module, "lora_A") or not hasattr(module, "lora_B"):
        return None, None, None

    active_adapters = getattr(module, "active_adapters", ["default"])
    if not active_adapters:
        return None, None, None

    adapter_name = active_adapters[0]

    lora_A_dict = getattr(module, "lora_A", {})
    lora_B_dict = getattr(module, "lora_B", {})
    scaling_dict = getattr(module, "scaling", {})

    if adapter_name not in lora_A_dict:
        return None, None, None

    lora_A = lora_A_dict[adapter_name].weight
    lora_B = lora_B_dict[adapter_name].weight
    scaling = scaling_dict[adapter_name]

    return lora_A, lora_B, scaling


# =============================================================================
# Drop-in replacement for parallel_linear
# =============================================================================


def parallel_linear_lora(
    inputs: torch.Tensor,
    expert_weights: torch.Tensor,
    k: int,
    sorted_expert_idxs: torch.Tensor,
    sorted_scattered_idxs: torch.Tensor,
    expert_offsets: torch.Tensor,
    lora_A: Optional[torch.Tensor] = None,
    lora_B: Optional[torch.Tensor] = None,
    scaling: float = 1.0,
    expert_biases: Optional[torch.Tensor] = None,
    gates: Optional[torch.Tensor] = None,
    grouped_in: bool = False,
    grouped_out: bool = False,
    use_fused_dX: bool = False,
    use_fused_gather: bool = False,
):
    """
    Drop-in replacement for parallel_linear that supports LoRA.

    If lora_A and lora_B are provided, uses fused LoRA kernel.
    Otherwise falls back to standard scatter2scatter.
    """
    if lora_A is not None and lora_B is not None:
        return ScatterMoELoRA.apply(
            inputs,
            expert_weights,
            k,
            sorted_expert_idxs,
            sorted_scattered_idxs,
            expert_offsets,
            lora_A,
            lora_B,
            scaling,
            expert_biases,
            gates,
            grouped_in,
            grouped_out,
            use_fused_dX,
            use_fused_gather,
        )
    else:
        from .parallel_experts import ParallelLinear

        return ParallelLinear.apply(
            inputs,
            expert_weights,
            k,
            sorted_expert_idxs,
            sorted_scattered_idxs,
            expert_offsets,
            expert_biases,
            gates,
            grouped_in,
            grouped_out,
        )


================================================
FILE: src/axolotl/integrations/kernels/libs/scattermoe_lora/selective_dequant.py
================================================
"""
Selective Expert Dequantization
===============================

Instead of dequantizing all E expert weight matrices at once (which creates
a ~1 GB transient buffer for 256 experts), only dequantize the experts that
are actually routed to by the current batch's top-k selection.

For Qwen3.5-35B-A3B (E=256, top_k=8, hidden=2048, intermediate=512):
  - Full dequant: [256, 2048, 1024] = 1,074 MB per projection
  - Selective (8 active): [8, 2048, 1024] = 33.5 MB per projection
  - Savings: ~97% memory reduction per layer

This module provides format-agnostic selective weight extraction:
  - BnB 4-bit (nf4/fp4): slice quantized data + absmax per expert
  - bf16/fp32: direct indexing (no dequant needed)
  - FP8: slice + cast

The ScatterMoE kernel itself doesn't change — we remap expert indices
from global (0..E-1) to compact (0..num_active-1) and pass the smaller
weight tensor.
"""

import torch
import torch.nn as nn


def get_active_experts(sorted_expert_idxs: torch.Tensor, E: int) -> torch.Tensor:
    """Get sorted unique expert indices from the routing output.

    Args:
        sorted_expert_idxs: Expert assignments sorted by expert id [T*k]
        E: Total number of experts

    Returns:
        active: Sorted unique expert indices [num_active]
    """
    return torch.unique(sorted_expert_idxs)


def remap_expert_indices(
    sorted_expert_idxs: torch.Tensor,
    expert_offsets: torch.Tensor,
    active_experts: torch.Tensor,
    E: int,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Remap global expert indices to compact indices.

    Maps expert ids from [0..E-1] to [0..num_active-1], preserving the
    sort order. Also compacts expert_offsets to only active experts.

    Args:
        sorted_expert_idxs: [T*k] expert ids in sorted order
        expert_offsets: [E] cumulative token counts (original)
        active_experts: [num_active] sorted unique expert ids
        E: Total number of experts

    Returns:
        remapped_idxs: [T*k] expert ids in [0..num_active-1]
        compact_offsets: [num_active] cumulative token counts
    """
    # Build remap table: global_id -> compact_id
    remap = torch.empty(E, dtype=torch.long, device=sorted_expert_idxs.device)
    remap[active_experts] = torch.arange(
        len(active_experts), device=sorted_expert_idxs.device
    )

    remapped_idxs = remap[sorted_expert_idxs]

    # Compact the expert_offsets: only keep active experts' cumulative counts
    compact_offsets = expert_offsets[active_experts]

    return remapped_idxs, compact_offsets


def _selective_dequant_bnb4(
    raw_param: torch.Tensor,
    quant_state,
    active_experts: torch.Tensor,
    expert_shape: tuple[int, int],
) -> torch.Tensor:
    """Dequantize only selected experts from BnB 4-bit packed data.

    The raw parameter is a flattened 4-bit packed tensor. Each expert's
    data is contiguous (stored in expert-major order), so we can gather
    the packed data and absmax blocks for active experts, then dequantize
    as one contiguous block.

    Args:
        raw_param: Flattened uint8 tensor of packed 4-bit weights
        quant_state: BnB QuantState with absmax, blocksize, code, etc.
        active_experts: [num_active] expert indices to dequantize
        expert_shape: (dim1, dim2) shape per expert (e.g. (1024, 2048))

    Returns:
        Dequantized weights [num_active, dim1, dim2] in original dtype
    """
    import bitsandbytes.functional as F  # noqa: N812
    from bitsandbytes.functional import QuantState

    expert_numel = expert_shape[0] * expert_shape[1]
    packed_per_expert = expert_numel // 2  # 4-bit = 2 values per byte
    blocks_per_expert = expert_numel // quant_state.blocksize
    num_active = len(active_experts)

    if blocks_per_expert == 0:
        # Expert is smaller than one quantization block — blocks span across
        # expert boundaries, so per-expert slicing isn't possible.
        # Fallback: full dequantize + index.
        full = F.dequantize_4bit(raw_param, quant_state)
        E_total = full.numel() // expert_numel
        return full.reshape(E_total, *expert_shape)[active_experts]

    # Use fused Triton kernel for NF4 (handles selective gather + dequant in one pass)
    if quant_state.quant_type == "nf4" and raw_param.dtype == torch.uint8:
        from axolotl.integrations.kernels.libs.scattermoe_lora.selective_dequant_kernel import (
            selective_dequant_nf4_triton,
        )

        # Handle nested (double) quantization: dequantize absmax first
        # BnB uses dequantize_blockwise (not _4bit) for nested absmax + offset
        if quant_state.nested:
            absmax = F.dequantize_blockwise(quant_state.absmax, quant_state.state2)
            absmax += quant_state.offset
            if absmax.dtype != torch.float32:
                absmax = absmax.float()
        else:
            absmax = quant_state.absmax

        return selective_dequant_nf4_triton(
            packed_data=raw_param,
            absmax=absmax,
            active_experts=active_experts,
            expert_shape=expert_shape,
            blocksize=quant_state.blocksize,
            dtype=quant_state.dtype,
            codebook=quant_state.code,
        )

    # Fallback: gather + BnB dequant (for fp4 or non-uint8 packed formats)
    raw_flat = raw_param.reshape(-1)

    offsets_qt = (
        active_experts.long()[:, None] * packed_per_expert
        + torch.arange(packed_per_expert, device=raw_param.device)[None, :]
    ).reshape(-1)
    qt_gathered = raw_flat[offsets_qt]

    offsets_abs = (
        active_experts.long()[:, None] * blocks_per_expert
        + torch.arange(blocks_per_expert, device=raw_param.device)[None, :]
    ).reshape(-1)

    if quant_state.nested:
        full_absmax = F.dequantize_blockwise(quant_state.absmax, quant_state.state2)
        full_absmax += quant_state.offset
        if full_absmax.dtype != torch.float32:
            full_absmax = full_absmax.float()
        absmax_gathered = full_absmax[offsets_abs]
    else:
        absmax_gathered = quant_state.absmax[offsets_abs]

    qt_gathered = qt_gathered.unsqueeze(1) if qt_gathered.dim() == 1 else qt_gathered

    gathered_qs = QuantState(
        absmax=absmax_gathered,
        shape=torch.Size([num_active * expert_numel]),
        blocksize=quant_state.blocksize,
        quant_type=quant_state.quant_type,
        code=quant_state.code,
        dtype=quant_state.dtype,
    )

    deq = F.dequantize_4bit(qt_gathered, gathered_qs)
    return deq.reshape(num_active, *expert_shape)


def _selective_index_dense(
    param: torch.Tensor,
    active_experts: torch.Tensor,
) -> torch.Tensor:
    """Select experts from a dense (bf16/fp32) weight tensor.

    Simple indexing — no dequantization needed.
    """
    return param[active_experts]


def selective_expert_weights(
    experts_module: nn.Module,
    param_name: str,
    active_experts: torch.Tensor,
) -> torch.Tensor:
    """Extract and dequantize only the active experts' weights.

    Format-agnostic: dispatches based on whether the parameter is
    BnB 4-bit quantized (via parametrize), FP8, or dense bf16/fp32.

    Args:
        experts_module: The base experts module (e.g. Qwen3_5MoeExperts)
        param_name: "gate_up_proj" or "down_proj"
        active_experts: [num_active] sorted unique expert indices

    Returns:
        Compact weight tensor [num_active, dim1, dim2] ready for ScatterMoE
    """
    # Check if the parameter is BnB-quantized via parametrize
    if (
        hasattr(experts_module, "parametrizations")
        and param_name in experts_module.parametrizations
    ):
        param_list = experts_module.parametrizations[param_name]
        parametrization = param_list[0]

        # BnB 4-bit parametrization
        if hasattr(parametrization, "quant_state"):
            # The raw quantized data is on the ParametrizationList, not the
            # individual Bnb4bitParametrization module
            raw_param = param_list.original
            qs = parametrization.quant_state
            # qs.shape is the original tensor shape before flattening.
            # For MoE experts it's [E, d1, d2] (3D) or [total_elements] (1D).
            orig_shape = qs.shape
            if isinstance(orig_shape, torch.Size) and len(orig_shape) == 3:
                expert_shape = (orig_shape[1], orig_shape[2])
            elif isinstance(orig_shape, torch.Size) and len(orig_shape) == 1:
                # Flattened — need to infer from module attributes
                E_total = getattr(experts_module, "num_experts", None)
                if E_total is None:
                    E_total = int(active_experts.max().item()) + 1
                expert_numel = orig_shape[0] // E_total
                d2 = getattr(experts_module, "hidden_dim", None) or getattr(
                    experts_module, "intermediate_dim", None
                )
                if d2 and expert_numel % d2 == 0:
                    expert_shape = (expert_numel // d2, d2)
                else:
                    full = getattr(experts_module, param_name)
                    return full[active_experts]
            else:
                full = getattr(experts_module, param_name)
                return full[active_experts]

            return _selective_dequant_bnb4(raw_param, qs, active_experts, expert_shape)

    # Dense parameter (bf16/fp32) — direct indexing
    param = getattr(experts_module, param_name)
    if param.dim() == 3:
        return param[active_experts]

    # Fallback: full access
    return param


def selective_lora_weights(
    lora_A: torch.Tensor,
    lora_B: torch.Tensor,
    active_experts: torch.Tensor,
    E: int,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Select LoRA A and B weights for only the active experts.

    LoRA layout (scattermoe format):
      A: [r*E, K] — expert e occupies rows [e*r : (e+1)*r]
      B: [N, r*E] — expert e occupies cols [e*r : (e+1)*r]

    Returns compact:
      A: [r*num_active, K]
      B: [N, r*num_active]
    """
    R = lora_A.size(0) // E

    # Vectorized gather: active_experts[:, None] * R + arange(R)[None, :]
    row_idx = (
        active_experts.long()[:, None] * R
        + torch.arange(R, device=lora_A.device)[None, :]
    ).reshape(-1)

    compact_A = lora_A[row_idx]  # [r*num_active, K]
    compact_B = lora_B[:, row_idx]  # [N, r*num_active]

    return compact_A, compact_B


================================================
FILE: src/axolotl/integrations/kernels/libs/scattermoe_lora/selective_dequant_kernel.py
================================================
"""
Triton kernel for fused selective expert gather + NF4 dequantization.

Instead of:
  1. Gather packed uint8 data for active experts (memory copy)
  2. Gather absmax for active experts (memory copy)
  3. Call BnB dequantize_4bit CUDA kernel

This kernel does all three in one pass:
  - Reads packed NF4 bytes from expert-strided positions
  - Looks up the NF4 codebook
  - Multiplies by the per-block absmax
  - Writes bf16 output directly

This eliminates the intermediate gather buffer entirely.
"""

import torch
import triton
import triton.language as tl

# NF4 codebook (16 values, precomputed by BnB)
# These are the normalized float4 reconstruction values
NF4_CODEBOOK = [
    -1.0,
    -0.6961928009986877,
    -0.5250730514526367,
    -0.39491748809814453,
    -0.28444138169288635,
    -0.18477343022823334,
    -0.09105003625154495,
    0.0,
    0.07958029955625534,
    0.16093020141124725,
    0.24611230194568634,
    0.33791524171829224,
    0.44070982933044434,
    0.5626170039176941,
    0.7229568362236023,
    1.0,
]


@triton.jit
def _selective_dequant_nf4_kernel(
    # Input: packed NF4 data (flattened, expert-major order)
    packed_ptr,
    # Input: absmax values (flattened, expert-major order)
    absmax_ptr,
    # Input: active expert indices
    active_experts_ptr,
    # Input: NF4 codebook (16 float values)
    codebook_ptr,
    # Output: dequantized bf16 weights [num_active, expert_numel]
    out_ptr,
    stride_out_e,  # stride for expert dim in output
    # Dimensions
    num_active,
    packed_per_expert,  # expert_numel // 2
    blocks_per_expert,  # expert_numel // blocksize
    blocksize: tl.constexpr,
    # Tile size
    BLOCK_SIZE: tl.constexpr,  # elements per thread block (must be multiple of 2)
):
    """
    Each program processes BLOCK_SIZE elements from one expert.

    Grid: (num_active, cdiv(expert_numel, BLOCK_SIZE))

    For each output element:
      1. Compute which byte in packed data contains this element
      2. Extract the 4-bit nibble (high or low)
      3. Look up in NF4 codebook
      4. Scale by absmax for this block
    """
    expert_local_idx = tl.program_id(0)  # which active expert (0..num_active-1)
    block_id = tl.program_id(1)  # which element block

    # Load the global expert index
    expert_global = tl.load(active_experts_ptr + expert_local_idx).to(tl.int64)

    expert_numel = packed_per_expert * 2  # 2 elements per packed byte
    elem_offset = block_id * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = elem_offset < expert_numel

    # Each element is packed as: byte[i//2], low nibble for even i, high for odd i
    byte_idx = elem_offset // 2
    is_high = (elem_offset % 2) == 1

    # Read packed bytes from the global expert's region
    packed_global_offset = expert_global * packed_per_expert + byte_idx
    packed_bytes = tl.load(packed_ptr + packed_global_offset, mask=mask, other=0).to(
        tl.int32
    )

    # Extract 4-bit nibble
    # BnB packing: high nibble = even element, low nibble = odd element
    nibble = tl.where(is_high, packed_bytes & 0xF, (packed_bytes >> 4) & 0xF)

    # NF4 codebook lookup
    # Load all 16 codebook values (small, fits in registers)
    # Use gather from codebook pointer
    code_val = tl.load(codebook_ptr + nibble, mask=mask, other=0.0)

    # Load absmax for this element's quantization block
    block_idx = elem_offset // blocksize
    absmax_global_offset = expert_global * blocks_per_expert + block_idx
    absmax_val = tl.load(absmax_ptr + absmax_global_offset, mask=mask, other=1.0)

    # Dequantize: value = codebook[nibble] * absmax
    result = code_val * absmax_val

    # Store to output
    out_offset = expert_local_idx * stride_out_e + elem_offset
    tl.store(out_ptr + out_offset, result.to(out_ptr.dtype.element_ty), mask=mask)


def selective_dequant_nf4_triton(
    packed_data: torch.Tensor,
    absmax: torch.Tensor,
    active_experts: torch.Tensor,
    expert_shape: tuple[int, int],
    blocksize: int,
    dtype: torch.dtype = torch.bfloat16,
    codebook: torch.Tensor | None = None,
) -> torch.Tensor:
    """Fused selective gather + NF4 dequantization via Triton kernel.

    Args:
        packed_data: Flattened packed NF4 data [total_packed] or [total_packed, 1]
        absmax: Per-block scaling factors [total_blocks]
        active_experts: Sorted indices of experts to dequantize [num_active]
        expert_shape: (dim1, dim2) per expert
        blocksize: Quantization block size
        dtype: Output dtype (default bf16)
        codebook: NF4 lookup table [16] (uses default NF4 codebook if None)

    Returns:
        Dequantized weights [num_active, dim1, dim2]
    """
    num_active = active_experts.shape[0]
    expert_numel = expert_shape[0] * expert_shape[1]
    packed_per_expert = expert_numel // 2
    blocks_per_expert = expert_numel // blocksize

    # Prepare codebook on device
    if codebook is None:
        codebook = torch.tensor(
            NF4_CODEBOOK, dtype=torch.float32, device=packed_data.device
        )
    else:
        codebook = codebook.to(device=packed_data.device, dtype=torch.float32)

    # Flatten inputs
    packed_flat = packed_data.reshape(-1)
    absmax_flat = absmax.reshape(-1).float()  # absmax is usually fp32

    # Output buffer
    out = torch.empty(num_active, expert_numel, dtype=dtype, device=packed_data.device)

    BLOCK_SIZE = 1024  # Process 1024 elements per thread block

    grid = (num_active, triton.cdiv(expert_numel, BLOCK_SIZE))

    _selective_dequant_nf4_kernel[grid](
        packed_flat,
        absmax_flat,
        active_experts,
        codebook,
        out,
        out.stride(0),
        num_active=num_active,
        packed_per_expert=packed_per_expert,
        blocks_per_expert=blocks_per_expert,
        blocksize=blocksize,
        BLOCK_SIZE=BLOCK_SIZE,
    )

    return out.reshape(num_active, *expert_shape)


================================================
FILE: src/axolotl/integrations/kernels/plugin.py
================================================
import importlib
import os
from pathlib import Path

import torch

from axolotl.integrations.base import BasePlugin
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def _check_sonicmoe_gpu_compat():
    """Validate GPU compute capability for SonicMoE and configure env.

    Supported: Hopper (sm_90), Blackwell (sm_100 - sm_103).
    B300 (sm_103) additionally requires Triton 3.6.0.
    """
    if not torch.cuda.is_available():
        return

    cc = torch.cuda.get_device_capability()

    if cc < (9, 0):
        raise RuntimeError(
            f"SonicMoE requires Hopper (sm_90) or Blackwell (sm_100+) GPU, "
            f"but detected sm_{cc[0]}{cc[1]}."
        )

    if cc > (10, 3):
        raise RuntimeError(
            f"SonicMoE does not yet support sm_{cc[0]}{cc[1]}. "
            f"Supported: Hopper (sm_90) and Blackwell (sm_100 - sm_103)."
        )

    # Blackwell (sm_100+): enable QuACK GEMM kernels
    if cc >= (10, 0):
        os.environ.setdefault("USE_QUACK_GEMM", "1")
        LOG.info(
            f"Blackwell GPU (sm_{cc[0]}{cc[1]}) detected, enabling USE_QUACK_GEMM=1"
        )

    # B300 (sm_103): requires Triton 3.6.0
    if cc == (10, 3):
        triton_spec = importlib.util.find_spec("triton")
        if triton_spec is None:
            raise RuntimeError(
                "B300 (sm_103) requires Triton 3.6.0, but Triton is not installed."
            )
        import triton

        triton_version = tuple(int(x) for x in triton.__version__.split(".")[:2])
        if triton_version != (3, 6):
            raise RuntimeError(
                f"B300 (sm_103) requires Triton 3.6.x, but found {triton.__version__}."
            )


class KernelsPlugin(BasePlugin):
    def get_input_args(self):
        return "axolotl.integrations.kernels.KernelsArgs"

    def pre_model_load(self, cfg):
        from axolotl.integrations.kernels.constants import SPARSE_MOE_BLOCK

        # Prefer text backbone type for VLMs, but fall back to base type
        # when the text type isn't in the supported mapping (e.g. qwen3_5_moe_text)
        moe_model_type = cfg.model_config_type_text or cfg.model_config_type
        if (
            moe_model_type not in SPARSE_MOE_BLOCK
            and cfg.model_config_type in SPARSE_MOE_BLOCK
        ):
            moe_model_type = cfg.model_config_type

        if cfg.use_scattermoe:
            self._register_kernels()
            self._kernelize_model(moe_model_type)
        elif cfg.use_sonicmoe:
            if not importlib.util.find_spec("sonicmoe"):
                raise RuntimeError(
                    "SonicMoE is not installed. See installation instructions at "
                    "https://github.com/axolotl-ai-cloud/axolotl/blob/main/src/axolotl/integrations/kernels/README.md#sonicmoe-installation"
                )

            _check_sonicmoe_gpu_compat()

            from axolotl.integrations.kernels.sonicmoe import patch_sonicmoe

            LOG.info(f"Applying SonicMoE patches for model type: {moe_model_type}")
            patch_sonicmoe(
                moe_model_type,
                torch_compile=bool(getattr(cfg, "torch_compile", False)),
            )

    def _register_kernels(self):
        from kernels import (
            LocalLayerRepository,
            Mode,
            register_kernel_mapping,
        )

        plugin_root = Path(__file__).parent
        register_kernel_mapping(
            {
                "HFScatterMoEParallelExperts": {
                    "cuda": {
                        Mode.TRAINING: LocalLayerRepository(
                            repo_path=plugin_root / "libs" / "scattermoe_lora",
                            package_name="scattermoe_lora",
                            layer_name="HFScatterMoEGatedMLP",
                        ),
                        Mode.INFERENCE: LocalLayerRepository(
                            repo_path=plugin_root / "libs" / "scattermoe_lora",
                            package_name="scattermoe_lora",
                            layer_name="HFScatterMoEGatedMLP",
                        ),
                    },
                }
            }
        )

    def add_callbacks_pre_trainer(self, cfg, model):
        callbacks = []
        if cfg.use_scattermoe:
            from axolotl.integrations.kernels.autotune_callback import (
                AutotuneReportCallback,
            )

            callbacks.append(AutotuneReportCallback())
        return callbacks

    def _kernelize_model(self, model_type: str):
        from kernels import replace_kernel_forward_from_hub

        from axolotl.integrations.kernels.constants import resolve_moe_block_classes

        for model_moe_cls in resolve_moe_block_classes(model_type):
            replace_kernel_forward_from_hub(
                model_moe_cls, "HFScatterMoEParallelExperts"
            )


================================================
FILE: src/axolotl/integrations/kernels/sonicmoe/__init__.py
================================================
from .patch import patch_sonicmoe

__all__ = ["patch_sonicmoe"]


================================================
FILE: src/axolotl/integrations/kernels/sonicmoe/patch.py
================================================
"""
SonicMoE patching for SparseMoeBlock forward pass.

Monkeypatches the SparseMoeBlock class for a given model type to use
SonicMoE's optimized kernels. Two forward paths are supported:

1. **General routing path** (routing_fn is not None):
   Uses a custom routing function + ``moe_general_routing_inputs``.
   Suitable for models with non-standard routing (softmax->topk, sigmoid->topk).

2. **Fused topk->softmax path** (routing_fn is None):
   Uses ``moe_TC_softmax_topk_layer`` which fuses routing + expert computation.
   Suitable for models with simple topk->softmax routing.

Weight format conversion (interleave/deinterleave) is handled by the
WeightConverter system, so the forward assumes weights are already in
interleaved format.

Shared experts are handled generically: if the block has a ``shared_expert``
or ``shared_experts`` attribute, its output is computed alongside the routed
experts and added to the final output. An optional ``shared_expert_gate``
applies sigmoid gating to the shared expert contribution.
"""

import torch
import torch.nn.functional as F

from axolotl.integrations.kernels.constants import resolve_moe_block_classes
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def patch_sonicmoe(model_type: str, torch_compile: bool = False):
    """Main entry point: patch SparseMoeBlock for SonicMoE support.

    Args:
        model_type: The HuggingFace model type (e.g. "qwen3_moe").
        torch_compile: If True, wrap routing functions with torch.compile
            for kernel fusion (fuses softmax+topk+renorm into fewer launches).
    """
    from .routing import get_model_moe_config
    from .weight_converter import register_sonicmoe_weight_converter

    routing_fn, activation, router_attr = get_model_moe_config(model_type)

    if torch_compile and routing_fn is not None:
        routing_fn = _try_compile_routing(routing_fn)

    for moe_cls in resolve_moe_block_classes(model_type):
        _patch_forward(moe_cls, routing_fn, activation, router_attr)
    register_sonicmoe_weight_converter(model_type)


def _try_compile_routing(routing_fn):
    """Attempt to torch.compile the routing function, fall back to eager on failure."""
    try:
        compiled_fn = torch.compile(routing_fn, mode="reduce-overhead", dynamic=False)
        LOG.info(f"torch.compile enabled for routing function: {routing_fn.__name__}")
        return compiled_fn
    except Exception as exc:  # pylint: disable=broad-except
        LOG.warning(
            f"torch.compile failed for routing function {routing_fn.__name__}, "
            f"falling back to eager: {exc}"
        )
        return routing_fn


def _patch_forward(moe_cls, routing_fn, activation, router_attr):
    """Monkeypatch the SparseMoeBlock class with a SonicMoE forward.

    The patched forward handles shared experts generically: if
    ``self.shared_expert`` or ``self.shared_experts`` exists, it is computed
    and added to the routed output. If ``self.shared_expert_gate`` also exists,
    it applies sigmoid gating to the shared expert contribution (as in qwen2_moe).

    Args:
        moe_cls: The SparseMoeBlock class to patch.
        routing_fn: Routing function (e.g. softmax_topk_routing), or None
            for the fused moe_TC_softmax_topk_layer path.
        activation: SonicMoE ActivationType enum value.
        router_attr: Name of the router module attribute on the MoE block.
    """
    if hasattr(moe_cls, "_original_forward"):
        LOG.info(f"{moe_cls.__name__}.forward already patched with SonicMoE, skipping")
        return

    original_forward = moe_cls.forward

    if routing_fn is not None:
        _make_general_forward(moe_cls, routing_fn, activation)
    else:
        _make_fused_forward(moe_cls, activation, router_attr)

    moe_cls._original_forward = original_forward
    LOG.info(f"Patched {moe_cls.__name__}.forward with SonicMoE implementation")


def _make_general_forward(moe_cls, routing_fn, activation):
    """Create forward using routing_fn + moe_general_routing_inputs."""

    def sonicmoe_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        from sonicmoe import moe_general_routing_inputs

        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states_flat = hidden_states.view(-1, hidden_dim)

        # Shared expert (computed early, matching original model ordering)
        shared_expert_output = _compute_shared_expert(self, hidden_states_flat)

        # Routing
        router_scores, token_indices, expert_indices, _router_logits = routing_fn(
            hidden_states_flat, self
        )

        # Permute weights to SonicMoE layout:
        #   gate_up: [E, 2*I, H] -> [2*I, H, E]
        #   down:    [E, H, I]   -> [H, I, E]
        gate_up_weight = self.experts.gate_up_proj.permute(1, 2, 0)
        down_weight = self.experts.down_proj.permute(1, 2, 0)
        E = gate_up_weight.shape[-1]

        output, _ = moe_general_routing_inputs(
            hidden_states_flat,
            router_scores,
            token_indices,
            expert_indices,
            gate_up_weight,
            None,  # b1 (no gate/up bias)
            down_weight,
            None,  # b2 (no down bias)
            E,
            torch.cuda.current_stream().cuda_stream,
            activation,
            False,  # is_inference_mode
        )

        # Add shared expert contribution if present
        if shared_expert_output is not None:
            if hasattr(self, "shared_expert_gate"):
                shared_expert_output = (
                    F.sigmoid(self.shared_expert_gate(hidden_states_flat))
                    * shared_expert_output
                )
            output = output + shared_expert_output

        return output.view(batch_size, sequence_length, hidden_dim)

    moe_cls.forward = sonicmoe_forward


def _make_fused_forward(moe_cls, activation, router_attr):
    """Create forward using moe_TC_softmax_topk_layer (topk -> softmax)."""

    def sonicmoe_fused_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        from sonicmoe import moe_TC_softmax_topk_layer

        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states_flat = hidden_states.view(-1, hidden_dim)

        # Shared expert (computed early, matching original model ordering)
        shared_expert_output = _compute_shared_expert(self, hidden_states_flat)

        router = getattr(self, router_attr)

        # Permute weights to SonicMoE layout:
        #   gate_up: [E, 2*I, H] -> [2*I, H, E]
        #   down:    [E, H, I]   -> [H, I, E]
        gate_up_weight = self.experts.gate_up_proj.permute(1, 2, 0)
        down_weight = self.experts.down_proj.permute(1, 2, 0)

        output, _router_logits, _expert_freq = moe_TC_softmax_topk_layer(
            hidden_states_flat,
            router.weight,
            gate_up_weight,
            None,  # b1 (no gate/up bias)
            down_weight,
            None,  # b2 (no down bias)
            router.top_k,
            torch.cuda.current_stream().cuda_stream,
            activation,
            False,  # is_inference_mode
        )

        # Add shared expert contribution if present
        if shared_expert_output is not None:
            if hasattr(self, "shared_expert_gate"):
                shared_expert_output = (
                    F.sigmoid(self.shared_expert_gate(hidden_states_flat))
                    * shared_expert_output
                )
            output = output + shared_expert_output

        return output.view(batch_size, sequence_length, hidden_dim)

    moe_cls.forward = sonicmoe_fused_forward


def _compute_shared_expert(moe_block, hidden_states_flat):
    """Compute shared expert output if the block has one.

    Handles singular (qwen2_moe: ``shared_expert``), plural
    (glm_moe_dsa/deepseek_v3: ``shared_experts``), and MLP
    (hunyuan_v1_moe: ``shared_mlp``) attribute names.
    """
    shared_expert = (
        getattr(moe_block, "shared_expert", None)
        or getattr(moe_block, "shared_experts", None)
        or getattr(moe_block, "shared_mlp", None)
    )
    if shared_expert is not None:
        return shared_expert(hidden_states_flat)
    return None


================================================
FILE: src/axolotl/integrations/kernels/sonicmoe/routing.py
================================================
"""
Routing functions for SonicMoE integration.

Different MoE architectures use different routing strategies:
- qwen3_moe / qwen2_moe / qwen3_5_moe / qwen3_vl_moe / qwen3_omni_moe: softmax -> topk (with optional renormalization)
- gpt_oss: topk -> softmax (uses fused moe_TC_softmax_topk_layer, routing_fn=None)
- glm_moe_dsa: sigmoid -> topk (with group-based expert selection)
- mistral4: softmax -> group selection -> topk (with renormalization and scaling)

Each model type maps to a (routing_fn, activation_type, router_attr) triple.
When routing_fn is None, the fused moe_TC_softmax_topk_layer path is used.
"""

import torch
import torch.nn.functional as F


def get_model_moe_config(model_type: str):
    """Returns (routing_fn, activation, router_attr) for a given model type.

    Args:
        model_type: HuggingFace model type string.

    Returns:
        routing_fn: Callable or None. None signals the fused
            moe_TC_softmax_topk_layer path (topk -> softmax models).
        activation: SonicMoE ActivationType enum value.
        router_attr: Name of the router module attribute on the MoE block
            (e.g. "gate" or "router").

    The activation type cannot be derived from config.hidden_act because
    e.g. qwen3_moe reports "silu" but architecturally uses SwiGLU
    (act_fn(gate) * up pattern). So we specify it per model type.
    """
    from sonicmoe.enums import ActivationType

    if model_type in (
        "qwen2_moe",
        "qwen3_moe",
        "qwen3_5_moe",
        "qwen3_next",
        "qwen3_vl_moe",
        "qwen3_omni_moe",
        "olmoe",
        "mixtral",
        "minimax",
    ):
        return softmax_topk_routing, ActivationType.SWIGLU, "gate"
    elif model_type in ("mistral4",):
        return softmax_group_topk_routing, ActivationType.SWIGLU, "gate"
    elif model_type in (
        "glm_moe_dsa",
        "deepseek_v3",
        "glm4_moe",
        "glm4_moe_lite",
        "glm4v_moe",
        "minimax_m2",
    ):
        return sigmoid_topk_routing, ActivationType.SWIGLU, "gate"
    # elif model_type in ("ernie4_5_moe",):
    #     # Softmax→topk with e_score_correction_bias applied between softmax and topk.
    #     return ..., ActivationType.SWIGLU, "gate"
    # elif model_type in ("deepseek_v2",):
    #     # Softmax→topk with group_limited_greedy. Different attr names: num_group
    #     # (not n_group), gate is nn.Linear (not a router class).
    #     return ..., ActivationType.SWIGLU, "gate"
    # elif model_type in ("hunyuan_v1_moe",):
    #     # Softmax→topk but gate structure differs: gate.wg (not gate.weight),
    #     # top_k on block not gate, creates scatter routing matrix.
    #     return ..., ActivationType.SWIGLU, "gate"
    # Fused topk -> softmax path (routing_fn=None):
    # elif model_type in ("gpt_oss",):
    #     # NOTE: gpt_oss has a router bias which moe_TC_softmax_topk_layer
    #     # ignores (it only takes router_w, not bias). Also has transposed
    #     # weight layout [E, H, 2*I] and custom GLU activation.
    #     return None, ActivationType.SWIGLU, "router"
    else:
        raise ValueError(f"SonicMoE: unsupported model type '{model_type}'")


def softmax_topk_routing(
    hidden_states: torch.Tensor, moe_block
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Qwen3/Qwen2-style routing: softmax -> topk -> optional renorm.

    Args:
        hidden_states: [T, H] flattened token representations
        moe_block: MoE block module (accesses moe_block.gate.*)

    Returns:
        router_scores: [T*K] flattened scores (float32)
        token_indices: [T*K] which token each entry belongs to (int32), sorted ascending
        expert_indices: [T*K] which expert (int32)
        router_logits: [T, E] original logits for aux loss
    """
    gate = moe_block.gate
    T, H = hidden_states.shape
    K = gate.top_k

    # Compute router logits and softmax over all experts
    router_logits = F.linear(hidden_states, gate.weight)  # [T, E]
    router_probs = F.softmax(router_logits, dim=-1, dtype=torch.float32)  # [T, E]

    # Select top-k experts per token
    top_values, top_indices = torch.topk(router_probs, K, dim=-1)  # [T, K] each

    # Renormalize if configured (default True for models without the attribute,
    # e.g. Mixtral/MiniMax which always normalize)
    if getattr(gate, "norm_topk_prob", True):
        top_values = top_values / top_values.sum(dim=-1, keepdim=True)

    # no-op: matches transformers which casts to softmax output dtype (float32).
    # top_values = top_values.to(router_probs.dtype)

    # Flatten for moe_general_routing_inputs.
    # Token indices are naturally sorted ascending from the [T, K] layout:
    # [0, 0, ..., 1, 1, ..., T-1, T-1, ...] — this is required by SonicMoE.
    # Expert sorting is handled internally by general_routing_router_metadata.
    token_indices = (
        torch.arange(T, device=hidden_states.device, dtype=torch.int32)
        .unsqueeze(1)
        .expand(T, K)
    )

    flat_scores = top_values.reshape(-1)  # [T*K]
    flat_token_idx = token_indices.reshape(-1)  # [T*K]
    flat_expert_idx = top_indices.to(torch.int32).reshape(-1)  # [T*K]

    return flat_scores, flat_token_idx, flat_expert_idx, router_logits


def softmax_group_topk_routing(
    hidden_states: torch.Tensor, moe_block
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Mistral4-style routing: softmax -> group selection -> topk -> renorm -> scale."""
    gate = moe_block.gate
    T, H = hidden_states.shape
    K = moe_block.top_k
    E = getattr(moe_block, "n_routed_experts", gate.weight.shape[0])
    n_group = getattr(moe_block, "n_group", 1)

    router_logits = F.linear(hidden_states, gate.weight)  # [T, E]
    router_probs = F.softmax(router_logits, dim=-1, dtype=torch.float32)  # [T, E]

    scores_for_choice = router_probs

    # Group selection: pick top groups, mask the rest
    if n_group > 1:
        group_scores = (
            scores_for_choice.view(-1, n_group, E // n_group)
            .topk(2, dim=-1)[0]
            .sum(dim=-1)
        )
        group_idx = torch.topk(
            group_scores, k=moe_block.topk_group, dim=-1, sorted=False
        )[1]
        group_mask = torch.zeros_like(group_scores)
        group_mask.scatter_(1, group_idx, 1)
        score_mask = (
            group_mask.unsqueeze(-1).expand(-1, n_group, E // n_group).reshape(-1, E)
        )
        scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)

    topk_indices = torch.topk(scores_for_choice, k=K, dim=-1, sorted=False)[1]
    topk_weights = router_probs.gather(1, topk_indices)

    # Renormalization + scaling
    norm_topk_prob = getattr(moe_block, "norm_topk_prob", True)
    if norm_topk_prob:
        topk_weights = topk_weights / (topk_weights.sum(dim=-1, keepdim=True) + 1e-20)
    routed_scaling_factor = getattr(moe_block, "routed_scaling_factor", 1.0)
    topk_weights = topk_weights * routed_scaling_factor

    # Flatten for moe_general_routing_inputs
    token_indices = (
        torch.arange(T, device=hidden_states.device, dtype=torch.int32)
        .unsqueeze(1)
        .expand(T, K)
    )

    flat_scores = topk_weights.to(torch.float32).reshape(-1)  # [T*K]
    flat_token_idx = token_indices.reshape(-1)  # [T*K]
    flat_expert_idx = topk_indices.to(torch.int32).reshape(-1)  # [T*K]

    return flat_scores, flat_token_idx, flat_expert_idx, router_logits


def sigmoid_topk_routing(
    hidden_states: torch.Tensor, moe_block
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
    """Sigmoid-based routing: sigmoid -> optional group selection -> topk.

    Supports two variants:
    - **Group selection** (glm_moe_dsa, deepseek_v3, etc.): n_group > 1,
      bias on gate, group-based masking before topk.
    - **No group selection** (minimax_m2): n_group == 1 (or absent),
      bias on moe_block, straight topk from all experts.

    Final routing weights come from the original sigmoid scores (not
    bias-corrected), with optional renormalization and scaling.

    Args:
        hidden_states: [T, H] flattened token representations
        moe_block: MoE block module (accesses moe_block.gate.* and
            optional moe_block.n_group, .topk_group, .top_k, .norm_topk_prob,
            .routed_scaling_factor, .n_routed_experts)

    Returns:
        router_scores: [T*K] flattened scores (float32)
        token_indices: [T*K] which token each entry belongs to (int32), sorted ascending
        expert_indices: [T*K] which expert (int32)
        router_logits: [T, E] original logits for aux loss
    """
    gate = moe_block.gate
    T, H = hidden_states.shape
    K = moe_block.top_k
    E = getattr(moe_block, "n_routed_experts", gate.weight.shape[0])
    n_group = getattr(moe_block, "n_group", 1)

    # Compute router logits and sigmoid probabilities
    router_logits = F.linear(hidden_states.float(), gate.weight.float())  # [T, E]
    router_probs = router_logits.sigmoid()  # [T, E]

    # Bias-corrected scores for expert selection (not used for final weights).
    # glm_moe_dsa/deepseek_v3 store the bias on gate; minimax_m2 stores it on the block.
    e_score_correction_bias = getattr(gate, "e_score_correction_bias", None)
    if e_score_correction_bias is None:
        e_score_correction_bias = getattr(moe_block, "e_score_correction_bias", None)
    if e_score_correction_bias is None:
        raise AttributeError(
            f"sigmoid_topk_routing requires e_score_correction_bias on "
            f"gate ({type(gate)}) or moe_block ({type(moe_block)}), but neither has it"
        )
    scores_for_choice = router_probs + e_score_correction_bias

    # Group-based selection: pick top groups, mask the rest (skip when n_group == 1)
    if n_group > 1:
        group_scores = (
            scores_for_choice.view(-1, n_group, E // n_group)
            .topk(2, dim=-1)[0]
            .sum(dim=-1)
        )  # [T, n_group]
        group_idx = torch.topk(
            group_scores, k=moe_block.topk_group, dim=-1, sorted=False
        )[1]
        group_mask = torch.zeros_like(group_scores)
        group_mask.scatter_(1, group_idx, 1)
        score_mask = (
            group_mask.unsqueeze(-1).expand(-1, n_group, E // n_group).reshape(-1, E)
        )
        scores_for_choice = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)

    # Final topk from (possibly masked) scores
    topk_indices = torch.topk(scores_for_choice, k=K, dim=-1, sorted=False)[1]

    # Gather weights from original sigmoid scores (not bias-corrected)
    topk_weights = router_probs.gather(1, topk_indices)

    # Optional renormalization + scaling
    norm_topk_prob = getattr(moe_block, "norm_topk_prob", True)
    if norm_topk_prob:
        topk_weights = topk_weights / (topk_weights.sum(dim=-1, keepdim=True) + 1e-20)
    routed_scaling_factor = getattr(moe_block, "routed_scaling_factor", 1.0)
    topk_weights = topk_weights * routed_scaling_factor

    # Flatten for moe_general_routing_inputs.
    # Token indices are naturally sorted ascending from the [T, K] layout.
    token_indices = (
        torch.arange(T, device=hidden_states.device, dtype=torch.int32)
        .unsqueeze(1)
        .expand(T, K)
    )

    flat_scores = topk_weights.to(torch.float32).reshape(-1)  # [T*K]
    flat_token_idx = token_indices.reshape(-1)  # [T*K]
    flat_expert_idx = topk_indices.to(torch.int32).reshape(-1)  # [T*K]

    return flat_scores, flat_token_idx, flat_expert_idx, router_logits


================================================
FILE: src/axolotl/integrations/kernels/sonicmoe/weight_converter.py
================================================
"""
Custom WeightConverter operations for SonicMoE weight format conversion.

SonicMoE requires gate_up_proj weights in interleaved format:
- Standard (concatenated): [E, 2*I, H] where first I rows are gate, last I rows are up
- SonicMoE (interleaved): [E, 2*I, H] where rows alternate [g0, u0, g1, u1, ...]

These ConversionOps integrate with transformers' WeightConverter system so that
weights are transparently converted during loading and reverted during saving.
"""

from typing import Any

import torch
from einops import rearrange
from transformers.core_model_loading import ConversionOps

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def interleave_gate_up(tensor: torch.Tensor) -> torch.Tensor:
    """[gate..., up...] -> [g0, u0, g1, u1, ...] along the 2*I dimension."""
    return rearrange(tensor, "... (two out) h -> ... (out two) h", two=2)


def deinterleave_gate_up(tensor: torch.Tensor) -> torch.Tensor:
    """[g0, u0, g1, u1, ...] -> [gate..., up...] along the 2*I dimension."""
    return rearrange(tensor, "... (out two) h -> ... (two out) h", two=2)


class ConcatenatedToInterleaved(ConversionOps):
    """Convert concatenated gate/up projections to interleaved format.

    Input:  [E, 2*I, H] with gate=[E, :I, H] and up=[E, I:, H]
    Output: [E, 2*I, H] with rows alternating [g0, u0, g1, u1, ...]

    This operation is applied along ``dim`` (default 1, the 2*I dimension).
    """

    def __init__(self, dim: int = 1):
        self.dim = dim

    @torch.no_grad()
    def convert(
        self,
        input_dict: dict[str, Any],
        source_patterns: list[str],
        target_patterns: list[str],
        **kwargs,
    ) -> dict[str, torch.Tensor]:
        target_pattern = self._get_target_pattern(
            input_dict, source_patterns, target_patterns
        )
        tensors = next(iter(input_dict.values()))
        tensor = tensors[0] if isinstance(tensors, list) else tensors

        interleaved = interleave_gate_up(tensor)

        return {target_pattern: interleaved}

    def _get_target_pattern(
        self,
        input_dict: dict[str, Any],
        source_patterns: list[str],
        target_patterns: list[str],
    ) -> str:
        # Follow the same logic as Transpose.get_target_pattern
        if len(input_dict) != 1:
            raise ValueError("Undefined Operation encountered!")
        if len(target_patterns) > 1:
            if len(source_patterns) == 1:
                return source_patterns[0]
            raise ValueError("Undefined Operation encountered!")
        return target_patterns[0]

    @property
    def reverse_op(self) -> ConversionOps:
        return InterleavedToConcatenated(self.dim)


class InterleavedToConcatenated(ConversionOps):
    """Convert interleaved gate/up projections back to concatenated format.

    Input:  [E, 2*I, H] with rows alternating [g0, u0, g1, u1, ...]
    Output: [E, 2*I, H] with gate=[E, :I, H] and up=[E, I:, H]

    This is the reverse of ``ConcatenatedToInterleaved``.
    """

    def __init__(self, dim: int = 1):
        self.dim = dim

    @torch.no_grad()
    def convert(
        self,
        input_dict: dict[str, Any],
        source_patterns: list[str],
        target_patterns: list[str],
        **kwargs,
    ) -> dict[str, torch.Tensor]:
        target_pattern = self._get_target_pattern(
            input_dict, source_patterns, target_patterns
        )
        tensors = next(iter(input_dict.values()))
        tensor = tensors[0] if isinstance(tensors, list) else tensors

        concatenated = deinterleave_gate_up(tensor)

        return {target_pattern: concatenated}

    def _get_target_pattern(
        self,
        input_dict: dict[str, Any],
        source_patterns: list[str],
        target_patterns: list[str],
    ) -> str:
        if len(input_dict) != 1:
            raise ValueError("Undefined Operation encountered!")
        if len(target_patterns) > 1:
            if len(source_patterns) == 1:
                return source_patterns[0]
            raise ValueError("Undefined Operation encountered!")
        return target_patterns[0]

    @property
    def reverse_op(self) -> ConversionOps:
        return ConcatenatedToInterleaved(self.dim)


def register_sonicmoe_weight_converter(model_type: str):
    """Override the conversion mapping to add interleave step for gate_up_proj.

    Appends a ConcatenatedToInterleaved operation to the existing gate_up_proj
    converter chain. For example, qwen3_moe's chain becomes:
        MergeModulelist(dim=0) -> Concatenate(dim=1) -> ConcatenatedToInterleaved(dim=1)

    The reverse is auto-generated for saving:
        InterleavedToConcatenated(dim=1) -> Chunk(dim=1) -> SplitModulelist(dim=0)
    """
    from transformers.conversion_mapping import (
        get_checkpoint_conversion_mapping,
        register_checkpoint_conversion_mapping,
    )

    existing = get_checkpoint_conversion_mapping(model_type)
    if existing is None:
        LOG.warning(
            f"No conversion mapping found for model type '{model_type}'. "
            "SonicMoE weight interleaving will not be applied during checkpoint loading."
        )
        return

    # Find the gate_up_proj converter and append ConcatenatedToInterleaved
    patched = False
    for converter in existing:
        if hasattr(converter, "operations") and any(
            "gate_up_proj" in pat for pat in converter.target_patterns
        ):
            # Guard against double registration (e.g. plugin reloaded)
            if any(
                isinstance(op, ConcatenatedToInterleaved) for op in converter.operations
            ):
                LOG.info(
                    f"SonicMoE weight converter already registered for '{model_type}'"
                )
                return
            converter.operations.append(ConcatenatedToInterleaved(dim=1))
            patched = True
            break

    if not patched:
        LOG.warning(
            f"Could not find gate_up_proj converter for model type '{model_type}'. "
            "SonicMoE weight interleaving will not be applied during checkpoint loading."
        )
        return

    register_checkpoint_conversion_mapping(model_type, existing, overwrite=True)
    LOG.info(f"Registered SonicMoE weight converter for model type '{model_type}'")


================================================
FILE: src/axolotl/integrations/liger/LICENSE
================================================

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: src/axolotl/integrations/liger/README.md
================================================
# Liger Kernel Integration

Liger Kernel provides efficient Triton kernels for LLM training, offering:

- 20% increase in multi-GPU training throughput
- 60% reduction in memory usage
- Compatibility with both FSDP and DeepSpeed

See https://github.com/linkedin/Liger-Kernel

## Usage

```yaml
plugins:
  - axolotl.integrations.liger.LigerPlugin
liger_rope: true
liger_rms_norm: true
liger_glu_activation: true
liger_layer_norm: true
liger_fused_linear_cross_entropy: true

# FLCE-specific
liger_use_token_scaling: true
```

## Supported Models

- deepseek_v2
- gemma
- gemma2
- gemma3
- granite
- jamba
- llama
- mistral
- mixtral
- mllama
- mllama_text_model
- olmo2
- paligemma
- phi3
- qwen2
- qwen2_5_vl
- qwen2_vl

## Citation

```bib
@article{hsu2024ligerkernelefficienttriton,
      title={Liger Kernel: Efficient Triton Kernels for LLM Training},
      author={Pin-Lun Hsu and Yun Dai and Vignesh Kothapalli and Qingquan Song and Shao Tang and Siyu Zhu and Steven Shimizu and Shivam Sahni and Haowen Ning and Yanning Chen},
      year={2024},
      eprint={2410.10989},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/2410.10989},
      journal={arXiv preprint arXiv:2410.10989},
}
```


================================================
FILE: src/axolotl/integrations/liger/__init__.py
================================================
# Copyright 2024 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Module for the Plugin for LIGER integraton with Axolotl.

Liger Kernel is the collection of Triton-native kernels for LLM Training.
It is designed to be performant, correct, and light-weight.
"""

from .args import LigerArgs
from .plugin import LigerPlugin

__all__ = [
    "LigerArgs",
    "LigerPlugin",
]


================================================
FILE: src/axolotl/integrations/liger/args.py
================================================
# Copyright 2024 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Module for handling LIGER input arguments.
"""

from pydantic import BaseModel, Field, model_validator

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


class LigerArgs(BaseModel):
    """
    Input args for LIGER.
    """

    liger_rope: bool | None = None
    liger_rms_norm: bool | None = None
    liger_layer_norm: bool | None = None
    liger_swiglu: bool | None = None
    liger_glu_activation: bool | None = None
    liger_cross_entropy: bool | None = None
    liger_fused_linear_cross_entropy: bool | None = None
    liger_use_token_scaling: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": (
                "Enables use_token_scaling in fused_linear_cross_entropy. "
                "When True, each token's loss is multiplied by its predicted probability (detached from gradients)."
            )
        },
    )

    @model_validator(mode="before")
    @classmethod
    def check_deprecated_swiglu(cls, data):
        if data.get("liger_swiglu") is not None:
            if data.get("liger_glu_activation") is not None:
                raise ValueError(
                    "You cannot have both `liger_swiglu` and `liger_glu_activation` set."
                )

            LOG.warning(
                "The 'liger_swiglu' argument is deprecated and will be removed in a future release. "
                "Please use 'liger_glu_activation' instead."
            )
            data["liger_glu_activation"] = data.pop("liger_swiglu")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_tiled_mlp_conflict(cls, data):
        if (
            data.get("liger_glu_activation") is True
            and data.get("tiled_mlp") is True
            and not data.get("tiled_mlp_use_original_mlp")
        ):
            raise ValueError(
                "You cannot have both `liger_glu_activation` and `tiled_mlp` set without `tiled_mlp_use_original_mlp: true`."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_liger_rms_norm_tensor_parallel(cls, data):
        if data.get("liger_rms_norm") and data.get("tensor_parallel_size", 1) > 1:
            raise ValueError(
                "`liger_rms_norm` is incompatible with tensor parallelism, "
                "see https://github.com/linkedin/Liger-Kernel/issues/826"
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_liger_use_token_scaling_flce(cls, data):
        if data.get("liger_use_token_scaling") and not data.get(
            "liger_fused_linear_cross_entropy"
        ):
            raise ValueError(
                "`liger_use_token_scaling: true` requires `liger_fused_linear_cross_entropy` enabled."
            )

        return data

    @model_validator(mode="after")
    def check_tensor_parallel_size_liger_fused_linear_cross_entropy(self):
        # TODO @SalmanMohammadi this is a larger fix - investigate
        if self.tensor_parallel_size > 1 and self.liger_fused_linear_cross_entropy:
            raise ValueError("Tensor parallelism is not compatible with liger losses.")
        return self


================================================
FILE: src/axolotl/integrations/liger/models/__init__.py
================================================


================================================
FILE: src/axolotl/integrations/liger/models/base.py
================================================
"""
Generic FLCE patch for untested models similar to Llama
"""

from typing import Optional, Tuple, Union

import torch
from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
from liger_kernel.transformers.trainer.orpo_trainer import _FSDPForwardRedirection
from liger_kernel.utils import PEFT_AVAILABLE
from peft.utils import ModulesToSaveWrapper
from torch.distributed.fsdp import FullyShardedDataParallel
from transformers.modeling_outputs import CausalLMOutputWithPast

from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix


def lce_forward(
    self,
    *args,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    labels: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    skip_logits: Optional[bool] = None,
    **kwargs,
) -> Union[Tuple, CausalLMOutputWithPast]:
    r"""
    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
            This is useful when using packed tensor format (single dimension for batch and sequence length).
    """

    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )

    return_dict = (
        return_dict if return_dict is not None else self.config.use_return_dict
    )

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        *args,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        **kwargs,
    )

    hidden_states = outputs[0]
    # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
    slice_indices = (
        slice(-logits_to_keep, None)
        if isinstance(logits_to_keep, int)
        else logits_to_keep
    )
    kept_hidden_states = hidden_states[:, slice_indices, :]

    shift_labels = kwargs.pop("shift_labels", None)
    logits = None
    loss = None

    # if in training mode, don't materialize logits
    if skip_logits and labels is None and shift_labels is None:
        raise ValueError("skip_logits is True, but labels and shift_labels are None")

    if skip_logits is None:
        # By default, if in training mode, don't materialize logits
        skip_logits = self.training and (labels is not None or shift_labels is not None)

    if skip_logits:
        loss = lce_maybe_trainable_lm_head(
            self,
            hidden_states=kept_hidden_states,
            hidden_size=self.config.hidden_size,
            labels=labels,
            shift_labels=shift_labels,
            **kwargs,
        )

    else:
        logits = self.lm_head(kept_hidden_states)
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

    if not return_dict:
        output = (logits,) + outputs[1:]
        return (loss,) + output if loss is not None else output

    return CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )


def lce_maybe_trainable_lm_head(
    self, hidden_states, hidden_size, labels, shift_labels, **loss_kwargs
):
    lm_head = self.lm_head

    # Unwrap the module if lm_head has been added as trainable module in PEFT LoRA configuration,
    # i.e. listed in the modules_to_save field of LoraConfig, so the lm_head weights are read
    # from the unwrapped module.
    # See https://huggingface.co/docs/peft/package_reference/lora for reference.
    if PEFT_AVAILABLE and isinstance(lm_head, ModulesToSaveWrapper):
        lm_head = lm_head.modules_to_save.default

    # If FSDP is used and lm_head is trainable, e.g., during full fine-tuning or with LoRA,
    # reading the lm_head module weights and calling the kernel must be done within FSDP forward pass
    # so the module entire parameters are summoned and kept in memory during the kernel execution.
    if isinstance(lm_head, FullyShardedDataParallel):
        return _FSDPForwardRedirection()(
            lm_head,
            _liger_for_causal_lm_loss,
            lm_head.module,
            hidden_states,
            hidden_size,
            labels,
            shift_labels,
            **loss_kwargs,
        )

    # FSDP is not used so we can read the lm_head weights and call the kernel directly
    return _liger_for_causal_lm_loss(
        lm_head=self.lm_head,
        hidden_states=hidden_states,
        hidden_size=hidden_size,
        labels=labels,
        shift_labels=shift_labels,
        **loss_kwargs,
    )


def _liger_for_causal_lm_loss(
    lm_head, hidden_states, hidden_size, labels, shift_labels, **loss_kwargs
):
    return LigerForCausalLMLoss(
        hidden_states=hidden_states,
        lm_head_weight=lm_head.weight,
        labels=labels,
        hidden_size=hidden_size,
        shift_labels=shift_labels,
        **loss_kwargs,
    )


def patch_lce_forward(
    model_type,
):
    try:
        # Dynamically import the module and MLP class
        module_path = f"transformers.models.{model_type}.modeling_{model_type}"
        model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type)
        module = __import__(module_path, fromlist=[f"{model_cls_prefix}ForCausalLM"])
        model_cls = getattr(module, f"{model_cls_prefix}ForCausalLM")

        model_cls.forward = lce_forward

    except (ImportError, AttributeError) as e:
        raise RuntimeError(
            f"Could not import ForCausalLM class for model_type: {model_type}. "
            f"Error: {str(e)}"
        ) from e


================================================
FILE: src/axolotl/integrations/liger/models/deepseekv2.py
================================================
"""
DeepseekV2 model with LigerFusedLinearCrossEntropyLoss
"""

from typing import List, Optional, Tuple, Union

import torch
from liger_kernel.transformers.fused_linear_cross_entropy import (
    LigerFusedLinearCrossEntropyLoss,
)
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import CausalLMOutputWithPast


def lce_forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, CausalLMOutputWithPast]:
    r"""
    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.

    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, DeepseekV2ForCausalLM

    >>> model = DeepseekV2ForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
    >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

    >>> prompt = "Hey, are you conscious? Can you talk to me?"
    >>> inputs = tokenizer(prompt, return_tensors="pt")

    >>> # Generate
    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
    ```"""
    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )
    return_dict = (
        return_dict if return_dict is not None else self.config.use_return_dict
    )

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
    )

    hidden_states = outputs[0]

    loss = None
    logits = None

    if self.training:
        shift_hidden_states = hidden_states[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        # flatten tokens
        shift_hidden_states = shift_hidden_states.view(-1, self.config.hidden_size)
        shift_labels = shift_labels.view(-1)

        lce = LigerFusedLinearCrossEntropyLoss()
        loss = lce(self.lm_head.weight, shift_hidden_states, shift_labels)
    else:
        logits = self.lm_head(hidden_states)
        logits = logits.float()

        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

    if not return_dict:
        output = (logits,) + outputs[1:]
        return (loss,) + output if loss is not None else output

    return CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )


================================================
FILE: src/axolotl/integrations/liger/models/jamba.py
================================================
"""
Jamba model with LigerFusedLinearCrossEntropyLoss
"""

from typing import Optional, Tuple, Union

import torch
from liger_kernel.transformers.fused_linear_cross_entropy import (
    LigerFusedLinearCrossEntropyLoss,
)
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import MoeCausalLMOutputWithPast
from transformers.models.jamba.modeling_jamba import (
    HybridMambaAttentionDynamicCache,
    load_balancing_loss_func,
)


def lce_forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[HybridMambaAttentionDynamicCache] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    output_router_logits: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    num_logits_to_keep: Optional[Union[int, None]] = None,
) -> Union[Tuple, MoeCausalLMOutputWithPast]:
    r"""
    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        num_logits_to_keep (`int` or `None`, *optional*):
            Calculate logits for the last `num_logits_to_keep` tokens. If `None`, calculate logits for all
            `input_ids`. Only last token logits are needed for generation, and calculating them only for that token
            can save memory, which becomes pretty significant for long sequences.

    Returns:

    Example:

    ```python
    >>> from transformers import AutoTokenizer, JambaForCausalLM

    >>> model = JambaForCausalLM.from_pretrained("ai21labs/Jamba-v0.1")
    >>> tokenizer = AutoTokenizer.from_pretrained("ai21labs/Jamba-v0.1")

    >>> prompt = "Hey, are you conscious? Can you talk to me?"
    >>> inputs = tokenizer(prompt, return_tensors="pt")

    >>> # Generate
    >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
    >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
    ```"""

    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_router_logits = (
        output_router_logits
        if output_router_logits is not None
        else self.config.output_router_logits
    )

    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )
    return_dict = (
        return_dict if return_dict is not None else self.config.use_return_dict
    )

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        output_router_logits=output_router_logits,
        cache_position=cache_position,
        return_dict=return_dict,
    )

    hidden_states = outputs[0]

    loss = None
    logits = None

    if self.training:
        shift_hidden_states = hidden_states[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        # flatten tokens
        shift_hidden_states = shift_hidden_states.view(-1, self.config.hidden_size)
        shift_labels = shift_labels.view(-1)

        lce = LigerFusedLinearCrossEntropyLoss()
        loss = lce(self.lm_head.weight, shift_hidden_states, shift_labels)
    else:
        if num_logits_to_keep is None:
            logits = self.lm_head(hidden_states)
        else:
            logits = self.lm_head(hidden_states[..., -num_logits_to_keep:, :])
        logits = logits.float()

        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)

    aux_loss = None
    if output_router_logits:
        aux_loss = load_balancing_loss_func(
            outputs.router_logits if return_dict else outputs[-1],
            self.num_experts,
            self.num_experts_per_tok,
            attention_mask,
        )
        if labels is not None:
            loss += self.router_aux_loss_coef * aux_loss.to(
                loss.device
            )  # make sure to reside in the same device

    if not return_dict:
        output = (logits,) + outputs[1:]
        if output_router_logits:
            output = (aux_loss,) + output
        return (loss,) + output if loss is not None else output

    return MoeCausalLMOutputWithPast(
        loss=loss,
        aux_loss=aux_loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
        router_logits=outputs.router_logits,
    )


================================================
FILE: src/axolotl/integrations/liger/models/llama4.py
================================================
"""
Liger FLCE for llama4
"""

import sys
from copy import deepcopy
from typing import List, Optional, Tuple, Union

import torch
from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
from transformers.modeling_outputs import CausalLMOutputWithPast


def lce_forward(
    self,
    input_ids: torch.LongTensor = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[
        Union["Cache", List[torch.FloatTensor]]  # noqa: F821
    ] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **loss_kwargs,
) -> Union[Tuple, CausalLMOutputWithPast]:
    r"""
    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
            This is useful when using packed tensor format (single dimension for batch and sequence length).

    Returns:
    """

    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )
    return_dict = (
        return_dict if return_dict is not None else self.config.use_return_dict
    )

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        return_dict=return_dict,
        cache_position=cache_position,
    )

    hidden_states = outputs[0]

    if hasattr(self.config, "pretraining_tp") and self.config.pretraining_tp > 1:
        raise Exception("Liger Kernel does not support pretraining_tp!!")

    logits = None
    loss = None
    # if in training mode, don't materialize logits
    if self.training and (labels is not None):
        loss = LigerForCausalLMLoss(
            hidden_states=hidden_states,
            lm_head_weight=self.lm_head.weight,
            labels=labels,
            hidden_size=self.config.hidden_size,
            **loss_kwargs,
        )

    else:  # if in inference mode materialize logits
        slice_indices = (
            slice(-logits_to_keep, None)
            if isinstance(logits_to_keep, int)
            else logits_to_keep
        )
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **loss_kwargs,
            )

    if not return_dict:
        output = (logits,) + outputs[1:]
        return (loss,) + output if loss is not None else output

    return CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )


def apply_liger_kernel_to_llama4(
    cross_entropy: bool = False,
    fused_linear_cross_entropy: bool = False,
    rms_norm: bool = False,
    glu_activation: bool = False,
    layer_norm: bool = False,
    **kwargs,
) -> None:
    """
    Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)

    Args:
        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
        fused_linear_cross_entropy (bool):
            Whether to apply Liger's fused linear cross entropy loss. Default is False.
            `cross_entropy` and `fused_linear_cross_entropy` cannot both be False.
            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
        glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
    """

    import transformers.models.llama4.modeling_llama4  # noqa: F401
    from liger_kernel.transformers.functional import liger_cross_entropy
    from liger_kernel.transformers.layer_norm import LigerLayerNorm
    from liger_kernel.transformers.rms_norm import LigerRMSNorm
    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP

    assert not (cross_entropy and fused_linear_cross_entropy), (
        "cross_entropy and fused_linear_cross_entropy cannot both be True."
    )

    modeling_llama4 = sys.modules["transformers.models.llama4.modeling_llama4"]

    if rms_norm:
        modeling_llama4.Llama4TextRMSNorm = LigerRMSNorm
    if glu_activation:

        def _liger_swiglu_mlp_wrapper(config, intermediate_size=None, **kwargs):
            "Accepts intermediate_size to pass to LigerSwiGLUMLP"
            # clone config to avoid modifying the original
            config = deepcopy(config)
            if intermediate_size:
                config.intermediate_size = intermediate_size
            return LigerSwiGLUMLP(config, **kwargs)

        modeling_llama4.Llama4TextMLP = _liger_swiglu_mlp_wrapper
    if layer_norm:
        modeling_llama4.nn.LayerNorm = LigerLayerNorm

    if cross_entropy:
        from transformers.loss.loss_utils import nn

        nn.functional.cross_entropy = liger_cross_entropy

    if fused_linear_cross_entropy:
        modeling_llama4.Llama4ForCausalLM.forward = lce_forward


================================================
FILE: src/axolotl/integrations/liger/models/qwen3.py
================================================
"""
Liger FLCE for Qwen3. Based on transformers v4.51.3.
"""

import sys
from typing import Optional, Tuple, Union

import torch
from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
from transformers.cache_utils import Cache
from transformers.modeling_outputs import CausalLMOutputWithPast


def lce_forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Cache] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **kwargs,
) -> Union[Tuple, CausalLMOutputWithPast]:
    r"""
    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
            This is useful when using packed tensor format (single dimension for batch and sequence length).

    Returns:
    """

    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        cache_position=cache_position,
        **kwargs,
    )

    hidden_states = outputs[0]

    logits = None
    loss = None
    # if in training mode, don't materialize logits
    if self.training and (labels is not None):
        loss = LigerForCausalLMLoss(
            hidden_states=hidden_states,
            lm_head_weight=self.lm_head.weight,
            labels=labels,
            hidden_size=self.config.hidden_size,
            **kwargs,
        )

    else:  # if in inference mode materialize logits
        slice_indices = (
            slice(-logits_to_keep, None)
            if isinstance(logits_to_keep, int)
            else logits_to_keep
        )
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

    return CausalLMOutputWithPast(
        loss=loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )


def apply_liger_kernel_to_qwen3(
    cross_entropy: bool = False,
    fused_linear_cross_entropy: bool = False,
    rms_norm: bool = False,
    glu_activation: bool = False,
    layer_norm: bool = False,
    **kwargs,
) -> None:
    """
    Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)

    Args:
        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
        fused_linear_cross_entropy (bool):
            Whether to apply Liger's fused linear cross entropy loss. Default is False.
            `cross_entropy` and `fused_linear_cross_entropy` cannot both be False.
            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
        glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
    """

    import transformers.models.qwen3.modeling_qwen3  # noqa: F401
    from liger_kernel.transformers.functional import liger_cross_entropy
    from liger_kernel.transformers.layer_norm import LigerLayerNorm
    from liger_kernel.transformers.rms_norm import LigerRMSNorm
    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP

    assert not (cross_entropy and fused_linear_cross_entropy), (
        "cross_entropy and fused_linear_cross_entropy cannot both be True."
    )

    modeling_qwen3 = sys.modules["transformers.models.qwen3.modeling_qwen3"]

    if rms_norm:
        modeling_qwen3.Qwen3RMSNorm = LigerRMSNorm

    if glu_activation:
        modeling_qwen3.Qwen3MLP = LigerSwiGLUMLP

    if layer_norm:
        modeling_qwen3.nn.LayerNorm = LigerLayerNorm

    if cross_entropy:
        from transformers.loss.loss_utils import nn

        nn.functional.cross_entropy = liger_cross_entropy

    if fused_linear_cross_entropy:
        modeling_qwen3.Qwen3ForCausalLM.forward = lce_forward


================================================
FILE: src/axolotl/integrations/liger/models/qwen3_moe.py
================================================
"""
Liger FLCE for Qwen3 MoE. Based on transformers v4.51.3.
"""

import sys
from copy import deepcopy
from typing import List, Optional, Union

import torch
from liger_kernel.transformers.model.loss_utils import LigerForCausalLMLoss
from transformers.modeling_outputs import MoeCausalLMOutputWithPast
from transformers.models.qwen3_moe.modeling_qwen3_moe import load_balancing_loss_func


def lce_forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[List[torch.FloatTensor]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    labels: Optional[torch.LongTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    output_router_logits: Optional[bool] = None,
    cache_position: Optional[torch.LongTensor] = None,
    logits_to_keep: Union[int, torch.Tensor] = 0,
    **kwargs,
) -> MoeCausalLMOutputWithPast:
    r"""
    Args:
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        logits_to_keep (`int` or `torch.Tensor`, *optional*):
            If an `int`, compute logits for the last `logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
            If a `torch.Tensor`, must be 1D corresponding to the indices to keep in the sequence length dimension.
            This is useful when using packed tensor format (single dimension for batch and sequence length).

    Returns:
    """

    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_router_logits = (
        output_router_logits
        if output_router_logits is not None
        else self.config.output_router_logits
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )

    # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
    outputs = self.model(
        input_ids=input_ids,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_values=past_key_values,
        inputs_embeds=inputs_embeds,
        use_cache=use_cache,
        output_attentions=output_attentions,
        output_hidden_states=output_hidden_states,
        output_router_logits=output_router_logits,
        cache_position=cache_position,
        **kwargs,
    )

    hidden_states = outputs[0]

    logits = None
    loss = None
    # if in training mode, don't materialize logits
    if self.training and (labels is not None):
        loss = LigerForCausalLMLoss(
            hidden_states=hidden_states,
            lm_head_weight=self.lm_head.weight,
            labels=labels,
            hidden_size=self.config.hidden_size,
            **kwargs,
        )

    else:  # if in inference mode materialize logits
        slice_indices = (
            slice(-logits_to_keep, None)
            if isinstance(logits_to_keep, int)
            else logits_to_keep
        )
        logits = self.lm_head(hidden_states[:, slice_indices, :])
        if labels is not None:
            loss = self.loss_function(
                logits=logits,
                labels=labels,
                vocab_size=self.config.vocab_size,
                **kwargs,
            )

    aux_loss = None
    if output_router_logits:
        aux_loss = load_balancing_loss_func(
            outputs.router_logits,
            self.num_experts,
            self.num_experts_per_tok,
            attention_mask,
        )
        if labels is not None:
            loss += self.router_aux_loss_coef * aux_loss.to(
                loss.device
            )  # make sure to reside in the same device

    return MoeCausalLMOutputWithPast(
        loss=loss,
        aux_loss=aux_loss,
        logits=logits,
        past_key_values=outputs.past_key_values,
        hidden_states=outputs.hidden_states,
        attentions=outputs.attentions,
    )


def apply_liger_kernel_to_qwen3_moe(
    cross_entropy: bool = False,
    fused_linear_cross_entropy: bool = False,
    rms_norm: bool = False,
    glu_activation: bool = False,
    layer_norm: bool = False,
    **kwargs,
) -> None:
    """
    Apply Liger kernels to replace original implementation in HuggingFace Llama models (2 and 3)

    Args:
        cross_entropy (bool): Whether to apply Liger's cross entropy loss. Default is False.
        fused_linear_cross_entropy (bool):
            Whether to apply Liger's fused linear cross entropy loss. Default is False.
            `cross_entropy` and `fused_linear_cross_entropy` cannot both be False.
            If `fused_linear_cross_entropy` is True, the logits will not be materialized but more memory efficient.
        rms_norm (bool): Whether to apply Liger's RMSNorm. Default is False.
        glu_activation (bool): Whether to apply Liger's SwiGLU MLP. Default is False.
        layer_norm (bool): Whether to apply Liger's LayerNorm. Default is False.
    """

    import transformers.models.qwen3_moe.modeling_qwen3_moe  # noqa: F401
    from liger_kernel.transformers.functional import liger_cross_entropy
    from liger_kernel.transformers.layer_norm import LigerLayerNorm
    from liger_kernel.transformers.rms_norm import LigerRMSNorm
    from liger_kernel.transformers.swiglu import LigerSwiGLUMLP

    assert not (cross_entropy and fused_linear_cross_entropy), (
        "cross_entropy and fused_linear_cross_entropy cannot both be True."
    )

    modeling_qwen3_moe = sys.modules["transformers.models.qwen3_moe.modeling_qwen3_moe"]

    if rms_norm:
        modeling_qwen3_moe.Qwen3MoeRMSNorm = LigerRMSNorm

    if glu_activation:

        def _liger_swiglu_mlp_wrapper(config, intermediate_size=None, **kwargs):
            "Accepts intermediate_size to pass to LigerSwiGLUMLP"
            # clone config to avoid modifying the original
            config = deepcopy(config)
            if intermediate_size:
                config.intermediate_size = intermediate_size
            return LigerSwiGLUMLP(config, **kwargs)

        modeling_qwen3_moe.Qwen3MoeMLP = _liger_swiglu_mlp_wrapper

    if layer_norm:
        modeling_qwen3_moe.nn.LayerNorm = LigerLayerNorm

    if cross_entropy:
        from transformers.loss.loss_utils import nn

        nn.functional.cross_entropy = liger_cross_entropy

    if fused_linear_cross_entropy:
        modeling_qwen3_moe.Qwen3MoeForCausalLM.forward = lce_forward


================================================
FILE: src/axolotl/integrations/liger/plugin.py
================================================
"""
Liger-Kernel Plugin for Axolotl
"""

import inspect
import sys

from axolotl.integrations.base import BasePlugin
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


class LigerPlugin(BasePlugin):
    """
    Plugin for LIGER integraton with Axolotl.
    """

    def get_input_args(self):
        return "axolotl.integrations.liger.LigerArgs"

    def pre_model_load(self, cfg):
        # shim: liger-kernel 0.7.0 imports ORPOTrainer from old trl path
        import trl.trainer
        from trl.experimental.orpo import ORPOTrainer

        trl.trainer.ORPOTrainer = ORPOTrainer

        if cfg.torch_compile:
            # torch compile will unnecessarily attempt to optimize the triton kernel unless explicitly disabled
            import liger_kernel.ops.fused_linear_cross_entropy

            from .utils import patch_with_compile_disable

            patch_with_compile_disable(
                liger_kernel.ops.fused_linear_cross_entropy,
                "fused_linear_cross_entropy_forward",
            )
            patch_with_compile_disable(
                liger_kernel.ops.fused_linear_cross_entropy,
                "fused_linear_cross_entropy_backward",
            )

        from liger_kernel.transformers.cross_entropy import LigerCrossEntropyLoss
        from liger_kernel.transformers.functional import liger_cross_entropy
        from liger_kernel.transformers.layer_norm import LigerLayerNorm
        from liger_kernel.transformers.monkey_patch import MODEL_TYPE_TO_APPLY_LIGER_FN
        from liger_kernel.transformers.rms_norm import LigerRMSNorm
        from liger_kernel.transformers.rope import liger_rotary_pos_emb
        from liger_kernel.transformers.swiglu import LigerSwiGLUMLP

        if cfg.liger_cross_entropy and cfg.liger_fused_linear_cross_entropy:
            raise ValueError(
                "Cannot have both `liger_cross_entropy` and `liger_fused_linear_cross_entropy` set."
            )

        if cfg.liger_use_token_scaling:
            # Patch FLCE to set token_scaling=True for function and class API
            from liger_kernel.transformers import functional
            from liger_kernel.transformers.fused_linear_cross_entropy import (
                LigerFusedLinearCrossEntropyLoss,
            )

            old_liger_fused_linear_cross_entropy = (
                functional.liger_fused_linear_cross_entropy
            )

            def patched_liger_fused_linear_cross_entropy(*args, **kwargs):
                kwargs["use_token_scaling"] = True
                return old_liger_fused_linear_cross_entropy(*args, **kwargs)

            functional.liger_fused_linear_cross_entropy = (
                patched_liger_fused_linear_cross_entropy
            )

            old_init = LigerFusedLinearCrossEntropyLoss.__init__

            def patched_init(self, *args, **kwargs):
                kwargs["use_token_scaling"] = True
                return old_init(self, *args, **kwargs)

            LigerFusedLinearCrossEntropyLoss.__init__ = patched_init

        if cfg.model_config_type in MODEL_TYPE_TO_APPLY_LIGER_FN:
            apply_liger_fn = MODEL_TYPE_TO_APPLY_LIGER_FN[cfg.model_config_type]
            liger_fn_sig = inspect.signature(apply_liger_fn)
            kwargs = {}
            if "rope" in liger_fn_sig.parameters:
                kwargs["rope"] = cfg.liger_rope
            if "cross_entropy" in liger_fn_sig.parameters:
                kwargs["cross_entropy"] = cfg.liger_cross_entropy
            if "fused_linear_cross_entropy" in liger_fn_sig.parameters:
                kwargs["fused_linear_cross_entropy"] = (
                    cfg.liger_fused_linear_cross_entropy
                )
            if "rms_norm" in liger_fn_sig.parameters:
                kwargs["rms_norm"] = cfg.liger_rms_norm
            if "layer_norm" in liger_fn_sig.parameters:
                kwargs["layer_norm"] = cfg.liger_layer_norm
            if "geglu" in liger_fn_sig.parameters:
                kwargs["geglu"] = cfg.liger_glu_activation
            elif "swiglu" in liger_fn_sig.parameters:
                kwargs["swiglu"] = cfg.liger_glu_activation
            LOG.info(f"Applying LIGER to {cfg.model_config_type} with kwargs: {kwargs}")
            apply_liger_fn(**kwargs)
        elif cfg.model_config_type == "jamba":
            from transformers.models.jamba import modeling_jamba

            from .models.jamba import lce_forward as jamba_lce_forward

            if cfg.liger_rope:
                modeling_jamba.apply_rotary_pos_emb = liger_rotary_pos_emb
            if cfg.liger_rms_norm:
                modeling_jamba.JambaRMSNorm = LigerRMSNorm
            if cfg.liger_glu_activation:
                modeling_jamba.JambaMLP = LigerSwiGLUMLP
            if cfg.liger_layer_norm:
                modeling_jamba.nn.LayerNorm = LigerLayerNorm
            if cfg.liger_cross_entropy:
                from transformers.loss.loss_utils import nn

                nn.functional.cross_entropy = liger_cross_entropy
            if cfg.liger_fused_linear_cross_entropy:
                modeling_jamba.JambaForCausalLM.forward = jamba_lce_forward
        elif cfg.model_config_type == "deepseek_v2":
            from accelerate import init_empty_weights
            from transformers import AutoModelForCausalLM

            with init_empty_weights():
                model = AutoModelForCausalLM.from_pretrained(
                    cfg.base_model, trust_remote_code=cfg.trust_remote_code or False
                )
                modeling_mod = sys.modules[model.__class__.__module__]

            from .models.deepseekv2 import lce_forward as deepseekv2_lce_forward

            if cfg.liger_rope:
                # The DeepseekV2 version of RoPE is different than upstream LLaMA.
                # See https://github.com/linkedin/Liger-Kernel/issues/129#issuecomment-2313763528
                LOG.warning("Fused liger_rope is not supported for DeepseekV2.")
            if cfg.liger_rms_norm:
                modeling_mod.DeepseekV2RMSNorm = LigerRMSNorm
            if cfg.liger_glu_activation:
                modeling_mod.DeepseekV2MLP.forward = LigerSwiGLUMLP.forward
            if cfg.liger_layer_norm:
                LOG.warning("liger_layer_norm is not supported for DeepseekV2.")
            if cfg.liger_cross_entropy:
                # We do not patch `nn.functional.cross_entropy` for DeepseekV2 as it still uses
                # nn.CrossEntropyLoss in the forward method.
                modeling_mod.CrossEntropyLoss = LigerCrossEntropyLoss
            if cfg.liger_fused_linear_cross_entropy:
                modeling_mod.DeepseekV2ForCausalLM.forward = deepseekv2_lce_forward
        elif cfg.model_config_type == "llama4":
            from axolotl.integrations.liger.models.llama4 import (
                apply_liger_kernel_to_llama4,
            )

            apply_liger_kernel_to_llama4(
                cross_entropy=cfg.liger_cross_entropy,
                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
                glu_activation=cfg.liger_glu_activation,
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
        elif cfg.model_config_type == "qwen3":
            from axolotl.integrations.liger.models.qwen3 import (
                apply_liger_kernel_to_qwen3,
            )

            apply_liger_kernel_to_qwen3(
                cross_entropy=cfg.liger_cross_entropy,
                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
                glu_activation=cfg.liger_glu_activation,
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
        elif cfg.model_config_type == "qwen3_moe":
            from axolotl.integrations.liger.models.qwen3_moe import (
                apply_liger_kernel_to_qwen3_moe,
            )

            apply_liger_kernel_to_qwen3_moe(
                cross_entropy=cfg.liger_cross_entropy,
                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
                glu_activation=cfg.liger_glu_activation,
                rms_norm=cfg.liger_rms_norm,
                layer_norm=cfg.liger_layer_norm,
            )
        elif cfg.model_config_type == "granitemoe":
            from liger_kernel.transformers import apply_liger_kernel_to_granite

            apply_liger_kernel_to_granite(
                rope=cfg.liger_rope,
                cross_entropy=cfg.liger_cross_entropy,
                fused_linear_cross_entropy=cfg.liger_fused_linear_cross_entropy,
                rms_norm=cfg.liger_rms_norm,
                swiglu=cfg.liger_glu_activation,
            )
        elif cfg.liger_fused_linear_cross_entropy:
            try:
                from .models.base import patch_lce_forward

                patch_lce_forward(cfg.model_config_type)
                LOG.warning_once(
                    f"Applied ONLY liger_fused_linear_cross_entropy genericpatches for model type: {cfg.model_config_type}"
                )
                LOG.warning_once(
                    f"Liger + {cfg.model_config_type} generic FLCE support is experimental and may not work as expected."
                )
            except RuntimeError:
                LOG.warning(
                    f"Unsupported model config type: {cfg.model_config_type}. Liger not applied."
                )
        else:
            LOG.warning(
                f"Unsupported model config type: {cfg.model_config_type}. Liger not applied."
            )


================================================
FILE: src/axolotl/integrations/liger/utils.py
================================================
"""
utils to patch liger kernel ops to disable torch.compile
"""

from functools import wraps

import torch


def patch_with_compile_disable(module, function_name):
    """
    Patch a function in a module by wrapping it with torch.compile.disable

    Args:
        module: The module containing the function to patch
        function_name: The name of the function to patch
    """
    original_function = getattr(module, function_name)

    @wraps(original_function)
    @torch.compiler.disable
    def wrapped_function(*args, **kwargs):
        return original_function(*args, **kwargs)

    # Replace the original function with the wrapped one
    setattr(module, function_name, wrapped_function)

    # Return the original function in case you need to restore it later
    return original_function


================================================
FILE: src/axolotl/integrations/llm_compressor/README.md
================================================
# LLMCompressor Integration

Fine-tune sparsified models in Axolotl using Neural Magic's [LLMCompressor](https://github.com/vllm-project/llm-compressor).

This integration enables fine-tuning of models sparsified using LLMCompressor within the Axolotl training framework. By combining LLMCompressor's model compression capabilities with Axolotl's distributed training pipelines, users can efficiently fine-tune sparse models at scale.

It uses Axolotl’s plugin system to hook into the fine-tuning flows while maintaining sparsity throughout training.

---

## Requirements

- Axolotl with `llmcompressor` extras:

  ```bash
  pip install "axolotl[llmcompressor]"
  ```

- Requires `llmcompressor >= 0.5.1`

This will install all necessary dependencies to fine-tune sparsified models using the integration.

---

## Usage

To enable sparse fine-tuning with this integration, include the plugin in your Axolotl config:

```yaml
plugins:
  - axolotl.integrations.llm_compressor.LLMCompressorPlugin

llmcompressor:
  recipe:
    finetuning_stage:
      finetuning_modifiers:
        ConstantPruningModifier:
          targets: [
            're:.*q_proj.weight',
            're:.*k_proj.weight',
            're:.*v_proj.weight',
            're:.*o_proj.weight',
            're:.*gate_proj.weight',
            're:.*up_proj.weight',
            're:.*down_proj.weight',
          ]
          start: 0
  save_compressed: true
# ... (other training arguments)
```

This plugin **does not apply pruning or sparsification itself** — it is intended for **fine-tuning models that have already been sparsified**.

Pre-sparsified checkpoints can be:
- Generated using [LLMCompressor](https://github.com/vllm-project/llm-compressor)
- Downloaded from [Neural Magic's Hugging Face page](https://huggingface.co/neuralmagic)
- Any custom LLM with compatible sparsity patterns that you've created yourself

To learn more about writing and customizing LLMCompressor recipes, refer to the official documentation:
[https://github.com/vllm-project/llm-compressor/blob/main/README.md](https://github.com/vllm-project/llm-compressor/blob/main/README.md)

### Storage Optimization with save_compressed

Setting `save_compressed: true` in your configuration enables saving models in a compressed format, which:
- Reduces disk space usage by approximately 40%
- Maintains compatibility with vLLM for accelerated inference
- Maintains compatibility with llmcompressor for further optimization (example: quantization)

This option is highly recommended when working with sparse models to maximize the benefits of model compression.

### Example Config

See [`examples/llama-3/sparse-finetuning.yaml`](examples/llama-3/sparse-finetuning.yaml) for a complete example.

---

## Inference with vLLM

After fine-tuning your sparse model, you can leverage vLLM for efficient inference.
You can also use LLMCompressor to apply additional quantization to your fine-tuned
sparse model before inference for even greater performance benefits.:

```python
from vllm import LLM, SamplingParams

prompts = [
    "Hello, my name is",
    "The president of the United States is",
    "The capital of France is",
    "The future of AI is",
]
sampling_params = SamplingParams(temperature=0.8, top_p=0.95)
llm = LLM("path/to/your/sparse/model")
outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
```

For more details on vLLM's capabilities and advanced configuration options, see the [official vLLM documentation](https://docs.vllm.ai/).

## Learn More

For details on available sparsity and quantization schemes, fine-tuning recipes, and usage examples, visit the official LLMCompressor repository:

[https://github.com/vllm-project/llm-compressor](https://github.com/vllm-project/llm-compressor)


================================================
FILE: src/axolotl/integrations/llm_compressor/__init__.py
================================================
"""Integration entry point for the LLMCompressor plugin."""

from .plugin import LLMCompressorPlugin

__all__ = ["LLMCompressorPlugin"]


================================================
FILE: src/axolotl/integrations/llm_compressor/args.py
================================================
"""
LLMCompressor and Sparse Finetuning config models.
"""

from typing import Any

from pydantic import BaseModel, Field
from typing_extensions import Annotated


class CompressionArgs(BaseModel):
    """Sparse Finetuning config for LLMCompressor."""

    # Typing for recipe is set to Any due to:
    # https://github.com/vllm-project/llm-compressor/issues/1319
    recipe: Annotated[
        Any,
        Field(
            description="The recipe containing the compression algorithms and hyperparameters to apply."
        ),
    ]

    save_compressed: Annotated[
        bool,
        Field(
            default=False,
            description="Whether to save the compressed model after training.",
        ),
    ]


class LLMCompressorArgs(BaseModel):
    """LLMCompressor configuration BaseModel."""

    llmcompressor: Annotated[
        CompressionArgs,
        Field(
            description="Arguments enabling compression pathways through the LLM Compressor plugins"
        ),
    ]


================================================
FILE: src/axolotl/integrations/llm_compressor/plugin.py
================================================
"""
Sparse Finetuning plugin for Axolotl — enables handling of sparse neural networks
by maintaining masks for zero weights during training.
"""

from functools import wraps
from typing import Any, Callable, Concatenate, ParamSpec, TypeVar

from llmcompressor import active_session, create_session
from llmcompressor.core import callbacks as session_callbacks
from llmcompressor.recipe import Recipe
from torch.nn import Module
from transformers.trainer import Trainer
from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState
from transformers.training_args import TrainingArguments

from axolotl.integrations.base import BasePlugin
from axolotl.utils.logging import get_logger

P = ParamSpec("P")  # Params for generic function signatures
R = TypeVar("R")  # Return type for generic function signatures

LOG = get_logger(__name__)


class LLMCompressorCallbackHandler(TrainerCallback):
    """
    Trainer callback for Sparse Finetuning.
    Maintains sparsity patterns during training by applying masks after optimization steps,
    ensuring zero-weight updates are canceled out.
    """

    def __init__(self, trainer: Trainer, recipe: Any):
        """
        Initialize the Sparse Finetuning callback handler.

        Args:
            trainer (Trainer): Huggingface Trainer instance.
            recipe (Recipe | dict): Sparse finetuning recipe to apply.
        """
        super().__init__()
        self.trainer = trainer
        self.recipe = (
            Recipe.model_validate(recipe) if not isinstance(recipe, Recipe) else recipe
        )
        self.original_compute_loss = trainer.compute_loss
        self.trainer.compute_loss = compute_loss_wrapper(self.trainer.compute_loss)
        create_session()

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the beginning of training. Initializes the compression session.

        Args:
            args (TrainingArguments): Training arguments.
            state (TrainerState): Trainer state.
            control (TrainerControl): Trainer control.
        """
        super().on_train_begin(args, state, control, **kwargs)
        self.trainer.accelerator.wait_for_everyone()
        active_session().initialize(
            model=self.trainer.model,
            optimizer=self.trainer.optimizer,
            start=state.epoch,
            recipe=self.recipe,
        )
        self.trainer.accelerator.wait_for_everyone()

    def on_step_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the beginning of a training step. Triggers batch_start callback.
        """
        super().on_step_begin(args, state, control, **kwargs)
        session_callbacks.batch_start()

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the end of a training step. Triggers optimizer and batch_end callbacks.
        """
        super().on_step_end(args, state, control, **kwargs)
        session_callbacks.optim_pre_step()
        session_callbacks.optim_post_step()
        session_callbacks.batch_end()

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ) -> None:
        """
        Called at the end of training. Finalizes the compression session.
        """
        super().on_train_end(args, state, control, **kwargs)
        active_session().finalize()
        self.trainer.compute_loss_func = self.original_compute_loss


class LLMCompressorPlugin(BasePlugin):
    """
    Sparse Finetuning plugin for Axolotl integration.
    """

    def get_input_args(self) -> str:
        """
        Returns the path to the plugin's argument definition.

        Returns:
            str: Dotted path to the LLMCompressorArgs class.
        """
        return "axolotl.integrations.llm_compressor.args.LLMCompressorArgs"

    def add_callbacks_post_trainer(self, cfg: Any, trainer: Trainer) -> list:
        """
        Adds Sparse Finetuning callback to the Trainer instance.

        Args:
            cfg (Any): Configuration object containing the sparse recipe.
            trainer (Trainer): Huggingface Trainer instance.

        Returns:
            list: List containing the configured callback instances.
        """
        LOG.info("Adding Sparse Finetuning callback to the trainer")
        callback = LLMCompressorCallbackHandler(
            trainer=trainer,
            recipe=cfg.llmcompressor.recipe,
        )
        return [callback]


def compute_loss_wrapper(
    compute_loss_func: Callable[Concatenate[Module, P], R],
) -> Callable[Concatenate[Module, P], R]:
    """
    Wraps the loss computation function to trigger the loss_calculated callback.

    Args:
        compute_loss_func (Callable): Original loss computation function.

    Returns:
        Callable: Wrapped function that also invokes the loss_calculated callback.
    """

    @wraps(compute_loss_func)
    def compute_and_notify(model: Module, *args: P.args, **kwargs: P.kwargs) -> R:
        loss = compute_loss_func(model, *args, **kwargs)
        if active_session().lifecycle.initialized_ and model.training:
            session_callbacks.loss_calculated(loss=loss)
        return loss

    return compute_and_notify


================================================
FILE: src/axolotl/integrations/llm_compressor/utils.py
================================================
"""Utilities for llmcompressor integration with axolotl."""

from typing import Union

from llmcompressor.transformers.sparsification.compressed_tensors_utils import (
    modify_save_pretrained,
)
from transformers import PreTrainedModel, Trainer


def save_compressed_model(
    model: PreTrainedModel,
    output_dir: Union[str, bytes],
    trainer: Trainer,
    save_compressed: bool = False,
) -> None:
    """
    Synchronize processes, apply compression hooks, and save the model.

    Args:
        model (PreTrainedModel): The model to be saved.
        output_dir (str or bytes): Path where the model files will be written.
        trainer (Trainer): Hugging Face Trainer for process synchronization.
        save_compressed (bool): Write compressed tensors if True.
    """
    trainer.accelerator.wait_for_everyone()

    # Only the main process writes the files
    if not trainer.accelerator.is_main_process:
        return

    modify_save_pretrained(model)
    model.save_pretrained(
        output_dir,
        save_compressed=save_compressed,
        skip_sparsity_compression_stats=not save_compressed,
    )


================================================
FILE: src/axolotl/integrations/lm_eval/README.md
================================================
# LM Eval Harness

Run evaluation on model using the popular lm-evaluation-harness library.

See https://github.com/EleutherAI/lm-evaluation-harness

## Usage

There are two ways to use the LM Eval integration:

### 1. Post-Training Evaluation

When training with the plugin enabled, evaluation runs automatically after training completes:

```yaml
plugins:
  - axolotl.integrations.lm_eval.LMEvalPlugin

lm_eval_tasks:
  - gsm8k
  - hellaswag
  - arc_easy

lm_eval_batch_size: # Batch size for evaluation

# Directory to save evaluation results.
# The final model is loaded from this directory
# unless specified otherwise (see below)
output_dir:
```

Run training as usual:
```bash
axolotl train config.yml
```

### 2. Standalone CLI Evaluation

Evaluate any model directly without training:

```yaml
lm_eval_model: meta-llama/Llama-2-7b-hf

plugins:
  - axolotl.integrations.lm_eval.LMEvalPlugin

lm_eval_tasks:
  - gsm8k
  - hellaswag
  - arc_easy

lm_eval_batch_size: 8
output_dir: ./outputs
```

Run evaluation:
```bash
axolotl lm-eval config.yml
```

## Model Selection Priority

The model to evaluate is selected in the following priority order:

1. **`lm_eval_model`** - Explicit model path or HuggingFace repo (highest priority)
2. **`hub_model_id`** - Trained model pushed to HuggingFace Hub
3. **`output_dir`** - Local checkpoint directory containing trained model weights

## Citation

```bib
@misc{eval-harness,
  author       = {Gao, Leo and Tow, Jonathan and Abbasi, Baber and Biderman, Stella and Black, Sid and DiPofi, Anthony and Foster, Charles and Golding, Laurence and Hsu, Jeffrey and Le Noac'h, Alain and Li, Haonan and McDonell, Kyle and Muennighoff, Niklas and Ociepa, Chris and Phang, Jason and Reynolds, Laria and Schoelkopf, Hailey and Skowron, Aviya and Sutawika, Lintang and Tang, Eric and Thite, Anish and Wang, Ben and Wang, Kevin and Zou, Andy},
  title        = {A framework for few-shot language model evaluation},
  month        = 07,
  year         = 2024,
  publisher    = {Zenodo},
  version      = {v0.4.3},
  doi          = {10.5281/zenodo.12608602},
  url          = {https://zenodo.org/records/12608602}
}
```


================================================
FILE: src/axolotl/integrations/lm_eval/__init__.py
================================================
"""
Module for the Plugin for LM Eval Harness
"""

import subprocess  # nosec

from axolotl.integrations.base import BasePlugin
from axolotl.integrations.lm_eval.cli import build_lm_eval_command, get_model_path

from .args import LMEvalArgs as LMEvalArgs


class LMEvalPlugin(BasePlugin):
    """
    Plugin for LM Evaluation Harness integraton with Axolotl.
    """

    def get_input_args(self):
        return "axolotl.integrations.lm_eval.LMEvalArgs"

    def post_train_unload(self, cfg):
        if cfg.lm_eval_post_train:
            for lm_eval_args in build_lm_eval_command(
                cfg.lm_eval_tasks,
                bfloat16=cfg.bfloat16 or cfg.bf16,
                flash_attention=cfg.flash_attention,
                output_dir=cfg.output_dir,
                batch_size=cfg.lm_eval_batch_size,
                wandb_project=cfg.wandb_project,
                wandb_entity=cfg.wandb_entity,
                wandb_name=cfg.wandb_name,
                model=get_model_path(cfg),
            ):
                subprocess.run(  # nosec
                    lm_eval_args,
                    check=True,
                )


================================================
FILE: src/axolotl/integrations/lm_eval/args.py
================================================
"""
Module for handling lm eval harness input arguments.
"""

from typing import List, Optional

from pydantic import BaseModel


class LMEvalArgs(BaseModel):
    """
    Input args for lm eval harness
    """

    lm_eval_tasks: List[str] = []
    lm_eval_batch_size: Optional[int] = 8
    lm_eval_post_train: Optional[bool] = True
    lm_eval_model: Optional[str] = None


================================================
FILE: src/axolotl/integrations/lm_eval/cli.py
================================================
"""
axolotl CLI for running lm_eval tasks
"""

import subprocess  # nosec
from collections import defaultdict
from datetime import datetime
from typing import Optional

import click
import yaml

from axolotl.utils.dict import DictDefault


def get_model_path(cfg: DictDefault) -> str | None:
    """
    Determine which model path to use for evaluation.

    Priority order (highest to lowest):
    1. lm_eval_model - Explicit model path override
    2. hub_model_id - Model pushed to HuggingFace Hub
    3. None - Falls back to output_dir in build_lm_eval_command

    Returns:
        Model path string or None to use output_dir fallback
    """
    return cfg.lm_eval_model or cfg.hub_model_id or None


def build_lm_eval_command(
    tasks: list[str],
    bfloat16=True,
    flash_attention=False,
    output_dir="./",
    batch_size=8,
    wandb_project=None,
    wandb_entity=None,
    wandb_name=None,
    model=None,
    revision=None,
    apply_chat_template=None,
    fewshot_as_multiturn=None,
):
    tasks_by_num_fewshot: dict[str, list] = defaultdict(list)
    if isinstance(tasks, str):
        tasks = [tasks]
    for task in tasks:
        num_fewshot = "-1"
        task_parts = task.split(":")
        task_name = task_parts[0]
        if len(task_parts) == 2:
            task_name, num_fewshot = task_parts
        tasks_by_num_fewshot[str(num_fewshot)].append(task_name)

    for num_fewshot, tasks_list in tasks_by_num_fewshot.items():
        tasks_str = ",".join(tasks_list)
        num_fewshot_val = num_fewshot if num_fewshot != "-1" else None
        pretrained = "pretrained="
        pretrained += model if model else output_dir
        fa2 = ",attn_implementation=flash_attention_2" if flash_attention else ""
        dtype = ",dtype=bfloat16" if bfloat16 else ",dtype=float16"
        revision = f",revision={revision}" if revision else ""
        output_path = output_dir
        output_path += "" if output_dir.endswith("/") else "/"
        output_path += "lm_eval_results/" + datetime.now().strftime("%Y%m%d_%H%M%S")
        lm_eval_args = [
            "lm_eval",
            "--model",
            "hf",
            "--model_args",
            f"{pretrained}{fa2}{dtype}{revision}",
            "--tasks",
            tasks_str,
            "--batch_size",
            str(batch_size),
            "--output_path",
            output_path,
        ]
        wandb_args = []
        if wandb_project:
            wandb_args.append(f"project={wandb_project}")
        if wandb_entity:
            wandb_args.append(f"entity={wandb_entity}")
        if wandb_name:
            wandb_args.append(f"name={wandb_name}")
        if wandb_args:
            lm_eval_args.append("--wandb_args")
            lm_eval_args.append(",".join(wandb_args))
        if apply_chat_template:
            lm_eval_args.append("--apply_chat_template")
        if num_fewshot_val:
            lm_eval_args.append("--num_fewshot")
            lm_eval_args.append(str(num_fewshot_val))
            if apply_chat_template and fewshot_as_multiturn:
                lm_eval_args.append("--fewshot_as_multiturn")

        yield lm_eval_args


@click.command()
@click.argument("config", type=click.Path(exists=True, path_type=str))
@click.option("--cloud", default=None, type=click.Path(exists=True, path_type=str))
def lm_eval(config: str, cloud: Optional[str] = None):
    """
    use lm eval to evaluate a trained language model
    """

    if cloud:
        from axolotl.cli.cloud import do_cli_lm_eval

        do_cli_lm_eval(cloud_config=cloud, config=config)
    else:
        with open(config, encoding="utf-8") as file:
            cfg: DictDefault = DictDefault(yaml.safe_load(file))

        for lm_eval_args in build_lm_eval_command(
            cfg.lm_eval_tasks,
            bfloat16=cfg.bfloat16 or cfg.bf16,
            flash_attention=cfg.flash_attention,
            output_dir=cfg.output_dir,
            batch_size=cfg.lm_eval_batch_size,
            wandb_project=cfg.wandb_project,
            wandb_entity=cfg.wandb_entity,
            wandb_name=cfg.wandb_name,
            model=get_model_path(cfg),
            revision=cfg.revision,
            apply_chat_template=cfg.apply_chat_template,
            fewshot_as_multiturn=cfg.fewshot_as_multiturn,
        ):
            subprocess.run(  # nosec
                lm_eval_args,
                check=True,
            )


================================================
FILE: src/axolotl/integrations/spectrum/LICENSE
================================================

                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!)  The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.


================================================
FILE: src/axolotl/integrations/spectrum/README.md
================================================
# Spectrum: Targeted Training on Signal to Noise Ratio

by Eric Hartford, Lucas Atkins, Fernando Fernandes, David Golchinfar

This plugin contains code to freeze the bottom fraction of modules in a model, based on the Signal-to-Noise Ratio (SNR).

See https://github.com/cognitivecomputations/spectrum

## Overview

Spectrum is a tool for scanning and evaluating the Signal-to-Noise Ratio (SNR) of layers in large language models.
By identifying the top n% of layers with the highest SNR, you can optimize training efficiency.

## Usage

```yaml
plugins:
  - axolotl.integrations.spectrum.SpectrumPlugin

spectrum_top_fraction: 0.5
# Optional if using a pre-scanned model as your base_model. Useful if using a model mirror
spectrum_model_name: meta-llama/Meta-Llama-3.1-8B
```

## Citation

```bib
@misc{hartford2024spectrumtargetedtrainingsignal,
      title={Spectrum: Targeted Training on Signal to Noise Ratio},
      author={Eric Hartford and Lucas Atkins and Fernando Fernandes Neto and David Golchinfar},
      year={2024},
      eprint={2406.06623},
      archivePrefix={arXiv},
      primaryClass={cs.LG},
      url={https://arxiv.org/abs/2406.06623},
}
```


================================================
FILE: src/axolotl/integrations/spectrum/__init__.py
================================================
# Copyright 2024 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Spectrum Plugin to automatically generate unfrozen parameters based on SNR data.
"""

import json

import requests

from axolotl.integrations.base import BasePlugin
from axolotl.utils.logging import get_logger

from .args import SpectrumArgs as SpectrumArgs

LOG = get_logger(__name__)


def _generate_unfrozen_params_yaml(snr_data, top_fraction=0.5):
    unfrozen_parameters = {}
    for layer_name, info in snr_data.items():
        layer_type = info["type"]
        if layer_type not in unfrozen_parameters:
            unfrozen_parameters[layer_type] = []
        unfrozen_parameters[layer_type].append((layer_name, info["snr"]))
    top_layers_by_type = {}
    for layer_type, layers in unfrozen_parameters.items():
        layers_sorted = sorted(layers, key=lambda x: x[1], reverse=True)
        num_top_layers = int(len(layers) * top_fraction)
        top_layers_by_type[layer_type] = [
            layer[0] for layer in layers_sorted[:num_top_layers]
        ]
    unfrozen_parameters = [
        "^lm_head.weight$",
        "^model.embed_tokens.weight$",
    ]
    for _, layer_names in top_layers_by_type.items():
        for layer_name in layer_names:
            unfrozen_parameters.append(layer_name)
    return unfrozen_parameters


class SpectrumPlugin(BasePlugin):
    """
    Spectrum Plugin to automatically generate unfrozen parameters based on SNR data.
    """

    base_url = "https://raw.githubusercontent.com/cognitivecomputations/spectrum/main/model_snr_results/"
    base_path = "./model_snr_results/"
    snr_file_template = "snr_results_{model_name_slug}.json"

    def get_input_args(self):
        return "axolotl.integrations.spectrum.SpectrumArgs"

    def pre_model_load(self, cfg):
        if cfg.get("spectrum_model_name"):
            model_name = cfg["spectrum_model_name"]
        else:
            model_name = cfg["base_model"]
        top_fraction = cfg.get("spectrum_top_fraction", 50)
        model_slug = model_name.replace("/", "-").replace("_", "-")
        snr_url = self.base_url + self.snr_file_template.format(
            model_name_slug=model_slug
        )
        snr_path = self.base_path + self.snr_file_template.format(
            model_name_slug=model_slug
        )
        # first check if the files exist locally and read the json
        snr_data = None
        try:
            with open(snr_path, "r", encoding="utf-8") as fin:
                snr_data = json.load(fin)
        except FileNotFoundError:
            pass
        except Exception as exc:
            LOG.warning(f"Failed to read SNR data from {snr_path}: {exc}")

        if not snr_data:
            try:
                snr_data = requests.get(snr_url, timeout=60).json()
            except requests.exceptions.RequestException as exc:
                LOG.warning(f"Failed to fetch SNR data from {snr_url}: {exc}")
                return
            # also catch json parsing errors
            except json.JSONDecodeError as exc:
                LOG.warning(f"Failed to parse SNR data from {snr_url}: {exc}")
                return

        unfrozen_parameters = _generate_unfrozen_params_yaml(
            snr_data, top_fraction=top_fraction
        )
        cfg["unfrozen_parameters"] = unfrozen_parameters


================================================
FILE: src/axolotl/integrations/spectrum/args.py
================================================
# Copyright 2024 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Module for handling Spectrum input arguments.
"""

from typing import Optional

from pydantic import BaseModel, model_validator


class SpectrumArgs(BaseModel):
    """
    Input args for Spectrum.
    """

    spectrum_top_fraction: Optional[float] = 0.5
    spectrum_model_name: Optional[str] = None

    @model_validator(mode="before")
    @classmethod
    def check_fsdp_use_orig_params(cls, data):
        if (
            data.get("fsdp")
            and data.get("fsdp_config")
            and not data["fsdp_config"].get("use_orig_params")
            and data.get("plugins")
            and any("SpectrumPlugin" in plugin for plugin in data["plugins"])
        ):
            # would otherwise raise
            # ValueError: Must flatten tensors with uniform `requires_grad` when `use_orig_params=False`
            raise ValueError(
                "FSDP + SpectrumPlugin cannot be used together when `use_orig_params=False` is set"
            )
        return data


================================================
FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-1.5B-Instruct.json
================================================
{
    "model.layers.0.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.1.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.2.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.3.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.4.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.5.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.6.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.7.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.8.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.9.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.10.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.11.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.12.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.13.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.14.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.15.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.16.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.17.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.18.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.19.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.20.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.21.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.22.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.23.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.24.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.25.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.26.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.27.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "lm_head": {
        "snr": Infinity,
        "type": "lm_head"
    },
    "model.layers.0.mlp.down_proj": {
        "snr": 70.50235748291016,
        "type": "mlp.down_proj"
    },
    "model.layers.1.mlp.down_proj": {
        "snr": 134.4214630126953,
        "type": "mlp.down_proj"
    },
    "model.layers.2.mlp.down_proj": {
        "snr": 235.74794006347656,
        "type": "mlp.down_proj"
    },
    "model.layers.3.mlp.down_proj": {
        "snr": 73.25755310058594,
        "type": "mlp.down_proj"
    },
    "model.layers.4.mlp.down_proj": {
        "snr": 27.22879981994629,
        "type": "mlp.down_proj"
    },
    "model.layers.5.mlp.down_proj": {
        "snr": 17.5551815032959,
        "type": "mlp.down_proj"
    },
    "model.layers.6.mlp.down_proj": {
        "snr": 54.210426330566406,
        "type": "mlp.down_proj"
    },
    "model.layers.7.mlp.down_proj": {
        "snr": 38.808937072753906,
        "type": "mlp.down_proj"
    },
    "model.layers.8.mlp.down_proj": {
        "snr": 29.799747467041016,
        "type": "mlp.down_proj"
    },
    "model.layers.9.mlp.down_proj": {
        "snr": 10.296355247497559,
        "type": "mlp.down_proj"
    },
    "model.layers.10.mlp.down_proj": {
        "snr": 8.86428165435791,
        "type": "mlp.down_proj"
    },
    "model.layers.11.mlp.down_proj": {
        "snr": 6.43813943862915,
        "type": "mlp.down_proj"
    },
    "model.layers.12.mlp.down_proj": {
        "snr": 7.0912184715271,
        "type": "mlp.down_proj"
    },
    "model.layers.13.mlp.down_proj": {
        "snr": 3.285884141921997,
        "type": "mlp.down_proj"
    },
    "model.layers.14.mlp.down_proj": {
        "snr": 6.073758125305176,
        "type": "mlp.down_proj"
    },
    "model.layers.15.mlp.down_proj": {
        "snr": 5.325990676879883,
        "type": "mlp.down_proj"
    },
    "model.layers.16.mlp.down_proj": {
        "snr": 4.591946601867676,
        "type": "mlp.down_proj"
    },
    "model.layers.17.mlp.down_proj": {
        "snr": 7.021907329559326,
        "type": "mlp.down_proj"
    },
    "model.layers.18.mlp.down_proj": {
        "snr": 6.392782211303711,
        "type": "mlp.down_proj"
    },
    "model.layers.19.mlp.down_proj": {
        "snr": 210.51983642578125,
        "type": "mlp.down_proj"
    },
    "model.layers.20.mlp.down_proj": {
        "snr": 7.1035943031311035,
        "type": "mlp.down_proj"
    },
    "model.layers.21.mlp.down_proj": {
        "snr": 18.701711654663086,
        "type": "mlp.down_proj"
    },
    "model.layers.22.mlp.down_proj": {
        "snr": 14.842622756958008,
        "type": "mlp.down_proj"
    },
    "model.layers.23.mlp.down_proj": {
        "snr": 10.50004768371582,
        "type": "mlp.down_proj"
    },
    "model.layers.24.mlp.down_proj": {
        "snr": 7.225146770477295,
        "type": "mlp.down_proj"
    },
    "model.layers.25.mlp.down_proj": {
        "snr": 7.463952541351318,
        "type": "mlp.down_proj"
    },
    "model.layers.26.mlp.down_proj": {
        "snr": 15.226134300231934,
        "type": "mlp.down_proj"
    },
    "model.layers.27.mlp.down_proj": {
        "snr": 105.4173355102539,
        "type": "mlp.down_proj"
    },
    "model.layers.0.mlp.gate_proj": {
        "snr": 0.5021594166755676,
        "type": "mlp.gate_proj"
    },
    "model.layers.1.mlp.gate_proj": {
        "snr": 34.75935363769531,
        "type": "mlp.gate_proj"
    },
    "model.layers.2.mlp.gate_proj": {
        "snr": 22.855531692504883,
        "type": "mlp.gate_proj"
    },
    "model.layers.3.mlp.gate_proj": {
        "snr": 25.09166717529297,
        "type": "mlp.gate_proj"
    },
    "model.layers.4.mlp.gate_proj": {
        "snr": 28.533172607421875,
        "type": "mlp.gate_proj"
    },
    "model.layers.5.mlp.gate_proj": {
        "snr": 18.625717163085938,
        "type": "mlp.gate_proj"
    },
    "model.layers.6.mlp.gate_proj": {
        "snr": 39.77565383911133,
        "type": "mlp.gate_proj"
    },
    "model.layers.7.mlp.gate_proj": {
        "snr": 24.77678680419922,
        "type": "mlp.gate_proj"
    },
    "model.layers.8.mlp.gate_proj": {
        "snr": 11.854388236999512,
        "type": "mlp.gate_proj"
    },
    "model.layers.9.mlp.gate_proj": {
        "snr": 20.372356414794922,
        "type": "mlp.gate_proj"
    },
    "model.layers.10.mlp.gate_proj": {
        "snr": 14.639552116394043,
        "type": "mlp.gate_proj"
    },
    "model.layers.11.mlp.gate_proj": {
        "snr": 9.82955551147461,
        "type": "mlp.gate_proj"
    },
    "model.layers.12.mlp.gate_proj": {
        "snr": 13.942151069641113,
        "type": "mlp.gate_proj"
    },
    "model.layers.13.mlp.gate_proj": {
        "snr": 12.524999618530273,
        "type": "mlp.gate_proj"
    },
    "model.layers.14.mlp.gate_proj": {
        "snr": 8.19681167602539,
        "type": "mlp.gate_proj"
    },
    "model.layers.15.mlp.gate_proj": {
        "snr": 8.561081886291504,
        "type": "mlp.gate_proj"
    },
    "model.layers.16.mlp.gate_proj": {
        "snr": 6.421900749206543,
        "type": "mlp.gate_proj"
    },
    "model.layers.17.mlp.gate_proj": {
        "snr": 5.568161964416504,
        "type": "mlp.gate_proj"
    },
    "model.layers.18.mlp.gate_proj": {
        "snr": 10.090147972106934,
        "type": "mlp.gate_proj"
    },
    "model.layers.19.mlp.gate_proj": {
        "snr": 5.6181230545043945,
        "type": "mlp.gate_proj"
    },
    "model.layers.20.mlp.gate_proj": {
        "snr": 5.173826694488525,
        "type": "mlp.gate_proj"
    },
    "model.layers.21.mlp.gate_proj": {
        "snr": 5.663441181182861,
        "type": "mlp.gate_proj"
    },
    "model.layers.22.mlp.gate_proj": {
        "snr": 6.824708461761475,
        "type": "mlp.gate_proj"
    },
    "model.layers.23.mlp.gate_proj": {
        "snr": 4.724992275238037,
        "type": "mlp.gate_proj"
    },
    "model.layers.24.mlp.gate_proj": {
        "snr": 6.829834938049316,
        "type": "mlp.gate_proj"
    },
    "model.layers.25.mlp.gate_proj": {
        "snr": 9.968582153320312,
        "type": "mlp.gate_proj"
    },
    "model.layers.26.mlp.gate_proj": {
        "snr": 14.35350513458252,
        "type": "mlp.gate_proj"
    },
    "model.layers.27.mlp.gate_proj": {
        "snr": 20.121768951416016,
        "type": "mlp.gate_proj"
    },
    "model.layers.0.mlp.up_proj": {
        "snr": 1.9020992517471313,
        "type": "mlp.up_proj"
    },
    "model.layers.1.mlp.up_proj": {
        "snr": 46.9393424987793,
        "type": "mlp.up_proj"
    },
    "model.layers.2.mlp.up_proj": {
        "snr": 76.04901123046875,
        "type": "mlp.up_proj"
    },
    "model.layers.3.mlp.up_proj": {
        "snr": 104.08525848388672,
        "type": "mlp.up_proj"
    },
    "model.layers.4.mlp.up_proj": {
        "snr": 77.74343872070312,
        "type": "mlp.up_proj"
    },
    "model.layers.5.mlp.up_proj": {
        "snr": 104.15605926513672,
        "type": "mlp.up_proj"
    },
    "model.layers.6.mlp.up_proj": {
        "snr": 105.16349792480469,
        "type": "mlp.up_proj"
    },
    "model.layers.7.mlp.up_proj": {
        "snr": 78.4150390625,
        "type": "mlp.up_proj"
    },
    "model.layers.8.mlp.up_proj": {
        "snr": 57.51069641113281,
        "type": "mlp.up_proj"
    },
    "model.layers.9.mlp.up_proj": {
        "snr": 50.26409912109375,
        "type": "mlp.up_proj"
    },
    "model.layers.10.mlp.up_proj": {
        "snr": 50.36701965332031,
        "type": "mlp.up_proj"
    },
    "model.layers.11.mlp.up_proj": {
        "snr": 56.66413497924805,
        "type": "mlp.up_proj"
    },
    "model.layers.12.mlp.up_proj": {
        "snr": 62.384559631347656,
        "type": "mlp.up_proj"
    },
    "model.layers.13.mlp.up_proj": {
        "snr": 44.97883987426758,
        "type": "mlp.up_proj"
    },
    "model.layers.14.mlp.up_proj": {
        "snr": 69.7376480102539,
        "type": "mlp.up_proj"
    },
    "model.layers.15.mlp.up_proj": {
        "snr": 35.93111801147461,
        "type": "mlp.up_proj"
    },
    "model.layers.16.mlp.up_proj": {
        "snr": 33.63168716430664,
        "type": "mlp.up_proj"
    },
    "model.layers.17.mlp.up_proj": {
        "snr": 37.695919036865234,
        "type": "mlp.up_proj"
    },
    "model.layers.18.mlp.up_proj": {
        "snr": 43.516517639160156,
        "type": "mlp.up_proj"
    },
    "model.layers.19.mlp.up_proj": {
        "snr": 30.479318618774414,
        "type": "mlp.up_proj"
    },
    "model.layers.20.mlp.up_proj": {
        "snr": 12.495409965515137,
        "type": "mlp.up_proj"
    },
    "model.layers.21.mlp.up_proj": {
        "snr": 19.616689682006836,
        "type": "mlp.up_proj"
    },
    "model.layers.22.mlp.up_proj": {
        "snr": 18.42948341369629,
        "type": "mlp.up_proj"
    },
    "model.layers.23.mlp.up_proj": {
        "snr": 10.799560546875,
        "type": "mlp.up_proj"
    },
    "model.layers.24.mlp.up_proj": {
        "snr": 14.167623519897461,
        "type": "mlp.up_proj"
    },
    "model.layers.25.mlp.up_proj": {
        "snr": 14.938597679138184,
        "type": "mlp.up_proj"
    },
    "model.layers.26.mlp.up_proj": {
        "snr": 8.896568298339844,
        "type": "mlp.up_proj"
    },
    "model.layers.27.mlp.up_proj": {
        "snr": 25.774547576904297,
        "type": "mlp.up_proj"
    },
    "model.embed_tokens": {
        "snr": Infinity,
        "type": "model.embed_tokens"
    },
    "model.norm": {
        "snr": Infinity,
        "type": "model.norm"
    },
    "model.layers.0.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.1.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.2.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.3.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.4.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.5.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.6.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.7.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.8.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.9.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.10.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.11.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.12.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.13.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.14.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.15.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.16.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.17.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.18.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.19.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.20.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.21.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.22.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.23.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.24.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.25.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.26.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.27.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.0.self_attn.k_proj": {
        "snr": 1.8306859731674194,
        "type": "self_attn.k_proj"
    },
    "model.layers.1.self_attn.k_proj": {
        "snr": 0.896544337272644,
        "type": "self_attn.k_proj"
    },
    "model.layers.2.self_attn.k_proj": {
        "snr": 2.345759868621826,
        "type": "self_attn.k_proj"
    },
    "model.layers.3.self_attn.k_proj": {
        "snr": 2.0610744953155518,
        "type": "self_attn.k_proj"
    },
    "model.layers.4.self_attn.k_proj": {
        "snr": 2.3658556938171387,
        "type": "self_attn.k_proj"
    },
    "model.layers.5.self_attn.k_proj": {
        "snr": 1.6586917638778687,
        "type": "self_attn.k_proj"
    },
    "model.layers.6.self_attn.k_proj": {
        "snr": 1.7613047361373901,
        "type": "self_attn.k_proj"
    },
    "model.layers.7.self_attn.k_proj": {
        "snr": 1.325312852859497,
        "type": "self_attn.k_proj"
    },
    "model.layers.8.self_attn.k_proj": {
        "snr": 1.458108901977539,
        "type": "self_attn.k_proj"
    },
    "model.layers.9.self_attn.k_proj": {
        "snr": 1.4319790601730347,
        "type": "self_attn.k_proj"
    },
    "model.layers.10.self_attn.k_proj": {
        "snr": 0.9579543471336365,
        "type": "self_attn.k_proj"
    },
    "model.layers.11.self_attn.k_proj": {
        "snr": 0.8787619471549988,
        "type": "self_attn.k_proj"
    },
    "model.layers.12.self_attn.k_proj": {
        "snr": 1.0447536706924438,
        "type": "self_attn.k_proj"
    },
    "model.layers.13.self_attn.k_proj": {
        "snr": 0.9157310724258423,
        "type": "self_attn.k_proj"
    },
    "model.layers.14.self_attn.k_proj": {
        "snr": 0.7528730630874634,
        "type": "self_attn.k_proj"
    },
    "model.layers.15.self_attn.k_proj": {
        "snr": 0.9293556213378906,
        "type": "self_attn.k_proj"
    },
    "model.layers.16.self_attn.k_proj": {
        "snr": 0.8057093620300293,
        "type": "self_attn.k_proj"
    },
    "model.layers.17.self_attn.k_proj": {
        "snr": 1.2973601818084717,
        "type": "self_attn.k_proj"
    },
    "model.layers.18.self_attn.k_proj": {
        "snr": 1.1357901096343994,
        "type": "self_attn.k_proj"
    },
    "model.layers.19.self_attn.k_proj": {
        "snr": 1.3661632537841797,
        "type": "self_attn.k_proj"
    },
    "model.layers.20.self_attn.k_proj": {
        "snr": 0.8829066753387451,
        "type": "self_attn.k_proj"
    },
    "model.layers.21.self_attn.k_proj": {
        "snr": 0.9105398654937744,
        "type": "self_attn.k_proj"
    },
    "model.layers.22.self_attn.k_proj": {
        "snr": 2.086926221847534,
        "type": "self_attn.k_proj"
    },
    "model.layers.23.self_attn.k_proj": {
        "snr": 1.0393351316452026,
        "type": "self_attn.k_proj"
    },
    "model.layers.24.self_attn.k_proj": {
        "snr": 1.114574670791626,
        "type": "self_attn.k_proj"
    },
    "model.layers.25.self_attn.k_proj": {
        "snr": 2.599745035171509,
        "type": "self_attn.k_proj"
    },
    "model.layers.26.self_attn.k_proj": {
        "snr": 1.1256712675094604,
        "type": "self_attn.k_proj"
    },
    "model.layers.27.self_attn.k_proj": {
        "snr": 1.1784162521362305,
        "type": "self_attn.k_proj"
    },
    "model.layers.0.self_attn.o_proj": {
        "snr": 0.8094121813774109,
        "type": "self_attn.o_proj"
    },
    "model.layers.1.self_attn.o_proj": {
        "snr": 0.22000817954540253,
        "type": "self_attn.o_proj"
    },
    "model.layers.2.self_attn.o_proj": {
        "snr": 0.21972468495368958,
        "type": "self_attn.o_proj"
    },
    "model.layers.3.self_attn.o_proj": {
        "snr": 0.22064059972763062,
        "type": "self_attn.o_proj"
    },
    "model.layers.4.self_attn.o_proj": {
        "snr": 0.22308556735515594,
        "type": "self_attn.o_proj"
    },
    "model.layers.5.self_attn.o_proj": {
        "snr": 0.22396250069141388,
        "type": "self_attn.o_proj"
    },
    "model.layers.6.self_attn.o_proj": {
        "snr": 0.228360116481781,
        "type": "self_attn.o_proj"
    },
    "model.layers.7.self_attn.o_proj": {
        "snr": 0.2306283563375473,
        "type": "self_attn.o_proj"
    },
    "model.layers.8.self_attn.o_proj": {
        "snr": 0.2430228292942047,
        "type": "self_attn.o_proj"
    },
    "model.layers.9.self_attn.o_proj": {
        "snr": 0.2115175724029541,
        "type": "self_attn.o_proj"
    },
    "model.layers.10.self_attn.o_proj": {
        "snr": 0.18226943910121918,
        "type": "self_attn.o_proj"
    },
    "model.layers.11.self_attn.o_proj": {
        "snr": 0.144245907664299,
        "type": "self_attn.o_proj"
    },
    "model.layers.12.self_attn.o_proj": {
        "snr": 0.21965907514095306,
        "type": "self_attn.o_proj"
    },
    "model.layers.13.self_attn.o_proj": {
        "snr": 0.1797526627779007,
        "type": "self_attn.o_proj"
    },
    "model.layers.14.self_attn.o_proj": {
        "snr": 0.26513636112213135,
        "type": "self_attn.o_proj"
    },
    "model.layers.15.self_attn.o_proj": {
        "snr": 0.19463808834552765,
        "type": "self_attn.o_proj"
    },
    "model.layers.16.self_attn.o_proj": {
        "snr": 0.22129350900650024,
        "type": "self_attn.o_proj"
    },
    "model.layers.17.self_attn.o_proj": {
        "snr": 0.22545330226421356,
        "type": "self_attn.o_proj"
    },
    "model.layers.18.self_attn.o_proj": {
        "snr": 0.25302645564079285,
        "type": "self_attn.o_proj"
    },
    "model.layers.19.self_attn.o_proj": {
        "snr": 0.26326504349708557,
        "type": "self_attn.o_proj"
    },
    "model.layers.20.self_attn.o_proj": {
        "snr": 0.15203869342803955,
        "type": "self_attn.o_proj"
    },
    "model.layers.21.self_attn.o_proj": {
        "snr": 0.22418837249279022,
        "type": "self_attn.o_proj"
    },
    "model.layers.22.self_attn.o_proj": {
        "snr": 0.23777326941490173,
        "type": "self_attn.o_proj"
    },
    "model.layers.23.self_attn.o_proj": {
        "snr": 0.18076598644256592,
        "type": "self_attn.o_proj"
    },
    "model.layers.24.self_attn.o_proj": {
        "snr": 0.19919466972351074,
        "type": "self_attn.o_proj"
    },
    "model.layers.25.self_attn.o_proj": {
        "snr": 0.11310968548059464,
        "type": "self_attn.o_proj"
    },
    "model.layers.26.self_attn.o_proj": {
        "snr": 0.08452697843313217,
        "type": "self_attn.o_proj"
    },
    "model.layers.27.self_attn.o_proj": {
        "snr": 0.1029304787516594,
        "type": "self_attn.o_proj"
    },
    "model.layers.0.self_attn.q_proj": {
        "snr": 0.03922705352306366,
        "type": "self_attn.q_proj"
    },
    "model.layers.1.self_attn.q_proj": {
        "snr": 0.1410205066204071,
        "type": "self_attn.q_proj"
    },
    "model.layers.2.self_attn.q_proj": {
        "snr": 0.18240582942962646,
        "type": "self_attn.q_proj"
    },
    "model.layers.3.self_attn.q_proj": {
        "snr": 0.1702580451965332,
        "type": "self_attn.q_proj"
    },
    "model.layers.4.self_attn.q_proj": {
        "snr": 0.19508686661720276,
        "type": "self_attn.q_proj"
    },
    "model.layers.5.self_attn.q_proj": {
        "snr": 0.21549257636070251,
        "type": "self_attn.q_proj"
    },
    "model.layers.6.self_attn.q_proj": {
        "snr": 0.22021502256393433,
        "type": "self_attn.q_proj"
    },
    "model.layers.7.self_attn.q_proj": {
        "snr": 0.2044307142496109,
        "type": "self_attn.q_proj"
    },
    "model.layers.8.self_attn.q_proj": {
        "snr": 0.22745060920715332,
        "type": "self_attn.q_proj"
    },
    "model.layers.9.self_attn.q_proj": {
        "snr": 0.23825915157794952,
        "type": "self_attn.q_proj"
    },
    "model.layers.10.self_attn.q_proj": {
        "snr": 0.2181481122970581,
        "type": "self_attn.q_proj"
    },
    "model.layers.11.self_attn.q_proj": {
        "snr": 0.23490090668201447,
        "type": "self_attn.q_proj"
    },
    "model.layers.12.self_attn.q_proj": {
        "snr": 0.2379382699728012,
        "type": "self_attn.q_proj"
    },
    "model.layers.13.self_attn.q_proj": {
        "snr": 0.19233369827270508,
        "type": "self_attn.q_proj"
    },
    "model.layers.14.self_attn.q_proj": {
        "snr": 0.2587313652038574,
        "type": "self_attn.q_proj"
    },
    "model.layers.15.self_attn.q_proj": {
        "snr": 0.07332809269428253,
        "type": "self_attn.q_proj"
    },
    "model.layers.16.self_attn.q_proj": {
        "snr": 0.22992204129695892,
        "type": "self_attn.q_proj"
    },
    "model.layers.17.self_attn.q_proj": {
        "snr": 0.2537729740142822,
        "type": "self_attn.q_proj"
    },
    "model.layers.18.self_attn.q_proj": {
        "snr": 0.2389948070049286,
        "type": "self_attn.q_proj"
    },
    "model.layers.19.self_attn.q_proj": {
        "snr": 0.20716068148612976,
        "type": "self_attn.q_proj"
    },
    "model.layers.20.self_attn.q_proj": {
        "snr": 0.2575169503688812,
        "type": "self_attn.q_proj"
    },
    "model.layers.21.self_attn.q_proj": {
        "snr": 0.22347678244113922,
        "type": "self_attn.q_proj"
    },
    "model.layers.22.self_attn.q_proj": {
        "snr": 0.18831054866313934,
        "type": "self_attn.q_proj"
    },
    "model.layers.23.self_attn.q_proj": {
        "snr": 0.19853907823562622,
        "type": "self_attn.q_proj"
    },
    "model.layers.24.self_attn.q_proj": {
        "snr": 0.16343259811401367,
        "type": "self_attn.q_proj"
    },
    "model.layers.25.self_attn.q_proj": {
        "snr": 0.1583252102136612,
        "type": "self_attn.q_proj"
    },
    "model.layers.26.self_attn.q_proj": {
        "snr": 0.254446804523468,
        "type": "self_attn.q_proj"
    },
    "model.layers.27.self_attn.q_proj": {
        "snr": 0.23828543722629547,
        "type": "self_attn.q_proj"
    },
    "model.layers.0.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.1.self_attn.v_proj": {
        "snr": 856.5148315429688,
        "type": "self_attn.v_proj"
    },
    "model.layers.2.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.3.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.4.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.5.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.6.self_attn.v_proj": {
        "snr": 48.941104888916016,
        "type": "self_attn.v_proj"
    },
    "model.layers.7.self_attn.v_proj": {
        "snr": 70.25466918945312,
        "type": "self_attn.v_proj"
    },
    "model.layers.8.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.9.self_attn.v_proj": {
        "snr": 370.885986328125,
        "type": "self_attn.v_proj"
    },
    "model.layers.10.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.11.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.12.self_attn.v_proj": {
        "snr": 75.51139831542969,
        "type": "self_attn.v_proj"
    },
    "model.layers.13.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.14.self_attn.v_proj": {
        "snr": 52.004058837890625,
        "type": "self_attn.v_proj"
    },
    "model.layers.15.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.16.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.17.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.18.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.19.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.20.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.21.self_attn.v_proj": {
        "snr": 641.026611328125,
        "type": "self_attn.v_proj"
    },
    "model.layers.22.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.23.self_attn.v_proj": {
        "snr": 323.4858093261719,
        "type": "self_attn.v_proj"
    },
    "model.layers.24.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.25.self_attn.v_proj": {
        "snr": 2.1745388507843018,
        "type": "self_attn.v_proj"
    },
    "model.layers.26.self_attn.v_proj": {
        "snr": 3.0791690349578857,
        "type": "self_attn.v_proj"
    },
    "model.layers.27.self_attn.v_proj": {
        "snr": 2.029968023300171,
        "type": "self_attn.v_proj"
    }
}


================================================
FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-1.5B.json
================================================
{
    "model.layers.0.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.1.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.2.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.3.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.4.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.5.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.6.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.7.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.8.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.9.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.10.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.11.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.12.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.13.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.14.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.15.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.16.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.17.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.18.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.19.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.20.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.21.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.22.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.23.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.24.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.25.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.26.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.27.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "lm_head": {
        "snr": Infinity,
        "type": "lm_head"
    },
    "model.layers.0.mlp.down_proj": {
        "snr": 70.4939193725586,
        "type": "mlp.down_proj"
    },
    "model.layers.1.mlp.down_proj": {
        "snr": 134.2310028076172,
        "type": "mlp.down_proj"
    },
    "model.layers.2.mlp.down_proj": {
        "snr": 235.44140625,
        "type": "mlp.down_proj"
    },
    "model.layers.3.mlp.down_proj": {
        "snr": 73.19381713867188,
        "type": "mlp.down_proj"
    },
    "model.layers.4.mlp.down_proj": {
        "snr": 27.216264724731445,
        "type": "mlp.down_proj"
    },
    "model.layers.5.mlp.down_proj": {
        "snr": 17.544504165649414,
        "type": "mlp.down_proj"
    },
    "model.layers.6.mlp.down_proj": {
        "snr": 54.17462158203125,
        "type": "mlp.down_proj"
    },
    "model.layers.7.mlp.down_proj": {
        "snr": 38.78171920776367,
        "type": "mlp.down_proj"
    },
    "model.layers.8.mlp.down_proj": {
        "snr": 29.777149200439453,
        "type": "mlp.down_proj"
    },
    "model.layers.9.mlp.down_proj": {
        "snr": 10.289377212524414,
        "type": "mlp.down_proj"
    },
    "model.layers.10.mlp.down_proj": {
        "snr": 8.858332633972168,
        "type": "mlp.down_proj"
    },
    "model.layers.11.mlp.down_proj": {
        "snr": 6.433396816253662,
        "type": "mlp.down_proj"
    },
    "model.layers.12.mlp.down_proj": {
        "snr": 7.085702419281006,
        "type": "mlp.down_proj"
    },
    "model.layers.13.mlp.down_proj": {
        "snr": 3.323948383331299,
        "type": "mlp.down_proj"
    },
    "model.layers.14.mlp.down_proj": {
        "snr": 6.204164505004883,
        "type": "mlp.down_proj"
    },
    "model.layers.15.mlp.down_proj": {
        "snr": 5.321533203125,
        "type": "mlp.down_proj"
    },
    "model.layers.16.mlp.down_proj": {
        "snr": 4.588479995727539,
        "type": "mlp.down_proj"
    },
    "model.layers.17.mlp.down_proj": {
        "snr": 7.01450252532959,
        "type": "mlp.down_proj"
    },
    "model.layers.18.mlp.down_proj": {
        "snr": 6.386813163757324,
        "type": "mlp.down_proj"
    },
    "model.layers.19.mlp.down_proj": {
        "snr": 210.38458251953125,
        "type": "mlp.down_proj"
    },
    "model.layers.20.mlp.down_proj": {
        "snr": 7.096683979034424,
        "type": "mlp.down_proj"
    },
    "model.layers.21.mlp.down_proj": {
        "snr": 18.68245506286621,
        "type": "mlp.down_proj"
    },
    "model.layers.22.mlp.down_proj": {
        "snr": 14.824685096740723,
        "type": "mlp.down_proj"
    },
    "model.layers.23.mlp.down_proj": {
        "snr": 10.491303443908691,
        "type": "mlp.down_proj"
    },
    "model.layers.24.mlp.down_proj": {
        "snr": 7.2194437980651855,
        "type": "mlp.down_proj"
    },
    "model.layers.25.mlp.down_proj": {
        "snr": 7.458613872528076,
        "type": "mlp.down_proj"
    },
    "model.layers.26.mlp.down_proj": {
        "snr": 15.222760200500488,
        "type": "mlp.down_proj"
    },
    "model.layers.27.mlp.down_proj": {
        "snr": 105.41569519042969,
        "type": "mlp.down_proj"
    },
    "model.layers.0.mlp.gate_proj": {
        "snr": 0.5017311573028564,
        "type": "mlp.gate_proj"
    },
    "model.layers.1.mlp.gate_proj": {
        "snr": 34.71562576293945,
        "type": "mlp.gate_proj"
    },
    "model.layers.2.mlp.gate_proj": {
        "snr": 22.82915496826172,
        "type": "mlp.gate_proj"
    },
    "model.layers.3.mlp.gate_proj": {
        "snr": 25.0699520111084,
        "type": "mlp.gate_proj"
    },
    "model.layers.4.mlp.gate_proj": {
        "snr": 28.508079528808594,
        "type": "mlp.gate_proj"
    },
    "model.layers.5.mlp.gate_proj": {
        "snr": 18.608009338378906,
        "type": "mlp.gate_proj"
    },
    "model.layers.6.mlp.gate_proj": {
        "snr": 39.732391357421875,
        "type": "mlp.gate_proj"
    },
    "model.layers.7.mlp.gate_proj": {
        "snr": 24.760026931762695,
        "type": "mlp.gate_proj"
    },
    "model.layers.8.mlp.gate_proj": {
        "snr": 11.842738151550293,
        "type": "mlp.gate_proj"
    },
    "model.layers.9.mlp.gate_proj": {
        "snr": 20.35906982421875,
        "type": "mlp.gate_proj"
    },
    "model.layers.10.mlp.gate_proj": {
        "snr": 14.627532958984375,
        "type": "mlp.gate_proj"
    },
    "model.layers.11.mlp.gate_proj": {
        "snr": 9.821962356567383,
        "type": "mlp.gate_proj"
    },
    "model.layers.12.mlp.gate_proj": {
        "snr": 13.930404663085938,
        "type": "mlp.gate_proj"
    },
    "model.layers.13.mlp.gate_proj": {
        "snr": 12.509871482849121,
        "type": "mlp.gate_proj"
    },
    "model.layers.14.mlp.gate_proj": {
        "snr": 8.187695503234863,
        "type": "mlp.gate_proj"
    },
    "model.layers.15.mlp.gate_proj": {
        "snr": 8.553187370300293,
        "type": "mlp.gate_proj"
    },
    "model.layers.16.mlp.gate_proj": {
        "snr": 6.414614200592041,
        "type": "mlp.gate_proj"
    },
    "model.layers.17.mlp.gate_proj": {
        "snr": 5.561778545379639,
        "type": "mlp.gate_proj"
    },
    "model.layers.18.mlp.gate_proj": {
        "snr": 10.078697204589844,
        "type": "mlp.gate_proj"
    },
    "model.layers.19.mlp.gate_proj": {
        "snr": 5.61345100402832,
        "type": "mlp.gate_proj"
    },
    "model.layers.20.mlp.gate_proj": {
        "snr": 5.265484809875488,
        "type": "mlp.gate_proj"
    },
    "model.layers.21.mlp.gate_proj": {
        "snr": 5.659949779510498,
        "type": "mlp.gate_proj"
    },
    "model.layers.22.mlp.gate_proj": {
        "snr": 6.8203511238098145,
        "type": "mlp.gate_proj"
    },
    "model.layers.23.mlp.gate_proj": {
        "snr": 4.721294403076172,
        "type": "mlp.gate_proj"
    },
    "model.layers.24.mlp.gate_proj": {
        "snr": 6.82572603225708,
        "type": "mlp.gate_proj"
    },
    "model.layers.25.mlp.gate_proj": {
        "snr": 9.963521003723145,
        "type": "mlp.gate_proj"
    },
    "model.layers.26.mlp.gate_proj": {
        "snr": 14.342291831970215,
        "type": "mlp.gate_proj"
    },
    "model.layers.27.mlp.gate_proj": {
        "snr": 20.092098236083984,
        "type": "mlp.gate_proj"
    },
    "model.layers.0.mlp.up_proj": {
        "snr": 1.901187777519226,
        "type": "mlp.up_proj"
    },
    "model.layers.1.mlp.up_proj": {
        "snr": 46.9141731262207,
        "type": "mlp.up_proj"
    },
    "model.layers.2.mlp.up_proj": {
        "snr": 76.07878112792969,
        "type": "mlp.up_proj"
    },
    "model.layers.3.mlp.up_proj": {
        "snr": 103.9194564819336,
        "type": "mlp.up_proj"
    },
    "model.layers.4.mlp.up_proj": {
        "snr": 77.62561798095703,
        "type": "mlp.up_proj"
    },
    "model.layers.5.mlp.up_proj": {
        "snr": 104.01624298095703,
        "type": "mlp.up_proj"
    },
    "model.layers.6.mlp.up_proj": {
        "snr": 105.0235366821289,
        "type": "mlp.up_proj"
    },
    "model.layers.7.mlp.up_proj": {
        "snr": 78.33445739746094,
        "type": "mlp.up_proj"
    },
    "model.layers.8.mlp.up_proj": {
        "snr": 57.44070816040039,
        "type": "mlp.up_proj"
    },
    "model.layers.9.mlp.up_proj": {
        "snr": 50.20344924926758,
        "type": "mlp.up_proj"
    },
    "model.layers.10.mlp.up_proj": {
        "snr": 50.32845687866211,
        "type": "mlp.up_proj"
    },
    "model.layers.11.mlp.up_proj": {
        "snr": 56.6197624206543,
        "type": "mlp.up_proj"
    },
    "model.layers.12.mlp.up_proj": {
        "snr": 62.338096618652344,
        "type": "mlp.up_proj"
    },
    "model.layers.13.mlp.up_proj": {
        "snr": 44.92917251586914,
        "type": "mlp.up_proj"
    },
    "model.layers.14.mlp.up_proj": {
        "snr": 69.69624328613281,
        "type": "mlp.up_proj"
    },
    "model.layers.15.mlp.up_proj": {
        "snr": 35.90705108642578,
        "type": "mlp.up_proj"
    },
    "model.layers.16.mlp.up_proj": {
        "snr": 33.610374450683594,
        "type": "mlp.up_proj"
    },
    "model.layers.17.mlp.up_proj": {
        "snr": 37.67365646362305,
        "type": "mlp.up_proj"
    },
    "model.layers.18.mlp.up_proj": {
        "snr": 43.488929748535156,
        "type": "mlp.up_proj"
    },
    "model.layers.19.mlp.up_proj": {
        "snr": 30.451993942260742,
        "type": "mlp.up_proj"
    },
    "model.layers.20.mlp.up_proj": {
        "snr": 12.480182647705078,
        "type": "mlp.up_proj"
    },
    "model.layers.21.mlp.up_proj": {
        "snr": 19.595102310180664,
        "type": "mlp.up_proj"
    },
    "model.layers.22.mlp.up_proj": {
        "snr": 19.067970275878906,
        "type": "mlp.up_proj"
    },
    "model.layers.23.mlp.up_proj": {
        "snr": 10.786394119262695,
        "type": "mlp.up_proj"
    },
    "model.layers.24.mlp.up_proj": {
        "snr": 14.150126457214355,
        "type": "mlp.up_proj"
    },
    "model.layers.25.mlp.up_proj": {
        "snr": 14.927021026611328,
        "type": "mlp.up_proj"
    },
    "model.layers.26.mlp.up_proj": {
        "snr": 8.891448020935059,
        "type": "mlp.up_proj"
    },
    "model.layers.27.mlp.up_proj": {
        "snr": 25.74305534362793,
        "type": "mlp.up_proj"
    },
    "model.embed_tokens": {
        "snr": Infinity,
        "type": "model.embed_tokens"
    },
    "model.norm": {
        "snr": Infinity,
        "type": "model.norm"
    },
    "model.layers.0.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.1.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.2.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.3.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.4.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.5.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.6.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.7.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.8.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.9.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.10.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.11.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.12.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.13.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.14.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.15.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.16.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.17.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.18.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.19.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.20.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.21.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.22.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.23.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.24.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.25.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.26.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.27.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.0.self_attn.k_proj": {
        "snr": 1.7818864583969116,
        "type": "self_attn.k_proj"
    },
    "model.layers.1.self_attn.k_proj": {
        "snr": 0.8955822587013245,
        "type": "self_attn.k_proj"
    },
    "model.layers.2.self_attn.k_proj": {
        "snr": 2.344149351119995,
        "type": "self_attn.k_proj"
    },
    "model.layers.3.self_attn.k_proj": {
        "snr": 2.0597119331359863,
        "type": "self_attn.k_proj"
    },
    "model.layers.4.self_attn.k_proj": {
        "snr": 2.36411714553833,
        "type": "self_attn.k_proj"
    },
    "model.layers.5.self_attn.k_proj": {
        "snr": 1.6570613384246826,
        "type": "self_attn.k_proj"
    },
    "model.layers.6.self_attn.k_proj": {
        "snr": 1.7604507207870483,
        "type": "self_attn.k_proj"
    },
    "model.layers.7.self_attn.k_proj": {
        "snr": 1.3245182037353516,
        "type": "self_attn.k_proj"
    },
    "model.layers.8.self_attn.k_proj": {
        "snr": 1.4567548036575317,
        "type": "self_attn.k_proj"
    },
    "model.layers.9.self_attn.k_proj": {
        "snr": 1.4310829639434814,
        "type": "self_attn.k_proj"
    },
    "model.layers.10.self_attn.k_proj": {
        "snr": 0.95713210105896,
        "type": "self_attn.k_proj"
    },
    "model.layers.11.self_attn.k_proj": {
        "snr": 0.8781776428222656,
        "type": "self_attn.k_proj"
    },
    "model.layers.12.self_attn.k_proj": {
        "snr": 1.0438013076782227,
        "type": "self_attn.k_proj"
    },
    "model.layers.13.self_attn.k_proj": {
        "snr": 0.9315219521522522,
        "type": "self_attn.k_proj"
    },
    "model.layers.14.self_attn.k_proj": {
        "snr": 0.7521569728851318,
        "type": "self_attn.k_proj"
    },
    "model.layers.15.self_attn.k_proj": {
        "snr": 0.9286947250366211,
        "type": "self_attn.k_proj"
    },
    "model.layers.16.self_attn.k_proj": {
        "snr": 0.8047553896903992,
        "type": "self_attn.k_proj"
    },
    "model.layers.17.self_attn.k_proj": {
        "snr": 1.2965552806854248,
        "type": "self_attn.k_proj"
    },
    "model.layers.18.self_attn.k_proj": {
        "snr": 1.134974479675293,
        "type": "self_attn.k_proj"
    },
    "model.layers.19.self_attn.k_proj": {
        "snr": 1.3648872375488281,
        "type": "self_attn.k_proj"
    },
    "model.layers.20.self_attn.k_proj": {
        "snr": 0.8667459487915039,
        "type": "self_attn.k_proj"
    },
    "model.layers.21.self_attn.k_proj": {
        "snr": 0.9100639224052429,
        "type": "self_attn.k_proj"
    },
    "model.layers.22.self_attn.k_proj": {
        "snr": 2.127535820007324,
        "type": "self_attn.k_proj"
    },
    "model.layers.23.self_attn.k_proj": {
        "snr": 1.0382369756698608,
        "type": "self_attn.k_proj"
    },
    "model.layers.24.self_attn.k_proj": {
        "snr": 1.113753318786621,
        "type": "self_attn.k_proj"
    },
    "model.layers.25.self_attn.k_proj": {
        "snr": 2.597890853881836,
        "type": "self_attn.k_proj"
    },
    "model.layers.26.self_attn.k_proj": {
        "snr": 1.1248247623443604,
        "type": "self_attn.k_proj"
    },
    "model.layers.27.self_attn.k_proj": {
        "snr": 1.1984941959381104,
        "type": "self_attn.k_proj"
    },
    "model.layers.0.self_attn.o_proj": {
        "snr": 0.8139898777008057,
        "type": "self_attn.o_proj"
    },
    "model.layers.1.self_attn.o_proj": {
        "snr": 0.21965594589710236,
        "type": "self_attn.o_proj"
    },
    "model.layers.2.self_attn.o_proj": {
        "snr": 0.219479501247406,
        "type": "self_attn.o_proj"
    },
    "model.layers.3.self_attn.o_proj": {
        "snr": 0.22144284844398499,
        "type": "self_attn.o_proj"
    },
    "model.layers.4.self_attn.o_proj": {
        "snr": 0.22390463948249817,
        "type": "self_attn.o_proj"
    },
    "model.layers.5.self_attn.o_proj": {
        "snr": 0.22383669018745422,
        "type": "self_attn.o_proj"
    },
    "model.layers.6.self_attn.o_proj": {
        "snr": 0.22818723320960999,
        "type": "self_attn.o_proj"
    },
    "model.layers.7.self_attn.o_proj": {
        "snr": 0.23134392499923706,
        "type": "self_attn.o_proj"
    },
    "model.layers.8.self_attn.o_proj": {
        "snr": 0.24275101721286774,
        "type": "self_attn.o_proj"
    },
    "model.layers.9.self_attn.o_proj": {
        "snr": 0.21139128506183624,
        "type": "self_attn.o_proj"
    },
    "model.layers.10.self_attn.o_proj": {
        "snr": 0.18210072815418243,
        "type": "self_attn.o_proj"
    },
    "model.layers.11.self_attn.o_proj": {
        "snr": 0.14415481686592102,
        "type": "self_attn.o_proj"
    },
    "model.layers.12.self_attn.o_proj": {
        "snr": 0.21947966516017914,
        "type": "self_attn.o_proj"
    },
    "model.layers.13.self_attn.o_proj": {
        "snr": 0.17875106632709503,
        "type": "self_attn.o_proj"
    },
    "model.layers.14.self_attn.o_proj": {
        "snr": 0.264996200799942,
        "type": "self_attn.o_proj"
    },
    "model.layers.15.self_attn.o_proj": {
        "snr": 0.19353187084197998,
        "type": "self_attn.o_proj"
    },
    "model.layers.16.self_attn.o_proj": {
        "snr": 0.22111012041568756,
        "type": "self_attn.o_proj"
    },
    "model.layers.17.self_attn.o_proj": {
        "snr": 0.2242278754711151,
        "type": "self_attn.o_proj"
    },
    "model.layers.18.self_attn.o_proj": {
        "snr": 0.2527434229850769,
        "type": "self_attn.o_proj"
    },
    "model.layers.19.self_attn.o_proj": {
        "snr": 0.26184532046318054,
        "type": "self_attn.o_proj"
    },
    "model.layers.20.self_attn.o_proj": {
        "snr": 0.1519661247730255,
        "type": "self_attn.o_proj"
    },
    "model.layers.21.self_attn.o_proj": {
        "snr": 0.22386522591114044,
        "type": "self_attn.o_proj"
    },
    "model.layers.22.self_attn.o_proj": {
        "snr": 0.2386160045862198,
        "type": "self_attn.o_proj"
    },
    "model.layers.23.self_attn.o_proj": {
        "snr": 0.18057651817798615,
        "type": "self_attn.o_proj"
    },
    "model.layers.24.self_attn.o_proj": {
        "snr": 0.1989467740058899,
        "type": "self_attn.o_proj"
    },
    "model.layers.25.self_attn.o_proj": {
        "snr": 0.11306505650281906,
        "type": "self_attn.o_proj"
    },
    "model.layers.26.self_attn.o_proj": {
        "snr": 0.08449216932058334,
        "type": "self_attn.o_proj"
    },
    "model.layers.27.self_attn.o_proj": {
        "snr": 0.10287519544363022,
        "type": "self_attn.o_proj"
    },
    "model.layers.0.self_attn.q_proj": {
        "snr": 0.039204664528369904,
        "type": "self_attn.q_proj"
    },
    "model.layers.1.self_attn.q_proj": {
        "snr": 0.14075909554958344,
        "type": "self_attn.q_proj"
    },
    "model.layers.2.self_attn.q_proj": {
        "snr": 0.18212397396564484,
        "type": "self_attn.q_proj"
    },
    "model.layers.3.self_attn.q_proj": {
        "snr": 0.1700422316789627,
        "type": "self_attn.q_proj"
    },
    "model.layers.4.self_attn.q_proj": {
        "snr": 0.1948907971382141,
        "type": "self_attn.q_proj"
    },
    "model.layers.5.self_attn.q_proj": {
        "snr": 0.2153141051530838,
        "type": "self_attn.q_proj"
    },
    "model.layers.6.self_attn.q_proj": {
        "snr": 0.21998055279254913,
        "type": "self_attn.q_proj"
    },
    "model.layers.7.self_attn.q_proj": {
        "snr": 0.20416118204593658,
        "type": "self_attn.q_proj"
    },
    "model.layers.8.self_attn.q_proj": {
        "snr": 0.2272879034280777,
        "type": "self_attn.q_proj"
    },
    "model.layers.9.self_attn.q_proj": {
        "snr": 0.23795834183692932,
        "type": "self_attn.q_proj"
    },
    "model.layers.10.self_attn.q_proj": {
        "snr": 0.21887299418449402,
        "type": "self_attn.q_proj"
    },
    "model.layers.11.self_attn.q_proj": {
        "snr": 0.23469635844230652,
        "type": "self_attn.q_proj"
    },
    "model.layers.12.self_attn.q_proj": {
        "snr": 0.23774078488349915,
        "type": "self_attn.q_proj"
    },
    "model.layers.13.self_attn.q_proj": {
        "snr": 0.1920779049396515,
        "type": "self_attn.q_proj"
    },
    "model.layers.14.self_attn.q_proj": {
        "snr": 0.2584812641143799,
        "type": "self_attn.q_proj"
    },
    "model.layers.15.self_attn.q_proj": {
        "snr": 0.07330238074064255,
        "type": "self_attn.q_proj"
    },
    "model.layers.16.self_attn.q_proj": {
        "snr": 0.23073157668113708,
        "type": "self_attn.q_proj"
    },
    "model.layers.17.self_attn.q_proj": {
        "snr": 0.2523840367794037,
        "type": "self_attn.q_proj"
    },
    "model.layers.18.self_attn.q_proj": {
        "snr": 0.23874858021736145,
        "type": "self_attn.q_proj"
    },
    "model.layers.19.self_attn.q_proj": {
        "snr": 0.20698708295822144,
        "type": "self_attn.q_proj"
    },
    "model.layers.20.self_attn.q_proj": {
        "snr": 0.25723400712013245,
        "type": "self_attn.q_proj"
    },
    "model.layers.21.self_attn.q_proj": {
        "snr": 0.223300039768219,
        "type": "self_attn.q_proj"
    },
    "model.layers.22.self_attn.q_proj": {
        "snr": 0.18824049830436707,
        "type": "self_attn.q_proj"
    },
    "model.layers.23.self_attn.q_proj": {
        "snr": 0.19840741157531738,
        "type": "self_attn.q_proj"
    },
    "model.layers.24.self_attn.q_proj": {
        "snr": 0.16326843202114105,
        "type": "self_attn.q_proj"
    },
    "model.layers.25.self_attn.q_proj": {
        "snr": 0.1581888198852539,
        "type": "self_attn.q_proj"
    },
    "model.layers.26.self_attn.q_proj": {
        "snr": 0.25306230783462524,
        "type": "self_attn.q_proj"
    },
    "model.layers.27.self_attn.q_proj": {
        "snr": 0.23808495700359344,
        "type": "self_attn.q_proj"
    },
    "model.layers.0.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.1.self_attn.v_proj": {
        "snr": 864.8881225585938,
        "type": "self_attn.v_proj"
    },
    "model.layers.2.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.3.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.4.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.5.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.6.self_attn.v_proj": {
        "snr": 48.853694915771484,
        "type": "self_attn.v_proj"
    },
    "model.layers.7.self_attn.v_proj": {
        "snr": 70.18457794189453,
        "type": "self_attn.v_proj"
    },
    "model.layers.8.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.9.self_attn.v_proj": {
        "snr": 371.1153259277344,
        "type": "self_attn.v_proj"
    },
    "model.layers.10.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.11.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.12.self_attn.v_proj": {
        "snr": 75.41203308105469,
        "type": "self_attn.v_proj"
    },
    "model.layers.13.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.14.self_attn.v_proj": {
        "snr": 51.92624282836914,
        "type": "self_attn.v_proj"
    },
    "model.layers.15.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.16.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.17.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.18.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.19.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.20.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.21.self_attn.v_proj": {
        "snr": 642.9313354492188,
        "type": "self_attn.v_proj"
    },
    "model.layers.22.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.23.self_attn.v_proj": {
        "snr": 323.5724182128906,
        "type": "self_attn.v_proj"
    },
    "model.layers.24.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.25.self_attn.v_proj": {
        "snr": 2.1736748218536377,
        "type": "self_attn.v_proj"
    },
    "model.layers.26.self_attn.v_proj": {
        "snr": 3.1729259490966797,
        "type": "self_attn.v_proj"
    },
    "model.layers.27.self_attn.v_proj": {
        "snr": 2.024953842163086,
        "type": "self_attn.v_proj"
    }
}


================================================
FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-3B-Instruct.json
================================================
{
    "model.layers.0.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.1.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.2.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.3.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.4.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.5.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.6.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.7.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.8.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.9.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.10.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.11.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.12.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.13.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.14.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.15.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.16.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.17.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.18.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.19.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.20.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.21.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.22.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.23.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.24.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.25.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.26.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.27.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.28.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.29.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.30.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.31.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.32.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.33.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.34.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.35.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "lm_head": {
        "snr": Infinity,
        "type": "lm_head"
    },
    "model.layers.0.mlp.down_proj": {
        "snr": 20.964319229125977,
        "type": "mlp.down_proj"
    },
    "model.layers.1.mlp.down_proj": {
        "snr": 0.11561352014541626,
        "type": "mlp.down_proj"
    },
    "model.layers.2.mlp.down_proj": {
        "snr": 0.14991413056850433,
        "type": "mlp.down_proj"
    },
    "model.layers.3.mlp.down_proj": {
        "snr": 0.3673713207244873,
        "type": "mlp.down_proj"
    },
    "model.layers.4.mlp.down_proj": {
        "snr": 0.5076134204864502,
        "type": "mlp.down_proj"
    },
    "model.layers.5.mlp.down_proj": {
        "snr": 33.89468002319336,
        "type": "mlp.down_proj"
    },
    "model.layers.6.mlp.down_proj": {
        "snr": 45.08732986450195,
        "type": "mlp.down_proj"
    },
    "model.layers.7.mlp.down_proj": {
        "snr": 33.234222412109375,
        "type": "mlp.down_proj"
    },
    "model.layers.8.mlp.down_proj": {
        "snr": 29.3447322845459,
        "type": "mlp.down_proj"
    },
    "model.layers.9.mlp.down_proj": {
        "snr": 26.664169311523438,
        "type": "mlp.down_proj"
    },
    "model.layers.10.mlp.down_proj": {
        "snr": 22.323949813842773,
        "type": "mlp.down_proj"
    },
    "model.layers.11.mlp.down_proj": {
        "snr": 18.259737014770508,
        "type": "mlp.down_proj"
    },
    "model.layers.12.mlp.down_proj": {
        "snr": 14.422037124633789,
        "type": "mlp.down_proj"
    },
    "model.layers.13.mlp.down_proj": {
        "snr": 22.172054290771484,
        "type": "mlp.down_proj"
    },
    "model.layers.14.mlp.down_proj": {
        "snr": 27.363698959350586,
        "type": "mlp.down_proj"
    },
    "model.layers.15.mlp.down_proj": {
        "snr": 28.474334716796875,
        "type": "mlp.down_proj"
    },
    "model.layers.16.mlp.down_proj": {
        "snr": 10.4143648147583,
        "type": "mlp.down_proj"
    },
    "model.layers.17.mlp.down_proj": {
        "snr": 10.719133377075195,
        "type": "mlp.down_proj"
    },
    "model.layers.18.mlp.down_proj": {
        "snr": 8.6494722366333,
        "type": "mlp.down_proj"
    },
    "model.layers.19.mlp.down_proj": {
        "snr": 5.69321870803833,
        "type": "mlp.down_proj"
    },
    "model.layers.20.mlp.down_proj": {
        "snr": 23.889677047729492,
        "type": "mlp.down_proj"
    },
    "model.layers.21.mlp.down_proj": {
        "snr": 11.59121036529541,
        "type": "mlp.down_proj"
    },
    "model.layers.22.mlp.down_proj": {
        "snr": 5.997435569763184,
        "type": "mlp.down_proj"
    },
    "model.layers.23.mlp.down_proj": {
        "snr": 19.415578842163086,
        "type": "mlp.down_proj"
    },
    "model.layers.24.mlp.down_proj": {
        "snr": 8.241704940795898,
        "type": "mlp.down_proj"
    },
    "model.layers.25.mlp.down_proj": {
        "snr": 12.993823051452637,
        "type": "mlp.down_proj"
    },
    "model.layers.26.mlp.down_proj": {
        "snr": 36.26508712768555,
        "type": "mlp.down_proj"
    },
    "model.layers.27.mlp.down_proj": {
        "snr": 19.957971572875977,
        "type": "mlp.down_proj"
    },
    "model.layers.28.mlp.down_proj": {
        "snr": 6.067765235900879,
        "type": "mlp.down_proj"
    },
    "model.layers.29.mlp.down_proj": {
        "snr": 5.369481086730957,
        "type": "mlp.down_proj"
    },
    "model.layers.30.mlp.down_proj": {
        "snr": 7.358774662017822,
        "type": "mlp.down_proj"
    },
    "model.layers.31.mlp.down_proj": {
        "snr": 7.8687238693237305,
        "type": "mlp.down_proj"
    },
    "model.layers.32.mlp.down_proj": {
        "snr": 8.713484764099121,
        "type": "mlp.down_proj"
    },
    "model.layers.33.mlp.down_proj": {
        "snr": 21.233531951904297,
        "type": "mlp.down_proj"
    },
    "model.layers.34.mlp.down_proj": {
        "snr": 32.37357711791992,
        "type": "mlp.down_proj"
    },
    "model.layers.35.mlp.down_proj": {
        "snr": 179.8053741455078,
        "type": "mlp.down_proj"
    },
    "model.layers.0.mlp.gate_proj": {
        "snr": 0.24989914894104004,
        "type": "mlp.gate_proj"
    },
    "model.layers.1.mlp.gate_proj": {
        "snr": 0.11613649874925613,
        "type": "mlp.gate_proj"
    },
    "model.layers.2.mlp.gate_proj": {
        "snr": 0.16354432702064514,
        "type": "mlp.gate_proj"
    },
    "model.layers.3.mlp.gate_proj": {
        "snr": 0.36216047406196594,
        "type": "mlp.gate_proj"
    },
    "model.layers.4.mlp.gate_proj": {
        "snr": 0.3485107719898224,
        "type": "mlp.gate_proj"
    },
    "model.layers.5.mlp.gate_proj": {
        "snr": 2.6546616554260254,
        "type": "mlp.gate_proj"
    },
    "model.layers.6.mlp.gate_proj": {
        "snr": 8.362885475158691,
        "type": "mlp.gate_proj"
    },
    "model.layers.7.mlp.gate_proj": {
        "snr": 7.38665246963501,
        "type": "mlp.gate_proj"
    },
    "model.layers.8.mlp.gate_proj": {
        "snr": 13.016111373901367,
        "type": "mlp.gate_proj"
    },
    "model.layers.9.mlp.gate_proj": {
        "snr": 14.94902515411377,
        "type": "mlp.gate_proj"
    },
    "model.layers.10.mlp.gate_proj": {
        "snr": 20.92418670654297,
        "type": "mlp.gate_proj"
    },
    "model.layers.11.mlp.gate_proj": {
        "snr": 15.954015731811523,
        "type": "mlp.gate_proj"
    },
    "model.layers.12.mlp.gate_proj": {
        "snr": 8.980009078979492,
        "type": "mlp.gate_proj"
    },
    "model.layers.13.mlp.gate_proj": {
        "snr": 17.59958267211914,
        "type": "mlp.gate_proj"
    },
    "model.layers.14.mlp.gate_proj": {
        "snr": 17.23070526123047,
        "type": "mlp.gate_proj"
    },
    "model.layers.15.mlp.gate_proj": {
        "snr": 23.725330352783203,
        "type": "mlp.gate_proj"
    },
    "model.layers.16.mlp.gate_proj": {
        "snr": 17.000444412231445,
        "type": "mlp.gate_proj"
    },
    "model.layers.17.mlp.gate_proj": {
        "snr": 18.293012619018555,
        "type": "mlp.gate_proj"
    },
    "model.layers.18.mlp.gate_proj": {
        "snr": 12.644190788269043,
        "type": "mlp.gate_proj"
    },
    "model.layers.19.mlp.gate_proj": {
        "snr": 16.278690338134766,
        "type": "mlp.gate_proj"
    },
    "model.layers.20.mlp.gate_proj": {
        "snr": 7.407368183135986,
        "type": "mlp.gate_proj"
    },
    "model.layers.21.mlp.gate_proj": {
        "snr": 6.109912395477295,
        "type": "mlp.gate_proj"
    },
    "model.layers.22.mlp.gate_proj": {
        "snr": 5.3692426681518555,
        "type": "mlp.gate_proj"
    },
    "model.layers.23.mlp.gate_proj": {
        "snr": 9.354235649108887,
        "type": "mlp.gate_proj"
    },
    "model.layers.24.mlp.gate_proj": {
        "snr": 7.655010223388672,
        "type": "mlp.gate_proj"
    },
    "model.layers.25.mlp.gate_proj": {
        "snr": 6.252986431121826,
        "type": "mlp.gate_proj"
    },
    "model.layers.26.mlp.gate_proj": {
        "snr": 14.26718521118164,
        "type": "mlp.gate_proj"
    },
    "model.layers.27.mlp.gate_proj": {
        "snr": 7.705836772918701,
        "type": "mlp.gate_proj"
    },
    "model.layers.28.mlp.gate_proj": {
        "snr": 5.998677730560303,
        "type": "mlp.gate_proj"
    },
    "model.layers.29.mlp.gate_proj": {
        "snr": 6.044872760772705,
        "type": "mlp.gate_proj"
    },
    "model.layers.30.mlp.gate_proj": {
        "snr": 9.027137756347656,
        "type": "mlp.gate_proj"
    },
    "model.layers.31.mlp.gate_proj": {
        "snr": 5.449969291687012,
        "type": "mlp.gate_proj"
    },
    "model.layers.32.mlp.gate_proj": {
        "snr": 4.206825256347656,
        "type": "mlp.gate_proj"
    },
    "model.layers.33.mlp.gate_proj": {
        "snr": 5.22825288772583,
        "type": "mlp.gate_proj"
    },
    "model.layers.34.mlp.gate_proj": {
        "snr": 43.71927261352539,
        "type": "mlp.gate_proj"
    },
    "model.layers.35.mlp.gate_proj": {
        "snr": 45.37385177612305,
        "type": "mlp.gate_proj"
    },
    "model.layers.0.mlp.up_proj": {
        "snr": 0.7069714665412903,
        "type": "mlp.up_proj"
    },
    "model.layers.1.mlp.up_proj": {
        "snr": 0.17766596376895905,
        "type": "mlp.up_proj"
    },
    "model.layers.2.mlp.up_proj": {
        "snr": 0.28577035665512085,
        "type": "mlp.up_proj"
    },
    "model.layers.3.mlp.up_proj": {
        "snr": 0.6763099431991577,
        "type": "mlp.up_proj"
    },
    "model.layers.4.mlp.up_proj": {
        "snr": 0.8340913653373718,
        "type": "mlp.up_proj"
    },
    "model.layers.5.mlp.up_proj": {
        "snr": 3.946547031402588,
        "type": "mlp.up_proj"
    },
    "model.layers.6.mlp.up_proj": {
        "snr": 19.56715202331543,
        "type": "mlp.up_proj"
    },
    "model.layers.7.mlp.up_proj": {
        "snr": 36.21149826049805,
        "type": "mlp.up_proj"
    },
    "model.layers.8.mlp.up_proj": {
        "snr": 44.28759002685547,
        "type": "mlp.up_proj"
    },
    "model.layers.9.mlp.up_proj": {
        "snr": 45.47198486328125,
        "type": "mlp.up_proj"
    },
    "model.layers.10.mlp.up_proj": {
        "snr": 79.00128936767578,
        "type": "mlp.up_proj"
    },
    "model.layers.11.mlp.up_proj": {
        "snr": 52.28038787841797,
        "type": "mlp.up_proj"
    },
    "model.layers.12.mlp.up_proj": {
        "snr": 48.08102035522461,
        "type": "mlp.up_proj"
    },
    "model.layers.13.mlp.up_proj": {
        "snr": 56.071285247802734,
        "type": "mlp.up_proj"
    },
    "model.layers.14.mlp.up_proj": {
        "snr": 72.24358367919922,
        "type": "mlp.up_proj"
    },
    "model.layers.15.mlp.up_proj": {
        "snr": 54.818233489990234,
        "type": "mlp.up_proj"
    },
    "model.layers.16.mlp.up_proj": {
        "snr": 47.251495361328125,
        "type": "mlp.up_proj"
    },
    "model.layers.17.mlp.up_proj": {
        "snr": 51.585636138916016,
        "type": "mlp.up_proj"
    },
    "model.layers.18.mlp.up_proj": {
        "snr": 43.47938919067383,
        "type": "mlp.up_proj"
    },
    "model.layers.19.mlp.up_proj": {
        "snr": 38.132469177246094,
        "type": "mlp.up_proj"
    },
    "model.layers.20.mlp.up_proj": {
        "snr": 21.78435707092285,
        "type": "mlp.up_proj"
    },
    "model.layers.21.mlp.up_proj": {
        "snr": 22.261096954345703,
        "type": "mlp.up_proj"
    },
    "model.layers.22.mlp.up_proj": {
        "snr": 30.751861572265625,
        "type": "mlp.up_proj"
    },
    "model.layers.23.mlp.up_proj": {
        "snr": 28.61063575744629,
        "type": "mlp.up_proj"
    },
    "model.layers.24.mlp.up_proj": {
        "snr": 20.21415901184082,
        "type": "mlp.up_proj"
    },
    "model.layers.25.mlp.up_proj": {
        "snr": 20.759052276611328,
        "type": "mlp.up_proj"
    },
    "model.layers.26.mlp.up_proj": {
        "snr": 33.80818557739258,
        "type": "mlp.up_proj"
    },
    "model.layers.27.mlp.up_proj": {
        "snr": 17.274362564086914,
        "type": "mlp.up_proj"
    },
    "model.layers.28.mlp.up_proj": {
        "snr": 13.943653106689453,
        "type": "mlp.up_proj"
    },
    "model.layers.29.mlp.up_proj": {
        "snr": 16.202186584472656,
        "type": "mlp.up_proj"
    },
    "model.layers.30.mlp.up_proj": {
        "snr": 24.25114631652832,
        "type": "mlp.up_proj"
    },
    "model.layers.31.mlp.up_proj": {
        "snr": 10.68645191192627,
        "type": "mlp.up_proj"
    },
    "model.layers.32.mlp.up_proj": {
        "snr": 5.7449774742126465,
        "type": "mlp.up_proj"
    },
    "model.layers.33.mlp.up_proj": {
        "snr": 11.879876136779785,
        "type": "mlp.up_proj"
    },
    "model.layers.34.mlp.up_proj": {
        "snr": 25.948715209960938,
        "type": "mlp.up_proj"
    },
    "model.layers.35.mlp.up_proj": {
        "snr": 38.63526153564453,
        "type": "mlp.up_proj"
    },
    "model.embed_tokens": {
        "snr": Infinity,
        "type": "model.embed_tokens"
    },
    "model.norm": {
        "snr": Infinity,
        "type": "model.norm"
    },
    "model.layers.0.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.1.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.2.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.3.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.4.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.5.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.6.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.7.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.8.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.9.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.10.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.11.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.12.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.13.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.14.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.15.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.16.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.17.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.18.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.19.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.20.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.21.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.22.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.23.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.24.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.25.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.26.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.27.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.28.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.29.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.30.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.31.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.32.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.33.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.34.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.35.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.0.self_attn.k_proj": {
        "snr": 12.243099212646484,
        "type": "self_attn.k_proj"
    },
    "model.layers.1.self_attn.k_proj": {
        "snr": 0.6446183323860168,
        "type": "self_attn.k_proj"
    },
    "model.layers.2.self_attn.k_proj": {
        "snr": 0.7159711718559265,
        "type": "self_attn.k_proj"
    },
    "model.layers.3.self_attn.k_proj": {
        "snr": 5.5100932121276855,
        "type": "self_attn.k_proj"
    },
    "model.layers.4.self_attn.k_proj": {
        "snr": 3.0802414417266846,
        "type": "self_attn.k_proj"
    },
    "model.layers.5.self_attn.k_proj": {
        "snr": 1.0472767353057861,
        "type": "self_attn.k_proj"
    },
    "model.layers.6.self_attn.k_proj": {
        "snr": 3.576918601989746,
        "type": "self_attn.k_proj"
    },
    "model.layers.7.self_attn.k_proj": {
        "snr": 3.3793225288391113,
        "type": "self_attn.k_proj"
    },
    "model.layers.8.self_attn.k_proj": {
        "snr": 2.9598212242126465,
        "type": "self_attn.k_proj"
    },
    "model.layers.9.self_attn.k_proj": {
        "snr": 6.102792263031006,
        "type": "self_attn.k_proj"
    },
    "model.layers.10.self_attn.k_proj": {
        "snr": 2.231630325317383,
        "type": "self_attn.k_proj"
    },
    "model.layers.11.self_attn.k_proj": {
        "snr": 2.176372766494751,
        "type": "self_attn.k_proj"
    },
    "model.layers.12.self_attn.k_proj": {
        "snr": 1.3229435682296753,
        "type": "self_attn.k_proj"
    },
    "model.layers.13.self_attn.k_proj": {
        "snr": 2.6183862686157227,
        "type": "self_attn.k_proj"
    },
    "model.layers.14.self_attn.k_proj": {
        "snr": 2.608288526535034,
        "type": "self_attn.k_proj"
    },
    "model.layers.15.self_attn.k_proj": {
        "snr": 1.5090984106063843,
        "type": "self_attn.k_proj"
    },
    "model.layers.16.self_attn.k_proj": {
        "snr": 1.284422516822815,
        "type": "self_attn.k_proj"
    },
    "model.layers.17.self_attn.k_proj": {
        "snr": 0.8903945088386536,
        "type": "self_attn.k_proj"
    },
    "model.layers.18.self_attn.k_proj": {
        "snr": 1.8880385160446167,
        "type": "self_attn.k_proj"
    },
    "model.layers.19.self_attn.k_proj": {
        "snr": 0.8905735015869141,
        "type": "self_attn.k_proj"
    },
    "model.layers.20.self_attn.k_proj": {
        "snr": 0.9060881733894348,
        "type": "self_attn.k_proj"
    },
    "model.layers.21.self_attn.k_proj": {
        "snr": 0.7572551965713501,
        "type": "self_attn.k_proj"
    },
    "model.layers.22.self_attn.k_proj": {
        "snr": 0.940827488899231,
        "type": "self_attn.k_proj"
    },
    "model.layers.23.self_attn.k_proj": {
        "snr": 3.7776191234588623,
        "type": "self_attn.k_proj"
    },
    "model.layers.24.self_attn.k_proj": {
        "snr": 1.328923225402832,
        "type": "self_attn.k_proj"
    },
    "model.layers.25.self_attn.k_proj": {
        "snr": 1.3986345529556274,
        "type": "self_attn.k_proj"
    },
    "model.layers.26.self_attn.k_proj": {
        "snr": 1.2436336278915405,
        "type": "self_attn.k_proj"
    },
    "model.layers.27.self_attn.k_proj": {
        "snr": 0.7737217545509338,
        "type": "self_attn.k_proj"
    },
    "model.layers.28.self_attn.k_proj": {
        "snr": 2.6027626991271973,
        "type": "self_attn.k_proj"
    },
    "model.layers.29.self_attn.k_proj": {
        "snr": 2.2332751750946045,
        "type": "self_attn.k_proj"
    },
    "model.layers.30.self_attn.k_proj": {
        "snr": 2.476585626602173,
        "type": "self_attn.k_proj"
    },
    "model.layers.31.self_attn.k_proj": {
        "snr": 1.1115432977676392,
        "type": "self_attn.k_proj"
    },
    "model.layers.32.self_attn.k_proj": {
        "snr": 0.8251476287841797,
        "type": "self_attn.k_proj"
    },
    "model.layers.33.self_attn.k_proj": {
        "snr": 0.9331105947494507,
        "type": "self_attn.k_proj"
    },
    "model.layers.34.self_attn.k_proj": {
        "snr": 6.602395534515381,
        "type": "self_attn.k_proj"
    },
    "model.layers.35.self_attn.k_proj": {
        "snr": 10.151693344116211,
        "type": "self_attn.k_proj"
    },
    "model.layers.0.self_attn.o_proj": {
        "snr": 0.3661542534828186,
        "type": "self_attn.o_proj"
    },
    "model.layers.1.self_attn.o_proj": {
        "snr": 0.19571374356746674,
        "type": "self_attn.o_proj"
    },
    "model.layers.2.self_attn.o_proj": {
        "snr": 0.2244851142168045,
        "type": "self_attn.o_proj"
    },
    "model.layers.3.self_attn.o_proj": {
        "snr": 0.2593664526939392,
        "type": "self_attn.o_proj"
    },
    "model.layers.4.self_attn.o_proj": {
        "snr": 0.2569783926010132,
        "type": "self_attn.o_proj"
    },
    "model.layers.5.self_attn.o_proj": {
        "snr": 0.2564302980899811,
        "type": "self_attn.o_proj"
    },
    "model.layers.6.self_attn.o_proj": {
        "snr": 0.18539844453334808,
        "type": "self_attn.o_proj"
    },
    "model.layers.7.self_attn.o_proj": {
        "snr": 0.2328651398420334,
        "type": "self_attn.o_proj"
    },
    "model.layers.8.self_attn.o_proj": {
        "snr": 0.22055882215499878,
        "type": "self_attn.o_proj"
    },
    "model.layers.9.self_attn.o_proj": {
        "snr": 0.21800543367862701,
        "type": "self_attn.o_proj"
    },
    "model.layers.10.self_attn.o_proj": {
        "snr": 0.22867777943611145,
        "type": "self_attn.o_proj"
    },
    "model.layers.11.self_attn.o_proj": {
        "snr": 0.23986175656318665,
        "type": "self_attn.o_proj"
    },
    "model.layers.12.self_attn.o_proj": {
        "snr": 0.17598563432693481,
        "type": "self_attn.o_proj"
    },
    "model.layers.13.self_attn.o_proj": {
        "snr": 0.20469218492507935,
        "type": "self_attn.o_proj"
    },
    "model.layers.14.self_attn.o_proj": {
        "snr": 0.21040217578411102,
        "type": "self_attn.o_proj"
    },
    "model.layers.15.self_attn.o_proj": {
        "snr": 0.23787625133991241,
        "type": "self_attn.o_proj"
    },
    "model.layers.16.self_attn.o_proj": {
        "snr": 0.16339677572250366,
        "type": "self_attn.o_proj"
    },
    "model.layers.17.self_attn.o_proj": {
        "snr": 0.2070712298154831,
        "type": "self_attn.o_proj"
    },
    "model.layers.18.self_attn.o_proj": {
        "snr": 0.1826934814453125,
        "type": "self_attn.o_proj"
    },
    "model.layers.19.self_attn.o_proj": {
        "snr": 0.19459959864616394,
        "type": "self_attn.o_proj"
    },
    "model.layers.20.self_attn.o_proj": {
        "snr": 0.2668156027793884,
        "type": "self_attn.o_proj"
    },
    "model.layers.21.self_attn.o_proj": {
        "snr": 0.16906610131263733,
        "type": "self_attn.o_proj"
    },
    "model.layers.22.self_attn.o_proj": {
        "snr": 0.18790249526500702,
        "type": "self_attn.o_proj"
    },
    "model.layers.23.self_attn.o_proj": {
        "snr": 0.18883933126926422,
        "type": "self_attn.o_proj"
    },
    "model.layers.24.self_attn.o_proj": {
        "snr": 0.1793188899755478,
        "type": "self_attn.o_proj"
    },
    "model.layers.25.self_attn.o_proj": {
        "snr": 0.1800570785999298,
        "type": "self_attn.o_proj"
    },
    "model.layers.26.self_attn.o_proj": {
        "snr": 0.17790433764457703,
        "type": "self_attn.o_proj"
    },
    "model.layers.27.self_attn.o_proj": {
        "snr": 0.2029498964548111,
        "type": "self_attn.o_proj"
    },
    "model.layers.28.self_attn.o_proj": {
        "snr": 0.17044201493263245,
        "type": "self_attn.o_proj"
    },
    "model.layers.29.self_attn.o_proj": {
        "snr": 0.19938386976718903,
        "type": "self_attn.o_proj"
    },
    "model.layers.30.self_attn.o_proj": {
        "snr": 0.23108959197998047,
        "type": "self_attn.o_proj"
    },
    "model.layers.31.self_attn.o_proj": {
        "snr": 0.16427059471607208,
        "type": "self_attn.o_proj"
    },
    "model.layers.32.self_attn.o_proj": {
        "snr": 0.10631092637777328,
        "type": "self_attn.o_proj"
    },
    "model.layers.33.self_attn.o_proj": {
        "snr": 0.09417019784450531,
        "type": "self_attn.o_proj"
    },
    "model.layers.34.self_attn.o_proj": {
        "snr": 0.1324978619813919,
        "type": "self_attn.o_proj"
    },
    "model.layers.35.self_attn.o_proj": {
        "snr": 0.11784011125564575,
        "type": "self_attn.o_proj"
    },
    "model.layers.0.self_attn.q_proj": {
        "snr": 0.05565479397773743,
        "type": "self_attn.q_proj"
    },
    "model.layers.1.self_attn.q_proj": {
        "snr": 0.138458251953125,
        "type": "self_attn.q_proj"
    },
    "model.layers.2.self_attn.q_proj": {
        "snr": 0.12992437183856964,
        "type": "self_attn.q_proj"
    },
    "model.layers.3.self_attn.q_proj": {
        "snr": 0.15362468361854553,
        "type": "self_attn.q_proj"
    },
    "model.layers.4.self_attn.q_proj": {
        "snr": 0.1563446819782257,
        "type": "self_attn.q_proj"
    },
    "model.layers.5.self_attn.q_proj": {
        "snr": 0.15544593334197998,
        "type": "self_attn.q_proj"
    },
    "model.layers.6.self_attn.q_proj": {
        "snr": 0.15956827998161316,
        "type": "self_attn.q_proj"
    },
    "model.layers.7.self_attn.q_proj": {
        "snr": 0.17549948394298553,
        "type": "self_attn.q_proj"
    },
    "model.layers.8.self_attn.q_proj": {
        "snr": 0.16668449342250824,
        "type": "self_attn.q_proj"
    },
    "model.layers.9.self_attn.q_proj": {
        "snr": 0.15626586973667145,
        "type": "self_attn.q_proj"
    },
    "model.layers.10.self_attn.q_proj": {
        "snr": 0.18318884074687958,
        "type": "self_attn.q_proj"
    },
    "model.layers.11.self_attn.q_proj": {
        "snr": 0.171547532081604,
        "type": "self_attn.q_proj"
    },
    "model.layers.12.self_attn.q_proj": {
        "snr": 0.18164905905723572,
        "type": "self_attn.q_proj"
    },
    "model.layers.13.self_attn.q_proj": {
        "snr": 0.2091975212097168,
        "type": "self_attn.q_proj"
    },
    "model.layers.14.self_attn.q_proj": {
        "snr": 0.17431670427322388,
        "type": "self_attn.q_proj"
    },
    "model.layers.15.self_attn.q_proj": {
        "snr": 0.20902502536773682,
        "type": "self_attn.q_proj"
    },
    "model.layers.16.self_attn.q_proj": {
        "snr": 0.15439842641353607,
        "type": "self_attn.q_proj"
    },
    "model.layers.17.self_attn.q_proj": {
        "snr": 0.1945274919271469,
        "type": "self_attn.q_proj"
    },
    "model.layers.18.self_attn.q_proj": {
        "snr": 0.18916545808315277,
        "type": "self_attn.q_proj"
    },
    "model.layers.19.self_attn.q_proj": {
        "snr": 0.20778712630271912,
        "type": "self_attn.q_proj"
    },
    "model.layers.20.self_attn.q_proj": {
        "snr": 0.20866931974887848,
        "type": "self_attn.q_proj"
    },
    "model.layers.21.self_attn.q_proj": {
        "snr": 0.1900305300951004,
        "type": "self_attn.q_proj"
    },
    "model.layers.22.self_attn.q_proj": {
        "snr": 0.18200653791427612,
        "type": "self_attn.q_proj"
    },
    "model.layers.23.self_attn.q_proj": {
        "snr": 0.2070988416671753,
        "type": "self_attn.q_proj"
    },
    "model.layers.24.self_attn.q_proj": {
        "snr": 0.1845332235097885,
        "type": "self_attn.q_proj"
    },
    "model.layers.25.self_attn.q_proj": {
        "snr": 0.20868781208992004,
        "type": "self_attn.q_proj"
    },
    "model.layers.26.self_attn.q_proj": {
        "snr": 0.19242744147777557,
        "type": "self_attn.q_proj"
    },
    "model.layers.27.self_attn.q_proj": {
        "snr": 0.15225112438201904,
        "type": "self_attn.q_proj"
    },
    "model.layers.28.self_attn.q_proj": {
        "snr": 0.20065009593963623,
        "type": "self_attn.q_proj"
    },
    "model.layers.29.self_attn.q_proj": {
        "snr": 0.19390477240085602,
        "type": "self_attn.q_proj"
    },
    "model.layers.30.self_attn.q_proj": {
        "snr": 0.18538697063922882,
        "type": "self_attn.q_proj"
    },
    "model.layers.31.self_attn.q_proj": {
        "snr": 0.18954339623451233,
        "type": "self_attn.q_proj"
    },
    "model.layers.32.self_attn.q_proj": {
        "snr": 0.20089596509933472,
        "type": "self_attn.q_proj"
    },
    "model.layers.33.self_attn.q_proj": {
        "snr": 0.19814996421337128,
        "type": "self_attn.q_proj"
    },
    "model.layers.34.self_attn.q_proj": {
        "snr": 0.17733213305473328,
        "type": "self_attn.q_proj"
    },
    "model.layers.35.self_attn.q_proj": {
        "snr": 0.14075976610183716,
        "type": "self_attn.q_proj"
    },
    "model.layers.0.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.1.self_attn.v_proj": {
        "snr": 845.8053588867188,
        "type": "self_attn.v_proj"
    },
    "model.layers.2.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.3.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.4.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.5.self_attn.v_proj": {
        "snr": 83.97241973876953,
        "type": "self_attn.v_proj"
    },
    "model.layers.6.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.7.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.8.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.9.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.10.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.11.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.12.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.13.self_attn.v_proj": {
        "snr": 213.70960998535156,
        "type": "self_attn.v_proj"
    },
    "model.layers.14.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.15.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.16.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.17.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.18.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.19.self_attn.v_proj": {
        "snr": 18.950267791748047,
        "type": "self_attn.v_proj"
    },
    "model.layers.20.self_attn.v_proj": {
        "snr": 435.8339538574219,
        "type": "self_attn.v_proj"
    },
    "model.layers.21.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.22.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.23.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.24.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.25.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.26.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.27.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.28.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.29.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.30.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.31.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.32.self_attn.v_proj": {
        "snr": 1.2341279983520508,
        "type": "self_attn.v_proj"
    },
    "model.layers.33.self_attn.v_proj": {
        "snr": 0.6158654689788818,
        "type": "self_attn.v_proj"
    },
    "model.layers.34.self_attn.v_proj": {
        "snr": 509.3221130371094,
        "type": "self_attn.v_proj"
    },
    "model.layers.35.self_attn.v_proj": {
        "snr": 538.6658325195312,
        "type": "self_attn.v_proj"
    }
}


================================================
FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-3B.json
================================================
{
    "model.layers.0.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.1.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.2.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.3.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.4.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.5.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.6.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.7.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.8.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.9.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.10.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.11.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.12.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.13.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.14.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.15.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.16.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.17.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.18.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.19.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.20.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.21.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.22.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.23.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.24.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.25.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.26.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.27.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.28.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.29.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.30.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.31.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.32.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.33.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.34.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.35.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "lm_head": {
        "snr": Infinity,
        "type": "lm_head"
    },
    "model.layers.0.mlp.down_proj": {
        "snr": 20.942785263061523,
        "type": "mlp.down_proj"
    },
    "model.layers.1.mlp.down_proj": {
        "snr": 0.11550866067409515,
        "type": "mlp.down_proj"
    },
    "model.layers.2.mlp.down_proj": {
        "snr": 0.14981402456760406,
        "type": "mlp.down_proj"
    },
    "model.layers.3.mlp.down_proj": {
        "snr": 0.36719316244125366,
        "type": "mlp.down_proj"
    },
    "model.layers.4.mlp.down_proj": {
        "snr": 0.5072987079620361,
        "type": "mlp.down_proj"
    },
    "model.layers.5.mlp.down_proj": {
        "snr": 33.86688232421875,
        "type": "mlp.down_proj"
    },
    "model.layers.6.mlp.down_proj": {
        "snr": 45.066246032714844,
        "type": "mlp.down_proj"
    },
    "model.layers.7.mlp.down_proj": {
        "snr": 33.20981979370117,
        "type": "mlp.down_proj"
    },
    "model.layers.8.mlp.down_proj": {
        "snr": 29.310104370117188,
        "type": "mlp.down_proj"
    },
    "model.layers.9.mlp.down_proj": {
        "snr": 26.638381958007812,
        "type": "mlp.down_proj"
    },
    "model.layers.10.mlp.down_proj": {
        "snr": 22.302486419677734,
        "type": "mlp.down_proj"
    },
    "model.layers.11.mlp.down_proj": {
        "snr": 18.249290466308594,
        "type": "mlp.down_proj"
    },
    "model.layers.12.mlp.down_proj": {
        "snr": 14.057564735412598,
        "type": "mlp.down_proj"
    },
    "model.layers.13.mlp.down_proj": {
        "snr": 22.154281616210938,
        "type": "mlp.down_proj"
    },
    "model.layers.14.mlp.down_proj": {
        "snr": 27.348575592041016,
        "type": "mlp.down_proj"
    },
    "model.layers.15.mlp.down_proj": {
        "snr": 28.447378158569336,
        "type": "mlp.down_proj"
    },
    "model.layers.16.mlp.down_proj": {
        "snr": 10.405216217041016,
        "type": "mlp.down_proj"
    },
    "model.layers.17.mlp.down_proj": {
        "snr": 10.71042251586914,
        "type": "mlp.down_proj"
    },
    "model.layers.18.mlp.down_proj": {
        "snr": 8.642854690551758,
        "type": "mlp.down_proj"
    },
    "model.layers.19.mlp.down_proj": {
        "snr": 5.690433979034424,
        "type": "mlp.down_proj"
    },
    "model.layers.20.mlp.down_proj": {
        "snr": 23.869070053100586,
        "type": "mlp.down_proj"
    },
    "model.layers.21.mlp.down_proj": {
        "snr": 11.584356307983398,
        "type": "mlp.down_proj"
    },
    "model.layers.22.mlp.down_proj": {
        "snr": 5.992950916290283,
        "type": "mlp.down_proj"
    },
    "model.layers.23.mlp.down_proj": {
        "snr": 18.495361328125,
        "type": "mlp.down_proj"
    },
    "model.layers.24.mlp.down_proj": {
        "snr": 8.233827590942383,
        "type": "mlp.down_proj"
    },
    "model.layers.25.mlp.down_proj": {
        "snr": 12.626734733581543,
        "type": "mlp.down_proj"
    },
    "model.layers.26.mlp.down_proj": {
        "snr": 36.21802520751953,
        "type": "mlp.down_proj"
    },
    "model.layers.27.mlp.down_proj": {
        "snr": 19.932941436767578,
        "type": "mlp.down_proj"
    },
    "model.layers.28.mlp.down_proj": {
        "snr": 6.0616455078125,
        "type": "mlp.down_proj"
    },
    "model.layers.29.mlp.down_proj": {
        "snr": 5.363720417022705,
        "type": "mlp.down_proj"
    },
    "model.layers.30.mlp.down_proj": {
        "snr": 7.455615520477295,
        "type": "mlp.down_proj"
    },
    "model.layers.31.mlp.down_proj": {
        "snr": 7.8631815910339355,
        "type": "mlp.down_proj"
    },
    "model.layers.32.mlp.down_proj": {
        "snr": 8.706913948059082,
        "type": "mlp.down_proj"
    },
    "model.layers.33.mlp.down_proj": {
        "snr": 21.220134735107422,
        "type": "mlp.down_proj"
    },
    "model.layers.34.mlp.down_proj": {
        "snr": 32.33852005004883,
        "type": "mlp.down_proj"
    },
    "model.layers.35.mlp.down_proj": {
        "snr": 179.8906707763672,
        "type": "mlp.down_proj"
    },
    "model.layers.0.mlp.gate_proj": {
        "snr": 0.24970805644989014,
        "type": "mlp.gate_proj"
    },
    "model.layers.1.mlp.gate_proj": {
        "snr": 0.11607512086629868,
        "type": "mlp.gate_proj"
    },
    "model.layers.2.mlp.gate_proj": {
        "snr": 0.16310769319534302,
        "type": "mlp.gate_proj"
    },
    "model.layers.3.mlp.gate_proj": {
        "snr": 0.3621424436569214,
        "type": "mlp.gate_proj"
    },
    "model.layers.4.mlp.gate_proj": {
        "snr": 0.3482637107372284,
        "type": "mlp.gate_proj"
    },
    "model.layers.5.mlp.gate_proj": {
        "snr": 2.6533455848693848,
        "type": "mlp.gate_proj"
    },
    "model.layers.6.mlp.gate_proj": {
        "snr": 8.359040260314941,
        "type": "mlp.gate_proj"
    },
    "model.layers.7.mlp.gate_proj": {
        "snr": 7.382037162780762,
        "type": "mlp.gate_proj"
    },
    "model.layers.8.mlp.gate_proj": {
        "snr": 13.00683879852295,
        "type": "mlp.gate_proj"
    },
    "model.layers.9.mlp.gate_proj": {
        "snr": 14.936161994934082,
        "type": "mlp.gate_proj"
    },
    "model.layers.10.mlp.gate_proj": {
        "snr": 20.907283782958984,
        "type": "mlp.gate_proj"
    },
    "model.layers.11.mlp.gate_proj": {
        "snr": 15.941497802734375,
        "type": "mlp.gate_proj"
    },
    "model.layers.12.mlp.gate_proj": {
        "snr": 8.97419548034668,
        "type": "mlp.gate_proj"
    },
    "model.layers.13.mlp.gate_proj": {
        "snr": 17.585100173950195,
        "type": "mlp.gate_proj"
    },
    "model.layers.14.mlp.gate_proj": {
        "snr": 17.21462059020996,
        "type": "mlp.gate_proj"
    },
    "model.layers.15.mlp.gate_proj": {
        "snr": 23.703285217285156,
        "type": "mlp.gate_proj"
    },
    "model.layers.16.mlp.gate_proj": {
        "snr": 16.986576080322266,
        "type": "mlp.gate_proj"
    },
    "model.layers.17.mlp.gate_proj": {
        "snr": 18.27729606628418,
        "type": "mlp.gate_proj"
    },
    "model.layers.18.mlp.gate_proj": {
        "snr": 12.63351058959961,
        "type": "mlp.gate_proj"
    },
    "model.layers.19.mlp.gate_proj": {
        "snr": 16.2633113861084,
        "type": "mlp.gate_proj"
    },
    "model.layers.20.mlp.gate_proj": {
        "snr": 7.399787902832031,
        "type": "mlp.gate_proj"
    },
    "model.layers.21.mlp.gate_proj": {
        "snr": 6.10424280166626,
        "type": "mlp.gate_proj"
    },
    "model.layers.22.mlp.gate_proj": {
        "snr": 5.363350868225098,
        "type": "mlp.gate_proj"
    },
    "model.layers.23.mlp.gate_proj": {
        "snr": 9.344535827636719,
        "type": "mlp.gate_proj"
    },
    "model.layers.24.mlp.gate_proj": {
        "snr": 7.647364616394043,
        "type": "mlp.gate_proj"
    },
    "model.layers.25.mlp.gate_proj": {
        "snr": 6.143579959869385,
        "type": "mlp.gate_proj"
    },
    "model.layers.26.mlp.gate_proj": {
        "snr": 14.254817008972168,
        "type": "mlp.gate_proj"
    },
    "model.layers.27.mlp.gate_proj": {
        "snr": 7.7000861167907715,
        "type": "mlp.gate_proj"
    },
    "model.layers.28.mlp.gate_proj": {
        "snr": 5.994422435760498,
        "type": "mlp.gate_proj"
    },
    "model.layers.29.mlp.gate_proj": {
        "snr": 6.041909694671631,
        "type": "mlp.gate_proj"
    },
    "model.layers.30.mlp.gate_proj": {
        "snr": 9.027522087097168,
        "type": "mlp.gate_proj"
    },
    "model.layers.31.mlp.gate_proj": {
        "snr": 5.450753211975098,
        "type": "mlp.gate_proj"
    },
    "model.layers.32.mlp.gate_proj": {
        "snr": 4.149200439453125,
        "type": "mlp.gate_proj"
    },
    "model.layers.33.mlp.gate_proj": {
        "snr": 5.223763942718506,
        "type": "mlp.gate_proj"
    },
    "model.layers.34.mlp.gate_proj": {
        "snr": 43.65521240234375,
        "type": "mlp.gate_proj"
    },
    "model.layers.35.mlp.gate_proj": {
        "snr": 45.312774658203125,
        "type": "mlp.gate_proj"
    },
    "model.layers.0.mlp.up_proj": {
        "snr": 0.7065013647079468,
        "type": "mlp.up_proj"
    },
    "model.layers.1.mlp.up_proj": {
        "snr": 0.17752516269683838,
        "type": "mlp.up_proj"
    },
    "model.layers.2.mlp.up_proj": {
        "snr": 0.2847473919391632,
        "type": "mlp.up_proj"
    },
    "model.layers.3.mlp.up_proj": {
        "snr": 0.6757690906524658,
        "type": "mlp.up_proj"
    },
    "model.layers.4.mlp.up_proj": {
        "snr": 0.8353318572044373,
        "type": "mlp.up_proj"
    },
    "model.layers.5.mlp.up_proj": {
        "snr": 3.940711736679077,
        "type": "mlp.up_proj"
    },
    "model.layers.6.mlp.up_proj": {
        "snr": 19.556047439575195,
        "type": "mlp.up_proj"
    },
    "model.layers.7.mlp.up_proj": {
        "snr": 36.19340515136719,
        "type": "mlp.up_proj"
    },
    "model.layers.8.mlp.up_proj": {
        "snr": 44.2518424987793,
        "type": "mlp.up_proj"
    },
    "model.layers.9.mlp.up_proj": {
        "snr": 45.418025970458984,
        "type": "mlp.up_proj"
    },
    "model.layers.10.mlp.up_proj": {
        "snr": 78.90928649902344,
        "type": "mlp.up_proj"
    },
    "model.layers.11.mlp.up_proj": {
        "snr": 52.24648666381836,
        "type": "mlp.up_proj"
    },
    "model.layers.12.mlp.up_proj": {
        "snr": 48.02030563354492,
        "type": "mlp.up_proj"
    },
    "model.layers.13.mlp.up_proj": {
        "snr": 56.016239166259766,
        "type": "mlp.up_proj"
    },
    "model.layers.14.mlp.up_proj": {
        "snr": 72.16619873046875,
        "type": "mlp.up_proj"
    },
    "model.layers.15.mlp.up_proj": {
        "snr": 54.75283432006836,
        "type": "mlp.up_proj"
    },
    "model.layers.16.mlp.up_proj": {
        "snr": 47.204097747802734,
        "type": "mlp.up_proj"
    },
    "model.layers.17.mlp.up_proj": {
        "snr": 51.549312591552734,
        "type": "mlp.up_proj"
    },
    "model.layers.18.mlp.up_proj": {
        "snr": 43.43872833251953,
        "type": "mlp.up_proj"
    },
    "model.layers.19.mlp.up_proj": {
        "snr": 38.09785461425781,
        "type": "mlp.up_proj"
    },
    "model.layers.20.mlp.up_proj": {
        "snr": 21.767858505249023,
        "type": "mlp.up_proj"
    },
    "model.layers.21.mlp.up_proj": {
        "snr": 22.243661880493164,
        "type": "mlp.up_proj"
    },
    "model.layers.22.mlp.up_proj": {
        "snr": 30.71843147277832,
        "type": "mlp.up_proj"
    },
    "model.layers.23.mlp.up_proj": {
        "snr": 28.5756778717041,
        "type": "mlp.up_proj"
    },
    "model.layers.24.mlp.up_proj": {
        "snr": 20.186717987060547,
        "type": "mlp.up_proj"
    },
    "model.layers.25.mlp.up_proj": {
        "snr": 20.742860794067383,
        "type": "mlp.up_proj"
    },
    "model.layers.26.mlp.up_proj": {
        "snr": 33.777984619140625,
        "type": "mlp.up_proj"
    },
    "model.layers.27.mlp.up_proj": {
        "snr": 17.254213333129883,
        "type": "mlp.up_proj"
    },
    "model.layers.28.mlp.up_proj": {
        "snr": 13.930026054382324,
        "type": "mlp.up_proj"
    },
    "model.layers.29.mlp.up_proj": {
        "snr": 16.17984390258789,
        "type": "mlp.up_proj"
    },
    "model.layers.30.mlp.up_proj": {
        "snr": 24.236648559570312,
        "type": "mlp.up_proj"
    },
    "model.layers.31.mlp.up_proj": {
        "snr": 10.665648460388184,
        "type": "mlp.up_proj"
    },
    "model.layers.32.mlp.up_proj": {
        "snr": 5.735939025878906,
        "type": "mlp.up_proj"
    },
    "model.layers.33.mlp.up_proj": {
        "snr": 11.592061042785645,
        "type": "mlp.up_proj"
    },
    "model.layers.34.mlp.up_proj": {
        "snr": 25.923419952392578,
        "type": "mlp.up_proj"
    },
    "model.layers.35.mlp.up_proj": {
        "snr": 38.579349517822266,
        "type": "mlp.up_proj"
    },
    "model.embed_tokens": {
        "snr": Infinity,
        "type": "model.embed_tokens"
    },
    "model.norm": {
        "snr": Infinity,
        "type": "model.norm"
    },
    "model.layers.0.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.1.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.2.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.3.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.4.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.5.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.6.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.7.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.8.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.9.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.10.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.11.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.12.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.13.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.14.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.15.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.16.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.17.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.18.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.19.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.20.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.21.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.22.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.23.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.24.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.25.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.26.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.27.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.28.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.29.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.30.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.31.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.32.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.33.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.34.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.35.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.0.self_attn.k_proj": {
        "snr": 12.24727725982666,
        "type": "self_attn.k_proj"
    },
    "model.layers.1.self_attn.k_proj": {
        "snr": 0.6436238288879395,
        "type": "self_attn.k_proj"
    },
    "model.layers.2.self_attn.k_proj": {
        "snr": 0.7156716585159302,
        "type": "self_attn.k_proj"
    },
    "model.layers.3.self_attn.k_proj": {
        "snr": 5.505439758300781,
        "type": "self_attn.k_proj"
    },
    "model.layers.4.self_attn.k_proj": {
        "snr": 3.0760715007781982,
        "type": "self_attn.k_proj"
    },
    "model.layers.5.self_attn.k_proj": {
        "snr": 1.0453941822052002,
        "type": "self_attn.k_proj"
    },
    "model.layers.6.self_attn.k_proj": {
        "snr": 3.57472562789917,
        "type": "self_attn.k_proj"
    },
    "model.layers.7.self_attn.k_proj": {
        "snr": 3.3765170574188232,
        "type": "self_attn.k_proj"
    },
    "model.layers.8.self_attn.k_proj": {
        "snr": 2.8859639167785645,
        "type": "self_attn.k_proj"
    },
    "model.layers.9.self_attn.k_proj": {
        "snr": 6.09852409362793,
        "type": "self_attn.k_proj"
    },
    "model.layers.10.self_attn.k_proj": {
        "snr": 2.229580879211426,
        "type": "self_attn.k_proj"
    },
    "model.layers.11.self_attn.k_proj": {
        "snr": 2.173879623413086,
        "type": "self_attn.k_proj"
    },
    "model.layers.12.self_attn.k_proj": {
        "snr": 1.3220131397247314,
        "type": "self_attn.k_proj"
    },
    "model.layers.13.self_attn.k_proj": {
        "snr": 2.61668062210083,
        "type": "self_attn.k_proj"
    },
    "model.layers.14.self_attn.k_proj": {
        "snr": 2.606799840927124,
        "type": "self_attn.k_proj"
    },
    "model.layers.15.self_attn.k_proj": {
        "snr": 1.5080311298370361,
        "type": "self_attn.k_proj"
    },
    "model.layers.16.self_attn.k_proj": {
        "snr": 1.2841484546661377,
        "type": "self_attn.k_proj"
    },
    "model.layers.17.self_attn.k_proj": {
        "snr": 0.8896433115005493,
        "type": "self_attn.k_proj"
    },
    "model.layers.18.self_attn.k_proj": {
        "snr": 1.8873414993286133,
        "type": "self_attn.k_proj"
    },
    "model.layers.19.self_attn.k_proj": {
        "snr": 0.8897770643234253,
        "type": "self_attn.k_proj"
    },
    "model.layers.20.self_attn.k_proj": {
        "snr": 0.9051405787467957,
        "type": "self_attn.k_proj"
    },
    "model.layers.21.self_attn.k_proj": {
        "snr": 0.7568970322608948,
        "type": "self_attn.k_proj"
    },
    "model.layers.22.self_attn.k_proj": {
        "snr": 0.9403582811355591,
        "type": "self_attn.k_proj"
    },
    "model.layers.23.self_attn.k_proj": {
        "snr": 3.777062177658081,
        "type": "self_attn.k_proj"
    },
    "model.layers.24.self_attn.k_proj": {
        "snr": 1.3280683755874634,
        "type": "self_attn.k_proj"
    },
    "model.layers.25.self_attn.k_proj": {
        "snr": 1.3980307579040527,
        "type": "self_attn.k_proj"
    },
    "model.layers.26.self_attn.k_proj": {
        "snr": 1.2435240745544434,
        "type": "self_attn.k_proj"
    },
    "model.layers.27.self_attn.k_proj": {
        "snr": 0.7732619047164917,
        "type": "self_attn.k_proj"
    },
    "model.layers.28.self_attn.k_proj": {
        "snr": 2.6010243892669678,
        "type": "self_attn.k_proj"
    },
    "model.layers.29.self_attn.k_proj": {
        "snr": 2.232773780822754,
        "type": "self_attn.k_proj"
    },
    "model.layers.30.self_attn.k_proj": {
        "snr": 2.4743099212646484,
        "type": "self_attn.k_proj"
    },
    "model.layers.31.self_attn.k_proj": {
        "snr": 1.11082923412323,
        "type": "self_attn.k_proj"
    },
    "model.layers.32.self_attn.k_proj": {
        "snr": 0.8243986368179321,
        "type": "self_attn.k_proj"
    },
    "model.layers.33.self_attn.k_proj": {
        "snr": 0.932928204536438,
        "type": "self_attn.k_proj"
    },
    "model.layers.34.self_attn.k_proj": {
        "snr": 6.608611583709717,
        "type": "self_attn.k_proj"
    },
    "model.layers.35.self_attn.k_proj": {
        "snr": 10.160987854003906,
        "type": "self_attn.k_proj"
    },
    "model.layers.0.self_attn.o_proj": {
        "snr": 0.36662933230400085,
        "type": "self_attn.o_proj"
    },
    "model.layers.1.self_attn.o_proj": {
        "snr": 0.1955128312110901,
        "type": "self_attn.o_proj"
    },
    "model.layers.2.self_attn.o_proj": {
        "snr": 0.22419843077659607,
        "type": "self_attn.o_proj"
    },
    "model.layers.3.self_attn.o_proj": {
        "snr": 0.25902292132377625,
        "type": "self_attn.o_proj"
    },
    "model.layers.4.self_attn.o_proj": {
        "snr": 0.2567676901817322,
        "type": "self_attn.o_proj"
    },
    "model.layers.5.self_attn.o_proj": {
        "snr": 0.2560890316963196,
        "type": "self_attn.o_proj"
    },
    "model.layers.6.self_attn.o_proj": {
        "snr": 0.18518221378326416,
        "type": "self_attn.o_proj"
    },
    "model.layers.7.self_attn.o_proj": {
        "snr": 0.23254290223121643,
        "type": "self_attn.o_proj"
    },
    "model.layers.8.self_attn.o_proj": {
        "snr": 0.2203962802886963,
        "type": "self_attn.o_proj"
    },
    "model.layers.9.self_attn.o_proj": {
        "snr": 0.217017263174057,
        "type": "self_attn.o_proj"
    },
    "model.layers.10.self_attn.o_proj": {
        "snr": 0.22843335568904877,
        "type": "self_attn.o_proj"
    },
    "model.layers.11.self_attn.o_proj": {
        "snr": 0.23816843330860138,
        "type": "self_attn.o_proj"
    },
    "model.layers.12.self_attn.o_proj": {
        "snr": 0.17585325241088867,
        "type": "self_attn.o_proj"
    },
    "model.layers.13.self_attn.o_proj": {
        "snr": 0.20451271533966064,
        "type": "self_attn.o_proj"
    },
    "model.layers.14.self_attn.o_proj": {
        "snr": 0.2095799297094345,
        "type": "self_attn.o_proj"
    },
    "model.layers.15.self_attn.o_proj": {
        "snr": 0.23767071962356567,
        "type": "self_attn.o_proj"
    },
    "model.layers.16.self_attn.o_proj": {
        "snr": 0.16328400373458862,
        "type": "self_attn.o_proj"
    },
    "model.layers.17.self_attn.o_proj": {
        "snr": 0.20690056681632996,
        "type": "self_attn.o_proj"
    },
    "model.layers.18.self_attn.o_proj": {
        "snr": 0.18191492557525635,
        "type": "self_attn.o_proj"
    },
    "model.layers.19.self_attn.o_proj": {
        "snr": 0.1945018619298935,
        "type": "self_attn.o_proj"
    },
    "model.layers.20.self_attn.o_proj": {
        "snr": 0.26658856868743896,
        "type": "self_attn.o_proj"
    },
    "model.layers.21.self_attn.o_proj": {
        "snr": 0.16897724568843842,
        "type": "self_attn.o_proj"
    },
    "model.layers.22.self_attn.o_proj": {
        "snr": 0.18773262202739716,
        "type": "self_attn.o_proj"
    },
    "model.layers.23.self_attn.o_proj": {
        "snr": 0.18808405101299286,
        "type": "self_attn.o_proj"
    },
    "model.layers.24.self_attn.o_proj": {
        "snr": 0.17919476330280304,
        "type": "self_attn.o_proj"
    },
    "model.layers.25.self_attn.o_proj": {
        "snr": 0.1793426126241684,
        "type": "self_attn.o_proj"
    },
    "model.layers.26.self_attn.o_proj": {
        "snr": 0.1777871698141098,
        "type": "self_attn.o_proj"
    },
    "model.layers.27.self_attn.o_proj": {
        "snr": 0.20279864966869354,
        "type": "self_attn.o_proj"
    },
    "model.layers.28.self_attn.o_proj": {
        "snr": 0.17030371725559235,
        "type": "self_attn.o_proj"
    },
    "model.layers.29.self_attn.o_proj": {
        "snr": 0.1992504596710205,
        "type": "self_attn.o_proj"
    },
    "model.layers.30.self_attn.o_proj": {
        "snr": 0.23085352778434753,
        "type": "self_attn.o_proj"
    },
    "model.layers.31.self_attn.o_proj": {
        "snr": 0.1641533523797989,
        "type": "self_attn.o_proj"
    },
    "model.layers.32.self_attn.o_proj": {
        "snr": 0.10621391236782074,
        "type": "self_attn.o_proj"
    },
    "model.layers.33.self_attn.o_proj": {
        "snr": 0.09411631524562836,
        "type": "self_attn.o_proj"
    },
    "model.layers.34.self_attn.o_proj": {
        "snr": 0.13239727914333344,
        "type": "self_attn.o_proj"
    },
    "model.layers.35.self_attn.o_proj": {
        "snr": 0.11740171164274216,
        "type": "self_attn.o_proj"
    },
    "model.layers.0.self_attn.q_proj": {
        "snr": 0.055595725774765015,
        "type": "self_attn.q_proj"
    },
    "model.layers.1.self_attn.q_proj": {
        "snr": 0.13823610544204712,
        "type": "self_attn.q_proj"
    },
    "model.layers.2.self_attn.q_proj": {
        "snr": 0.1297825127840042,
        "type": "self_attn.q_proj"
    },
    "model.layers.3.self_attn.q_proj": {
        "snr": 0.15291297435760498,
        "type": "self_attn.q_proj"
    },
    "model.layers.4.self_attn.q_proj": {
        "snr": 0.15615035593509674,
        "type": "self_attn.q_proj"
    },
    "model.layers.5.self_attn.q_proj": {
        "snr": 0.15535500645637512,
        "type": "self_attn.q_proj"
    },
    "model.layers.6.self_attn.q_proj": {
        "snr": 0.15993140637874603,
        "type": "self_attn.q_proj"
    },
    "model.layers.7.self_attn.q_proj": {
        "snr": 0.1753682643175125,
        "type": "self_attn.q_proj"
    },
    "model.layers.8.self_attn.q_proj": {
        "snr": 0.1664913445711136,
        "type": "self_attn.q_proj"
    },
    "model.layers.9.self_attn.q_proj": {
        "snr": 0.15656901895999908,
        "type": "self_attn.q_proj"
    },
    "model.layers.10.self_attn.q_proj": {
        "snr": 0.18300014734268188,
        "type": "self_attn.q_proj"
    },
    "model.layers.11.self_attn.q_proj": {
        "snr": 0.1713649481534958,
        "type": "self_attn.q_proj"
    },
    "model.layers.12.self_attn.q_proj": {
        "snr": 0.1809009313583374,
        "type": "self_attn.q_proj"
    },
    "model.layers.13.self_attn.q_proj": {
        "snr": 0.20895132422447205,
        "type": "self_attn.q_proj"
    },
    "model.layers.14.self_attn.q_proj": {
        "snr": 0.17413195967674255,
        "type": "self_attn.q_proj"
    },
    "model.layers.15.self_attn.q_proj": {
        "snr": 0.20878490805625916,
        "type": "self_attn.q_proj"
    },
    "model.layers.16.self_attn.q_proj": {
        "snr": 0.1547088772058487,
        "type": "self_attn.q_proj"
    },
    "model.layers.17.self_attn.q_proj": {
        "snr": 0.1943129003047943,
        "type": "self_attn.q_proj"
    },
    "model.layers.18.self_attn.q_proj": {
        "snr": 0.1889297217130661,
        "type": "self_attn.q_proj"
    },
    "model.layers.19.self_attn.q_proj": {
        "snr": 0.207680344581604,
        "type": "self_attn.q_proj"
    },
    "model.layers.20.self_attn.q_proj": {
        "snr": 0.20839959383010864,
        "type": "self_attn.q_proj"
    },
    "model.layers.21.self_attn.q_proj": {
        "snr": 0.18989044427871704,
        "type": "self_attn.q_proj"
    },
    "model.layers.22.self_attn.q_proj": {
        "snr": 0.18180623650550842,
        "type": "self_attn.q_proj"
    },
    "model.layers.23.self_attn.q_proj": {
        "snr": 0.2069384753704071,
        "type": "self_attn.q_proj"
    },
    "model.layers.24.self_attn.q_proj": {
        "snr": 0.1842993050813675,
        "type": "self_attn.q_proj"
    },
    "model.layers.25.self_attn.q_proj": {
        "snr": 0.2078687846660614,
        "type": "self_attn.q_proj"
    },
    "model.layers.26.self_attn.q_proj": {
        "snr": 0.19224946200847626,
        "type": "self_attn.q_proj"
    },
    "model.layers.27.self_attn.q_proj": {
        "snr": 0.15170617401599884,
        "type": "self_attn.q_proj"
    },
    "model.layers.28.self_attn.q_proj": {
        "snr": 0.20116600394248962,
        "type": "self_attn.q_proj"
    },
    "model.layers.29.self_attn.q_proj": {
        "snr": 0.19373668730258942,
        "type": "self_attn.q_proj"
    },
    "model.layers.30.self_attn.q_proj": {
        "snr": 0.18462225794792175,
        "type": "self_attn.q_proj"
    },
    "model.layers.31.self_attn.q_proj": {
        "snr": 0.18939673900604248,
        "type": "self_attn.q_proj"
    },
    "model.layers.32.self_attn.q_proj": {
        "snr": 0.20071947574615479,
        "type": "self_attn.q_proj"
    },
    "model.layers.33.self_attn.q_proj": {
        "snr": 0.19740056991577148,
        "type": "self_attn.q_proj"
    },
    "model.layers.34.self_attn.q_proj": {
        "snr": 0.17658494412899017,
        "type": "self_attn.q_proj"
    },
    "model.layers.35.self_attn.q_proj": {
        "snr": 0.1407373696565628,
        "type": "self_attn.q_proj"
    },
    "model.layers.0.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.1.self_attn.v_proj": {
        "snr": 846.30126953125,
        "type": "self_attn.v_proj"
    },
    "model.layers.2.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.3.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.4.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.5.self_attn.v_proj": {
        "snr": 83.83415222167969,
        "type": "self_attn.v_proj"
    },
    "model.layers.6.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.7.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.8.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.9.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.10.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.11.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.12.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.13.self_attn.v_proj": {
        "snr": 213.51316833496094,
        "type": "self_attn.v_proj"
    },
    "model.layers.14.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.15.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.16.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.17.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.18.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.19.self_attn.v_proj": {
        "snr": 18.92746925354004,
        "type": "self_attn.v_proj"
    },
    "model.layers.20.self_attn.v_proj": {
        "snr": 433.9771728515625,
        "type": "self_attn.v_proj"
    },
    "model.layers.21.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.22.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.23.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.24.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.25.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.26.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.27.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.28.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.29.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.30.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.31.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.32.self_attn.v_proj": {
        "snr": 1.2332282066345215,
        "type": "self_attn.v_proj"
    },
    "model.layers.33.self_attn.v_proj": {
        "snr": 0.6151890158653259,
        "type": "self_attn.v_proj"
    },
    "model.layers.34.self_attn.v_proj": {
        "snr": 509.7169189453125,
        "type": "self_attn.v_proj"
    },
    "model.layers.35.self_attn.v_proj": {
        "snr": 536.0748901367188,
        "type": "self_attn.v_proj"
    }
}


================================================
FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-7B-Instruct.json
================================================
{
    "model.layers.0.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.1.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.2.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.3.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.4.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.5.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.6.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.7.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.8.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.9.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.10.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.11.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.12.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.13.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.14.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.15.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.16.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.17.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.18.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.19.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.20.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.21.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.22.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.23.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.24.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.25.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.26.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.27.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "lm_head": {
        "snr": Infinity,
        "type": "lm_head"
    },
    "model.layers.0.mlp.down_proj": {
        "snr": 10.283808708190918,
        "type": "mlp.down_proj"
    },
    "model.layers.1.mlp.down_proj": {
        "snr": 1.2089825868606567,
        "type": "mlp.down_proj"
    },
    "model.layers.2.mlp.down_proj": {
        "snr": 19.309062957763672,
        "type": "mlp.down_proj"
    },
    "model.layers.3.mlp.down_proj": {
        "snr": 50.174461364746094,
        "type": "mlp.down_proj"
    },
    "model.layers.4.mlp.down_proj": {
        "snr": 114.28582763671875,
        "type": "mlp.down_proj"
    },
    "model.layers.5.mlp.down_proj": {
        "snr": 215.5762176513672,
        "type": "mlp.down_proj"
    },
    "model.layers.6.mlp.down_proj": {
        "snr": 204.5117950439453,
        "type": "mlp.down_proj"
    },
    "model.layers.7.mlp.down_proj": {
        "snr": 182.5479278564453,
        "type": "mlp.down_proj"
    },
    "model.layers.8.mlp.down_proj": {
        "snr": 74.92950439453125,
        "type": "mlp.down_proj"
    },
    "model.layers.9.mlp.down_proj": {
        "snr": 16.482666015625,
        "type": "mlp.down_proj"
    },
    "model.layers.10.mlp.down_proj": {
        "snr": 55.33920669555664,
        "type": "mlp.down_proj"
    },
    "model.layers.11.mlp.down_proj": {
        "snr": 16.851062774658203,
        "type": "mlp.down_proj"
    },
    "model.layers.12.mlp.down_proj": {
        "snr": 58.65230178833008,
        "type": "mlp.down_proj"
    },
    "model.layers.13.mlp.down_proj": {
        "snr": 11.150161743164062,
        "type": "mlp.down_proj"
    },
    "model.layers.14.mlp.down_proj": {
        "snr": 65.32643127441406,
        "type": "mlp.down_proj"
    },
    "model.layers.15.mlp.down_proj": {
        "snr": 46.736305236816406,
        "type": "mlp.down_proj"
    },
    "model.layers.16.mlp.down_proj": {
        "snr": 14.288785934448242,
        "type": "mlp.down_proj"
    },
    "model.layers.17.mlp.down_proj": {
        "snr": 23.40110206604004,
        "type": "mlp.down_proj"
    },
    "model.layers.18.mlp.down_proj": {
        "snr": 86.34363555908203,
        "type": "mlp.down_proj"
    },
    "model.layers.19.mlp.down_proj": {
        "snr": 49.14613342285156,
        "type": "mlp.down_proj"
    },
    "model.layers.20.mlp.down_proj": {
        "snr": 1276.84814453125,
        "type": "mlp.down_proj"
    },
    "model.layers.21.mlp.down_proj": {
        "snr": 51.803409576416016,
        "type": "mlp.down_proj"
    },
    "model.layers.22.mlp.down_proj": {
        "snr": 143.0666046142578,
        "type": "mlp.down_proj"
    },
    "model.layers.23.mlp.down_proj": {
        "snr": 35.14984893798828,
        "type": "mlp.down_proj"
    },
    "model.layers.24.mlp.down_proj": {
        "snr": 21.41700553894043,
        "type": "mlp.down_proj"
    },
    "model.layers.25.mlp.down_proj": {
        "snr": 10.651569366455078,
        "type": "mlp.down_proj"
    },
    "model.layers.26.mlp.down_proj": {
        "snr": 21.635149002075195,
        "type": "mlp.down_proj"
    },
    "model.layers.27.mlp.down_proj": {
        "snr": 1446.2774658203125,
        "type": "mlp.down_proj"
    },
    "model.layers.0.mlp.gate_proj": {
        "snr": 0.04497330263257027,
        "type": "mlp.gate_proj"
    },
    "model.layers.1.mlp.gate_proj": {
        "snr": 0.16888172924518585,
        "type": "mlp.gate_proj"
    },
    "model.layers.2.mlp.gate_proj": {
        "snr": 0.33653727173805237,
        "type": "mlp.gate_proj"
    },
    "model.layers.3.mlp.gate_proj": {
        "snr": 3.1445391178131104,
        "type": "mlp.gate_proj"
    },
    "model.layers.4.mlp.gate_proj": {
        "snr": 9.107144355773926,
        "type": "mlp.gate_proj"
    },
    "model.layers.5.mlp.gate_proj": {
        "snr": 15.909018516540527,
        "type": "mlp.gate_proj"
    },
    "model.layers.6.mlp.gate_proj": {
        "snr": 60.9138069152832,
        "type": "mlp.gate_proj"
    },
    "model.layers.7.mlp.gate_proj": {
        "snr": 57.570281982421875,
        "type": "mlp.gate_proj"
    },
    "model.layers.8.mlp.gate_proj": {
        "snr": 65.82791137695312,
        "type": "mlp.gate_proj"
    },
    "model.layers.9.mlp.gate_proj": {
        "snr": 10.455283164978027,
        "type": "mlp.gate_proj"
    },
    "model.layers.10.mlp.gate_proj": {
        "snr": 26.970706939697266,
        "type": "mlp.gate_proj"
    },
    "model.layers.11.mlp.gate_proj": {
        "snr": 31.139820098876953,
        "type": "mlp.gate_proj"
    },
    "model.layers.12.mlp.gate_proj": {
        "snr": 43.987159729003906,
        "type": "mlp.gate_proj"
    },
    "model.layers.13.mlp.gate_proj": {
        "snr": 20.704849243164062,
        "type": "mlp.gate_proj"
    },
    "model.layers.14.mlp.gate_proj": {
        "snr": 21.191452026367188,
        "type": "mlp.gate_proj"
    },
    "model.layers.15.mlp.gate_proj": {
        "snr": 42.66447830200195,
        "type": "mlp.gate_proj"
    },
    "model.layers.16.mlp.gate_proj": {
        "snr": 22.136825561523438,
        "type": "mlp.gate_proj"
    },
    "model.layers.17.mlp.gate_proj": {
        "snr": 22.60980987548828,
        "type": "mlp.gate_proj"
    },
    "model.layers.18.mlp.gate_proj": {
        "snr": 81.80574035644531,
        "type": "mlp.gate_proj"
    },
    "model.layers.19.mlp.gate_proj": {
        "snr": 20.88619613647461,
        "type": "mlp.gate_proj"
    },
    "model.layers.20.mlp.gate_proj": {
        "snr": 58.3524055480957,
        "type": "mlp.gate_proj"
    },
    "model.layers.21.mlp.gate_proj": {
        "snr": 22.786706924438477,
        "type": "mlp.gate_proj"
    },
    "model.layers.22.mlp.gate_proj": {
        "snr": 16.932226181030273,
        "type": "mlp.gate_proj"
    },
    "model.layers.23.mlp.gate_proj": {
        "snr": 16.819862365722656,
        "type": "mlp.gate_proj"
    },
    "model.layers.24.mlp.gate_proj": {
        "snr": 19.76348304748535,
        "type": "mlp.gate_proj"
    },
    "model.layers.25.mlp.gate_proj": {
        "snr": 28.98714256286621,
        "type": "mlp.gate_proj"
    },
    "model.layers.26.mlp.gate_proj": {
        "snr": 36.7071533203125,
        "type": "mlp.gate_proj"
    },
    "model.layers.27.mlp.gate_proj": {
        "snr": 51.81539535522461,
        "type": "mlp.gate_proj"
    },
    "model.layers.0.mlp.up_proj": {
        "snr": 0.2243107706308365,
        "type": "mlp.up_proj"
    },
    "model.layers.1.mlp.up_proj": {
        "snr": 0.4464716613292694,
        "type": "mlp.up_proj"
    },
    "model.layers.2.mlp.up_proj": {
        "snr": 1.7838181257247925,
        "type": "mlp.up_proj"
    },
    "model.layers.3.mlp.up_proj": {
        "snr": 17.912736892700195,
        "type": "mlp.up_proj"
    },
    "model.layers.4.mlp.up_proj": {
        "snr": 47.45841979980469,
        "type": "mlp.up_proj"
    },
    "model.layers.5.mlp.up_proj": {
        "snr": 56.3084602355957,
        "type": "mlp.up_proj"
    },
    "model.layers.6.mlp.up_proj": {
        "snr": 173.33717346191406,
        "type": "mlp.up_proj"
    },
    "model.layers.7.mlp.up_proj": {
        "snr": 148.22750854492188,
        "type": "mlp.up_proj"
    },
    "model.layers.8.mlp.up_proj": {
        "snr": 133.63565063476562,
        "type": "mlp.up_proj"
    },
    "model.layers.9.mlp.up_proj": {
        "snr": 83.65129852294922,
        "type": "mlp.up_proj"
    },
    "model.layers.10.mlp.up_proj": {
        "snr": 117.94369506835938,
        "type": "mlp.up_proj"
    },
    "model.layers.11.mlp.up_proj": {
        "snr": 94.52413940429688,
        "type": "mlp.up_proj"
    },
    "model.layers.12.mlp.up_proj": {
        "snr": 130.43333435058594,
        "type": "mlp.up_proj"
    },
    "model.layers.13.mlp.up_proj": {
        "snr": 76.11975860595703,
        "type": "mlp.up_proj"
    },
    "model.layers.14.mlp.up_proj": {
        "snr": 158.75192260742188,
        "type": "mlp.up_proj"
    },
    "model.layers.15.mlp.up_proj": {
        "snr": 143.72706604003906,
        "type": "mlp.up_proj"
    },
    "model.layers.16.mlp.up_proj": {
        "snr": 84.28279876708984,
        "type": "mlp.up_proj"
    },
    "model.layers.17.mlp.up_proj": {
        "snr": 116.65055084228516,
        "type": "mlp.up_proj"
    },
    "model.layers.18.mlp.up_proj": {
        "snr": 177.1201934814453,
        "type": "mlp.up_proj"
    },
    "model.layers.19.mlp.up_proj": {
        "snr": 82.4564437866211,
        "type": "mlp.up_proj"
    },
    "model.layers.20.mlp.up_proj": {
        "snr": 137.73019409179688,
        "type": "mlp.up_proj"
    },
    "model.layers.21.mlp.up_proj": {
        "snr": 89.97538757324219,
        "type": "mlp.up_proj"
    },
    "model.layers.22.mlp.up_proj": {
        "snr": 86.30876159667969,
        "type": "mlp.up_proj"
    },
    "model.layers.23.mlp.up_proj": {
        "snr": 61.53449249267578,
        "type": "mlp.up_proj"
    },
    "model.layers.24.mlp.up_proj": {
        "snr": 45.22392654418945,
        "type": "mlp.up_proj"
    },
    "model.layers.25.mlp.up_proj": {
        "snr": 60.3155517578125,
        "type": "mlp.up_proj"
    },
    "model.layers.26.mlp.up_proj": {
        "snr": 40.06092071533203,
        "type": "mlp.up_proj"
    },
    "model.layers.27.mlp.up_proj": {
        "snr": 48.12322998046875,
        "type": "mlp.up_proj"
    },
    "model.embed_tokens": {
        "snr": 0.08805440366268158,
        "type": "model.embed_tokens"
    },
    "model.norm": {
        "snr": Infinity,
        "type": "model.norm"
    },
    "model.layers.0.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.1.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.2.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.3.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.4.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.5.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.6.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.7.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.8.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.9.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.10.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.11.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.12.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.13.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.14.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.15.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.16.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.17.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.18.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.19.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.20.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.21.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.22.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.23.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.24.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.25.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.26.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.27.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.0.self_attn.k_proj": {
        "snr": 4.771554470062256,
        "type": "self_attn.k_proj"
    },
    "model.layers.1.self_attn.k_proj": {
        "snr": 0.46674421429634094,
        "type": "self_attn.k_proj"
    },
    "model.layers.2.self_attn.k_proj": {
        "snr": 1.6167784929275513,
        "type": "self_attn.k_proj"
    },
    "model.layers.3.self_attn.k_proj": {
        "snr": 2.0980119705200195,
        "type": "self_attn.k_proj"
    },
    "model.layers.4.self_attn.k_proj": {
        "snr": 1.4339035749435425,
        "type": "self_attn.k_proj"
    },
    "model.layers.5.self_attn.k_proj": {
        "snr": 1.7446703910827637,
        "type": "self_attn.k_proj"
    },
    "model.layers.6.self_attn.k_proj": {
        "snr": 1.2829725742340088,
        "type": "self_attn.k_proj"
    },
    "model.layers.7.self_attn.k_proj": {
        "snr": 2.2314982414245605,
        "type": "self_attn.k_proj"
    },
    "model.layers.8.self_attn.k_proj": {
        "snr": 1.5125916004180908,
        "type": "self_attn.k_proj"
    },
    "model.layers.9.self_attn.k_proj": {
        "snr": 1.2817912101745605,
        "type": "self_attn.k_proj"
    },
    "model.layers.10.self_attn.k_proj": {
        "snr": 3.3553454875946045,
        "type": "self_attn.k_proj"
    },
    "model.layers.11.self_attn.k_proj": {
        "snr": 1.591347336769104,
        "type": "self_attn.k_proj"
    },
    "model.layers.12.self_attn.k_proj": {
        "snr": 1.1114169359207153,
        "type": "self_attn.k_proj"
    },
    "model.layers.13.self_attn.k_proj": {
        "snr": 1.1536189317703247,
        "type": "self_attn.k_proj"
    },
    "model.layers.14.self_attn.k_proj": {
        "snr": 0.994098424911499,
        "type": "self_attn.k_proj"
    },
    "model.layers.15.self_attn.k_proj": {
        "snr": 1.484580636024475,
        "type": "self_attn.k_proj"
    },
    "model.layers.16.self_attn.k_proj": {
        "snr": 1.2999093532562256,
        "type": "self_attn.k_proj"
    },
    "model.layers.17.self_attn.k_proj": {
        "snr": 2.1628623008728027,
        "type": "self_attn.k_proj"
    },
    "model.layers.18.self_attn.k_proj": {
        "snr": 1.3842225074768066,
        "type": "self_attn.k_proj"
    },
    "model.layers.19.self_attn.k_proj": {
        "snr": 1.440075159072876,
        "type": "self_attn.k_proj"
    },
    "model.layers.20.self_attn.k_proj": {
        "snr": 1.7816450595855713,
        "type": "self_attn.k_proj"
    },
    "model.layers.21.self_attn.k_proj": {
        "snr": 1.746536135673523,
        "type": "self_attn.k_proj"
    },
    "model.layers.22.self_attn.k_proj": {
        "snr": 1.318993091583252,
        "type": "self_attn.k_proj"
    },
    "model.layers.23.self_attn.k_proj": {
        "snr": 1.7234206199645996,
        "type": "self_attn.k_proj"
    },
    "model.layers.24.self_attn.k_proj": {
        "snr": 2.586996555328369,
        "type": "self_attn.k_proj"
    },
    "model.layers.25.self_attn.k_proj": {
        "snr": 1.6486897468566895,
        "type": "self_attn.k_proj"
    },
    "model.layers.26.self_attn.k_proj": {
        "snr": 1.3349357843399048,
        "type": "self_attn.k_proj"
    },
    "model.layers.27.self_attn.k_proj": {
        "snr": 0.9039687514305115,
        "type": "self_attn.k_proj"
    },
    "model.layers.0.self_attn.o_proj": {
        "snr": 0.10605750232934952,
        "type": "self_attn.o_proj"
    },
    "model.layers.1.self_attn.o_proj": {
        "snr": 0.2503393292427063,
        "type": "self_attn.o_proj"
    },
    "model.layers.2.self_attn.o_proj": {
        "snr": 0.21453581750392914,
        "type": "self_attn.o_proj"
    },
    "model.layers.3.self_attn.o_proj": {
        "snr": 0.20600366592407227,
        "type": "self_attn.o_proj"
    },
    "model.layers.4.self_attn.o_proj": {
        "snr": 0.22004099190235138,
        "type": "self_attn.o_proj"
    },
    "model.layers.5.self_attn.o_proj": {
        "snr": 0.2267625778913498,
        "type": "self_attn.o_proj"
    },
    "model.layers.6.self_attn.o_proj": {
        "snr": 0.1736888736486435,
        "type": "self_attn.o_proj"
    },
    "model.layers.7.self_attn.o_proj": {
        "snr": 0.2314220815896988,
        "type": "self_attn.o_proj"
    },
    "model.layers.8.self_attn.o_proj": {
        "snr": 0.24031606316566467,
        "type": "self_attn.o_proj"
    },
    "model.layers.9.self_attn.o_proj": {
        "snr": 0.13458871841430664,
        "type": "self_attn.o_proj"
    },
    "model.layers.10.self_attn.o_proj": {
        "snr": 0.20170633494853973,
        "type": "self_attn.o_proj"
    },
    "model.layers.11.self_attn.o_proj": {
        "snr": 0.19507651031017303,
        "type": "self_attn.o_proj"
    },
    "model.layers.12.self_attn.o_proj": {
        "snr": 0.1862162947654724,
        "type": "self_attn.o_proj"
    },
    "model.layers.13.self_attn.o_proj": {
        "snr": 0.15117767453193665,
        "type": "self_attn.o_proj"
    },
    "model.layers.14.self_attn.o_proj": {
        "snr": 0.1857745349407196,
        "type": "self_attn.o_proj"
    },
    "model.layers.15.self_attn.o_proj": {
        "snr": 0.2064860314130783,
        "type": "self_attn.o_proj"
    },
    "model.layers.16.self_attn.o_proj": {
        "snr": 0.15419450402259827,
        "type": "self_attn.o_proj"
    },
    "model.layers.17.self_attn.o_proj": {
        "snr": 0.17895667254924774,
        "type": "self_attn.o_proj"
    },
    "model.layers.18.self_attn.o_proj": {
        "snr": 0.18284623324871063,
        "type": "self_attn.o_proj"
    },
    "model.layers.19.self_attn.o_proj": {
        "snr": 0.17497135698795319,
        "type": "self_attn.o_proj"
    },
    "model.layers.20.self_attn.o_proj": {
        "snr": 0.178844153881073,
        "type": "self_attn.o_proj"
    },
    "model.layers.21.self_attn.o_proj": {
        "snr": 0.16190896928310394,
        "type": "self_attn.o_proj"
    },
    "model.layers.22.self_attn.o_proj": {
        "snr": 0.19371949136257172,
        "type": "self_attn.o_proj"
    },
    "model.layers.23.self_attn.o_proj": {
        "snr": 0.14116843044757843,
        "type": "self_attn.o_proj"
    },
    "model.layers.24.self_attn.o_proj": {
        "snr": 0.14100700616836548,
        "type": "self_attn.o_proj"
    },
    "model.layers.25.self_attn.o_proj": {
        "snr": 0.14792074263095856,
        "type": "self_attn.o_proj"
    },
    "model.layers.26.self_attn.o_proj": {
        "snr": 0.11953117698431015,
        "type": "self_attn.o_proj"
    },
    "model.layers.27.self_attn.o_proj": {
        "snr": 0.06241385638713837,
        "type": "self_attn.o_proj"
    },
    "model.layers.0.self_attn.q_proj": {
        "snr": 0.02127065323293209,
        "type": "self_attn.q_proj"
    },
    "model.layers.1.self_attn.q_proj": {
        "snr": 0.14693336188793182,
        "type": "self_attn.q_proj"
    },
    "model.layers.2.self_attn.q_proj": {
        "snr": 0.16316214203834534,
        "type": "self_attn.q_proj"
    },
    "model.layers.3.self_attn.q_proj": {
        "snr": 0.1218630000948906,
        "type": "self_attn.q_proj"
    },
    "model.layers.4.self_attn.q_proj": {
        "snr": 0.13916714489459991,
        "type": "self_attn.q_proj"
    },
    "model.layers.5.self_attn.q_proj": {
        "snr": 0.155359148979187,
        "type": "self_attn.q_proj"
    },
    "model.layers.6.self_attn.q_proj": {
        "snr": 0.1590007096529007,
        "type": "self_attn.q_proj"
    },
    "model.layers.7.self_attn.q_proj": {
        "snr": 0.1958903819322586,
        "type": "self_attn.q_proj"
    },
    "model.layers.8.self_attn.q_proj": {
        "snr": 0.22448301315307617,
        "type": "self_attn.q_proj"
    },
    "model.layers.9.self_attn.q_proj": {
        "snr": 0.20126597583293915,
        "type": "self_attn.q_proj"
    },
    "model.layers.10.self_attn.q_proj": {
        "snr": 0.1980895698070526,
        "type": "self_attn.q_proj"
    },
    "model.layers.11.self_attn.q_proj": {
        "snr": 0.2289486974477768,
        "type": "self_attn.q_proj"
    },
    "model.layers.12.self_attn.q_proj": {
        "snr": 0.22922305762767792,
        "type": "self_attn.q_proj"
    },
    "model.layers.13.self_attn.q_proj": {
        "snr": 0.21452386677265167,
        "type": "self_attn.q_proj"
    },
    "model.layers.14.self_attn.q_proj": {
        "snr": 0.24151542782783508,
        "type": "self_attn.q_proj"
    },
    "model.layers.15.self_attn.q_proj": {
        "snr": 0.21893717348575592,
        "type": "self_attn.q_proj"
    },
    "model.layers.16.self_attn.q_proj": {
        "snr": 0.2321016639471054,
        "type": "self_attn.q_proj"
    },
    "model.layers.17.self_attn.q_proj": {
        "snr": 0.24078059196472168,
        "type": "self_attn.q_proj"
    },
    "model.layers.18.self_attn.q_proj": {
        "snr": 0.22774985432624817,
        "type": "self_attn.q_proj"
    },
    "model.layers.19.self_attn.q_proj": {
        "snr": 0.20914016664028168,
        "type": "self_attn.q_proj"
    },
    "model.layers.20.self_attn.q_proj": {
        "snr": 0.22847522795200348,
        "type": "self_attn.q_proj"
    },
    "model.layers.21.self_attn.q_proj": {
        "snr": 0.2500442862510681,
        "type": "self_attn.q_proj"
    },
    "model.layers.22.self_attn.q_proj": {
        "snr": 0.2353251725435257,
        "type": "self_attn.q_proj"
    },
    "model.layers.23.self_attn.q_proj": {
        "snr": 0.20365388691425323,
        "type": "self_attn.q_proj"
    },
    "model.layers.24.self_attn.q_proj": {
        "snr": 0.21967172622680664,
        "type": "self_attn.q_proj"
    },
    "model.layers.25.self_attn.q_proj": {
        "snr": 0.2122868150472641,
        "type": "self_attn.q_proj"
    },
    "model.layers.26.self_attn.q_proj": {
        "snr": 0.2415798157453537,
        "type": "self_attn.q_proj"
    },
    "model.layers.27.self_attn.q_proj": {
        "snr": 0.12347634881734848,
        "type": "self_attn.q_proj"
    },
    "model.layers.0.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.1.self_attn.v_proj": {
        "snr": 230.88636779785156,
        "type": "self_attn.v_proj"
    },
    "model.layers.2.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.3.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.4.self_attn.v_proj": {
        "snr": 22.38136100769043,
        "type": "self_attn.v_proj"
    },
    "model.layers.5.self_attn.v_proj": {
        "snr": 246.59597778320312,
        "type": "self_attn.v_proj"
    },
    "model.layers.6.self_attn.v_proj": {
        "snr": 499.61761474609375,
        "type": "self_attn.v_proj"
    },
    "model.layers.7.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.8.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.9.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.10.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.11.self_attn.v_proj": {
        "snr": 69.18345642089844,
        "type": "self_attn.v_proj"
    },
    "model.layers.12.self_attn.v_proj": {
        "snr": 984.9320068359375,
        "type": "self_attn.v_proj"
    },
    "model.layers.13.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.14.self_attn.v_proj": {
        "snr": 64.06214141845703,
        "type": "self_attn.v_proj"
    },
    "model.layers.15.self_attn.v_proj": {
        "snr": 28.43911361694336,
        "type": "self_attn.v_proj"
    },
    "model.layers.16.self_attn.v_proj": {
        "snr": 725.1439819335938,
        "type": "self_attn.v_proj"
    },
    "model.layers.17.self_attn.v_proj": {
        "snr": 63.43681716918945,
        "type": "self_attn.v_proj"
    },
    "model.layers.18.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.19.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.20.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.21.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.22.self_attn.v_proj": {
        "snr": 238.4695587158203,
        "type": "self_attn.v_proj"
    },
    "model.layers.23.self_attn.v_proj": {
        "snr": 111.88697814941406,
        "type": "self_attn.v_proj"
    },
    "model.layers.24.self_attn.v_proj": {
        "snr": 686.2830200195312,
        "type": "self_attn.v_proj"
    },
    "model.layers.25.self_attn.v_proj": {
        "snr": 566.2647705078125,
        "type": "self_attn.v_proj"
    },
    "model.layers.26.self_attn.v_proj": {
        "snr": 4.070064544677734,
        "type": "self_attn.v_proj"
    },
    "model.layers.27.self_attn.v_proj": {
        "snr": 4.3411664962768555,
        "type": "self_attn.v_proj"
    }
}


================================================
FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_Qwen-Qwen2.5-7B.json
================================================
{
    "model.layers.0.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.1.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.2.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.3.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.4.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.5.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.6.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.7.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.8.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.9.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.10.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.11.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.12.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.13.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.14.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.15.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.16.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.17.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.18.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.19.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.20.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.21.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.22.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.23.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.24.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.25.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.26.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.27.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "lm_head": {
        "snr": Infinity,
        "type": "lm_head"
    },
    "model.layers.0.mlp.down_proj": {
        "snr": 10.277782440185547,
        "type": "mlp.down_proj"
    },
    "model.layers.1.mlp.down_proj": {
        "snr": 1.2050706148147583,
        "type": "mlp.down_proj"
    },
    "model.layers.2.mlp.down_proj": {
        "snr": 19.284534454345703,
        "type": "mlp.down_proj"
    },
    "model.layers.3.mlp.down_proj": {
        "snr": 50.16513442993164,
        "type": "mlp.down_proj"
    },
    "model.layers.4.mlp.down_proj": {
        "snr": 114.24882507324219,
        "type": "mlp.down_proj"
    },
    "model.layers.5.mlp.down_proj": {
        "snr": 215.48194885253906,
        "type": "mlp.down_proj"
    },
    "model.layers.6.mlp.down_proj": {
        "snr": 204.39431762695312,
        "type": "mlp.down_proj"
    },
    "model.layers.7.mlp.down_proj": {
        "snr": 182.5116729736328,
        "type": "mlp.down_proj"
    },
    "model.layers.8.mlp.down_proj": {
        "snr": 74.9266128540039,
        "type": "mlp.down_proj"
    },
    "model.layers.9.mlp.down_proj": {
        "snr": 16.474102020263672,
        "type": "mlp.down_proj"
    },
    "model.layers.10.mlp.down_proj": {
        "snr": 55.30583572387695,
        "type": "mlp.down_proj"
    },
    "model.layers.11.mlp.down_proj": {
        "snr": 16.84047508239746,
        "type": "mlp.down_proj"
    },
    "model.layers.12.mlp.down_proj": {
        "snr": 58.62131118774414,
        "type": "mlp.down_proj"
    },
    "model.layers.13.mlp.down_proj": {
        "snr": 11.144298553466797,
        "type": "mlp.down_proj"
    },
    "model.layers.14.mlp.down_proj": {
        "snr": 65.28057098388672,
        "type": "mlp.down_proj"
    },
    "model.layers.15.mlp.down_proj": {
        "snr": 46.701290130615234,
        "type": "mlp.down_proj"
    },
    "model.layers.16.mlp.down_proj": {
        "snr": 14.278325080871582,
        "type": "mlp.down_proj"
    },
    "model.layers.17.mlp.down_proj": {
        "snr": 23.382247924804688,
        "type": "mlp.down_proj"
    },
    "model.layers.18.mlp.down_proj": {
        "snr": 93.8782958984375,
        "type": "mlp.down_proj"
    },
    "model.layers.19.mlp.down_proj": {
        "snr": 49.10498809814453,
        "type": "mlp.down_proj"
    },
    "model.layers.20.mlp.down_proj": {
        "snr": 1277.5101318359375,
        "type": "mlp.down_proj"
    },
    "model.layers.21.mlp.down_proj": {
        "snr": 51.7880859375,
        "type": "mlp.down_proj"
    },
    "model.layers.22.mlp.down_proj": {
        "snr": 143.03504943847656,
        "type": "mlp.down_proj"
    },
    "model.layers.23.mlp.down_proj": {
        "snr": 35.123931884765625,
        "type": "mlp.down_proj"
    },
    "model.layers.24.mlp.down_proj": {
        "snr": 21.403743743896484,
        "type": "mlp.down_proj"
    },
    "model.layers.25.mlp.down_proj": {
        "snr": 10.551352500915527,
        "type": "mlp.down_proj"
    },
    "model.layers.26.mlp.down_proj": {
        "snr": 21.62333869934082,
        "type": "mlp.down_proj"
    },
    "model.layers.27.mlp.down_proj": {
        "snr": 1541.98681640625,
        "type": "mlp.down_proj"
    },
    "model.layers.0.mlp.gate_proj": {
        "snr": 0.04497644677758217,
        "type": "mlp.gate_proj"
    },
    "model.layers.1.mlp.gate_proj": {
        "snr": 0.16878646612167358,
        "type": "mlp.gate_proj"
    },
    "model.layers.2.mlp.gate_proj": {
        "snr": 0.336302250623703,
        "type": "mlp.gate_proj"
    },
    "model.layers.3.mlp.gate_proj": {
        "snr": 3.141293525695801,
        "type": "mlp.gate_proj"
    },
    "model.layers.4.mlp.gate_proj": {
        "snr": 9.098686218261719,
        "type": "mlp.gate_proj"
    },
    "model.layers.5.mlp.gate_proj": {
        "snr": 15.89354419708252,
        "type": "mlp.gate_proj"
    },
    "model.layers.6.mlp.gate_proj": {
        "snr": 60.85503387451172,
        "type": "mlp.gate_proj"
    },
    "model.layers.7.mlp.gate_proj": {
        "snr": 57.53098678588867,
        "type": "mlp.gate_proj"
    },
    "model.layers.8.mlp.gate_proj": {
        "snr": 65.77096557617188,
        "type": "mlp.gate_proj"
    },
    "model.layers.9.mlp.gate_proj": {
        "snr": 10.453179359436035,
        "type": "mlp.gate_proj"
    },
    "model.layers.10.mlp.gate_proj": {
        "snr": 26.94801139831543,
        "type": "mlp.gate_proj"
    },
    "model.layers.11.mlp.gate_proj": {
        "snr": 31.111093521118164,
        "type": "mlp.gate_proj"
    },
    "model.layers.12.mlp.gate_proj": {
        "snr": 43.963191986083984,
        "type": "mlp.gate_proj"
    },
    "model.layers.13.mlp.gate_proj": {
        "snr": 20.690765380859375,
        "type": "mlp.gate_proj"
    },
    "model.layers.14.mlp.gate_proj": {
        "snr": 20.47557258605957,
        "type": "mlp.gate_proj"
    },
    "model.layers.15.mlp.gate_proj": {
        "snr": 42.63906478881836,
        "type": "mlp.gate_proj"
    },
    "model.layers.16.mlp.gate_proj": {
        "snr": 22.11542320251465,
        "type": "mlp.gate_proj"
    },
    "model.layers.17.mlp.gate_proj": {
        "snr": 22.590566635131836,
        "type": "mlp.gate_proj"
    },
    "model.layers.18.mlp.gate_proj": {
        "snr": 81.74773406982422,
        "type": "mlp.gate_proj"
    },
    "model.layers.19.mlp.gate_proj": {
        "snr": 20.872997283935547,
        "type": "mlp.gate_proj"
    },
    "model.layers.20.mlp.gate_proj": {
        "snr": 58.32197952270508,
        "type": "mlp.gate_proj"
    },
    "model.layers.21.mlp.gate_proj": {
        "snr": 22.784095764160156,
        "type": "mlp.gate_proj"
    },
    "model.layers.22.mlp.gate_proj": {
        "snr": 16.935768127441406,
        "type": "mlp.gate_proj"
    },
    "model.layers.23.mlp.gate_proj": {
        "snr": 16.830224990844727,
        "type": "mlp.gate_proj"
    },
    "model.layers.24.mlp.gate_proj": {
        "snr": 19.774564743041992,
        "type": "mlp.gate_proj"
    },
    "model.layers.25.mlp.gate_proj": {
        "snr": 27.770675659179688,
        "type": "mlp.gate_proj"
    },
    "model.layers.26.mlp.gate_proj": {
        "snr": 36.714595794677734,
        "type": "mlp.gate_proj"
    },
    "model.layers.27.mlp.gate_proj": {
        "snr": 51.81637191772461,
        "type": "mlp.gate_proj"
    },
    "model.layers.0.mlp.up_proj": {
        "snr": 0.22425401210784912,
        "type": "mlp.up_proj"
    },
    "model.layers.1.mlp.up_proj": {
        "snr": 0.4456978142261505,
        "type": "mlp.up_proj"
    },
    "model.layers.2.mlp.up_proj": {
        "snr": 1.7769725322723389,
        "type": "mlp.up_proj"
    },
    "model.layers.3.mlp.up_proj": {
        "snr": 17.8966121673584,
        "type": "mlp.up_proj"
    },
    "model.layers.4.mlp.up_proj": {
        "snr": 47.43608856201172,
        "type": "mlp.up_proj"
    },
    "model.layers.5.mlp.up_proj": {
        "snr": 56.2298698425293,
        "type": "mlp.up_proj"
    },
    "model.layers.6.mlp.up_proj": {
        "snr": 173.1498260498047,
        "type": "mlp.up_proj"
    },
    "model.layers.7.mlp.up_proj": {
        "snr": 148.02874755859375,
        "type": "mlp.up_proj"
    },
    "model.layers.8.mlp.up_proj": {
        "snr": 133.5174560546875,
        "type": "mlp.up_proj"
    },
    "model.layers.9.mlp.up_proj": {
        "snr": 83.45183563232422,
        "type": "mlp.up_proj"
    },
    "model.layers.10.mlp.up_proj": {
        "snr": 117.88772583007812,
        "type": "mlp.up_proj"
    },
    "model.layers.11.mlp.up_proj": {
        "snr": 94.41156768798828,
        "type": "mlp.up_proj"
    },
    "model.layers.12.mlp.up_proj": {
        "snr": 130.3107452392578,
        "type": "mlp.up_proj"
    },
    "model.layers.13.mlp.up_proj": {
        "snr": 76.04458618164062,
        "type": "mlp.up_proj"
    },
    "model.layers.14.mlp.up_proj": {
        "snr": 158.59634399414062,
        "type": "mlp.up_proj"
    },
    "model.layers.15.mlp.up_proj": {
        "snr": 143.59596252441406,
        "type": "mlp.up_proj"
    },
    "model.layers.16.mlp.up_proj": {
        "snr": 84.2161636352539,
        "type": "mlp.up_proj"
    },
    "model.layers.17.mlp.up_proj": {
        "snr": 116.55204010009766,
        "type": "mlp.up_proj"
    },
    "model.layers.18.mlp.up_proj": {
        "snr": 176.95449829101562,
        "type": "mlp.up_proj"
    },
    "model.layers.19.mlp.up_proj": {
        "snr": 82.37284088134766,
        "type": "mlp.up_proj"
    },
    "model.layers.20.mlp.up_proj": {
        "snr": 137.5695343017578,
        "type": "mlp.up_proj"
    },
    "model.layers.21.mlp.up_proj": {
        "snr": 89.87335205078125,
        "type": "mlp.up_proj"
    },
    "model.layers.22.mlp.up_proj": {
        "snr": 86.1510238647461,
        "type": "mlp.up_proj"
    },
    "model.layers.23.mlp.up_proj": {
        "snr": 61.37428665161133,
        "type": "mlp.up_proj"
    },
    "model.layers.24.mlp.up_proj": {
        "snr": 45.10757064819336,
        "type": "mlp.up_proj"
    },
    "model.layers.25.mlp.up_proj": {
        "snr": 60.16519546508789,
        "type": "mlp.up_proj"
    },
    "model.layers.26.mlp.up_proj": {
        "snr": 39.96969223022461,
        "type": "mlp.up_proj"
    },
    "model.layers.27.mlp.up_proj": {
        "snr": 48.04258346557617,
        "type": "mlp.up_proj"
    },
    "model.embed_tokens": {
        "snr": 0.08800078183412552,
        "type": "model.embed_tokens"
    },
    "model.norm": {
        "snr": Infinity,
        "type": "model.norm"
    },
    "model.layers.0.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.1.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.2.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.3.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.4.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.5.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.6.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.7.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.8.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.9.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.10.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.11.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.12.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.13.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.14.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.15.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.16.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.17.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.18.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.19.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.20.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.21.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.22.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.23.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.24.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.25.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.26.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.27.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.0.self_attn.k_proj": {
        "snr": 4.764852046966553,
        "type": "self_attn.k_proj"
    },
    "model.layers.1.self_attn.k_proj": {
        "snr": 0.46627077460289,
        "type": "self_attn.k_proj"
    },
    "model.layers.2.self_attn.k_proj": {
        "snr": 1.6155915260314941,
        "type": "self_attn.k_proj"
    },
    "model.layers.3.self_attn.k_proj": {
        "snr": 2.096365451812744,
        "type": "self_attn.k_proj"
    },
    "model.layers.4.self_attn.k_proj": {
        "snr": 1.431254267692566,
        "type": "self_attn.k_proj"
    },
    "model.layers.5.self_attn.k_proj": {
        "snr": 1.7440669536590576,
        "type": "self_attn.k_proj"
    },
    "model.layers.6.self_attn.k_proj": {
        "snr": 1.2815033197402954,
        "type": "self_attn.k_proj"
    },
    "model.layers.7.self_attn.k_proj": {
        "snr": 2.2301025390625,
        "type": "self_attn.k_proj"
    },
    "model.layers.8.self_attn.k_proj": {
        "snr": 1.5116536617279053,
        "type": "self_attn.k_proj"
    },
    "model.layers.9.self_attn.k_proj": {
        "snr": 1.2699830532073975,
        "type": "self_attn.k_proj"
    },
    "model.layers.10.self_attn.k_proj": {
        "snr": 3.3086464405059814,
        "type": "self_attn.k_proj"
    },
    "model.layers.11.self_attn.k_proj": {
        "snr": 1.59111487865448,
        "type": "self_attn.k_proj"
    },
    "model.layers.12.self_attn.k_proj": {
        "snr": 1.1007944345474243,
        "type": "self_attn.k_proj"
    },
    "model.layers.13.self_attn.k_proj": {
        "snr": 1.163416862487793,
        "type": "self_attn.k_proj"
    },
    "model.layers.14.self_attn.k_proj": {
        "snr": 0.9935113787651062,
        "type": "self_attn.k_proj"
    },
    "model.layers.15.self_attn.k_proj": {
        "snr": 1.483581304550171,
        "type": "self_attn.k_proj"
    },
    "model.layers.16.self_attn.k_proj": {
        "snr": 1.2992271184921265,
        "type": "self_attn.k_proj"
    },
    "model.layers.17.self_attn.k_proj": {
        "snr": 2.162485122680664,
        "type": "self_attn.k_proj"
    },
    "model.layers.18.self_attn.k_proj": {
        "snr": 1.3841017484664917,
        "type": "self_attn.k_proj"
    },
    "model.layers.19.self_attn.k_proj": {
        "snr": 1.453418493270874,
        "type": "self_attn.k_proj"
    },
    "model.layers.20.self_attn.k_proj": {
        "snr": 1.781678557395935,
        "type": "self_attn.k_proj"
    },
    "model.layers.21.self_attn.k_proj": {
        "snr": 1.7460925579071045,
        "type": "self_attn.k_proj"
    },
    "model.layers.22.self_attn.k_proj": {
        "snr": 1.3188031911849976,
        "type": "self_attn.k_proj"
    },
    "model.layers.23.self_attn.k_proj": {
        "snr": 1.723441243171692,
        "type": "self_attn.k_proj"
    },
    "model.layers.24.self_attn.k_proj": {
        "snr": 2.585094928741455,
        "type": "self_attn.k_proj"
    },
    "model.layers.25.self_attn.k_proj": {
        "snr": 1.6478856801986694,
        "type": "self_attn.k_proj"
    },
    "model.layers.26.self_attn.k_proj": {
        "snr": 1.3221096992492676,
        "type": "self_attn.k_proj"
    },
    "model.layers.27.self_attn.k_proj": {
        "snr": 0.9034463167190552,
        "type": "self_attn.k_proj"
    },
    "model.layers.0.self_attn.o_proj": {
        "snr": 0.10636883229017258,
        "type": "self_attn.o_proj"
    },
    "model.layers.1.self_attn.o_proj": {
        "snr": 0.24971255660057068,
        "type": "self_attn.o_proj"
    },
    "model.layers.2.self_attn.o_proj": {
        "snr": 0.21437697112560272,
        "type": "self_attn.o_proj"
    },
    "model.layers.3.self_attn.o_proj": {
        "snr": 0.2058248072862625,
        "type": "self_attn.o_proj"
    },
    "model.layers.4.self_attn.o_proj": {
        "snr": 0.21978946030139923,
        "type": "self_attn.o_proj"
    },
    "model.layers.5.self_attn.o_proj": {
        "snr": 0.2269466072320938,
        "type": "self_attn.o_proj"
    },
    "model.layers.6.self_attn.o_proj": {
        "snr": 0.17318543791770935,
        "type": "self_attn.o_proj"
    },
    "model.layers.7.self_attn.o_proj": {
        "snr": 0.23159846663475037,
        "type": "self_attn.o_proj"
    },
    "model.layers.8.self_attn.o_proj": {
        "snr": 0.2400084286928177,
        "type": "self_attn.o_proj"
    },
    "model.layers.9.self_attn.o_proj": {
        "snr": 0.134766086935997,
        "type": "self_attn.o_proj"
    },
    "model.layers.10.self_attn.o_proj": {
        "snr": 0.20152011513710022,
        "type": "self_attn.o_proj"
    },
    "model.layers.11.self_attn.o_proj": {
        "snr": 0.19492347538471222,
        "type": "self_attn.o_proj"
    },
    "model.layers.12.self_attn.o_proj": {
        "snr": 0.18607021868228912,
        "type": "self_attn.o_proj"
    },
    "model.layers.13.self_attn.o_proj": {
        "snr": 0.15107683837413788,
        "type": "self_attn.o_proj"
    },
    "model.layers.14.self_attn.o_proj": {
        "snr": 0.18565276265144348,
        "type": "self_attn.o_proj"
    },
    "model.layers.15.self_attn.o_proj": {
        "snr": 0.20626339316368103,
        "type": "self_attn.o_proj"
    },
    "model.layers.16.self_attn.o_proj": {
        "snr": 0.1541011780500412,
        "type": "self_attn.o_proj"
    },
    "model.layers.17.self_attn.o_proj": {
        "snr": 0.1784645915031433,
        "type": "self_attn.o_proj"
    },
    "model.layers.18.self_attn.o_proj": {
        "snr": 0.18307389318943024,
        "type": "self_attn.o_proj"
    },
    "model.layers.19.self_attn.o_proj": {
        "snr": 0.17449897527694702,
        "type": "self_attn.o_proj"
    },
    "model.layers.20.self_attn.o_proj": {
        "snr": 0.1787375956773758,
        "type": "self_attn.o_proj"
    },
    "model.layers.21.self_attn.o_proj": {
        "snr": 0.161802276968956,
        "type": "self_attn.o_proj"
    },
    "model.layers.22.self_attn.o_proj": {
        "snr": 0.1931520402431488,
        "type": "self_attn.o_proj"
    },
    "model.layers.23.self_attn.o_proj": {
        "snr": 0.14108893275260925,
        "type": "self_attn.o_proj"
    },
    "model.layers.24.self_attn.o_proj": {
        "snr": 0.14064815640449524,
        "type": "self_attn.o_proj"
    },
    "model.layers.25.self_attn.o_proj": {
        "snr": 0.14790543913841248,
        "type": "self_attn.o_proj"
    },
    "model.layers.26.self_attn.o_proj": {
        "snr": 0.11950570344924927,
        "type": "self_attn.o_proj"
    },
    "model.layers.27.self_attn.o_proj": {
        "snr": 0.062389008700847626,
        "type": "self_attn.o_proj"
    },
    "model.layers.0.self_attn.q_proj": {
        "snr": 0.02138795144855976,
        "type": "self_attn.q_proj"
    },
    "model.layers.1.self_attn.q_proj": {
        "snr": 0.14676862955093384,
        "type": "self_attn.q_proj"
    },
    "model.layers.2.self_attn.q_proj": {
        "snr": 0.16297142207622528,
        "type": "self_attn.q_proj"
    },
    "model.layers.3.self_attn.q_proj": {
        "snr": 0.12198334187269211,
        "type": "self_attn.q_proj"
    },
    "model.layers.4.self_attn.q_proj": {
        "snr": 0.13921146094799042,
        "type": "self_attn.q_proj"
    },
    "model.layers.5.self_attn.q_proj": {
        "snr": 0.15567339956760406,
        "type": "self_attn.q_proj"
    },
    "model.layers.6.self_attn.q_proj": {
        "snr": 0.1589033454656601,
        "type": "self_attn.q_proj"
    },
    "model.layers.7.self_attn.q_proj": {
        "snr": 0.195299431681633,
        "type": "self_attn.q_proj"
    },
    "model.layers.8.self_attn.q_proj": {
        "snr": 0.22430908679962158,
        "type": "self_attn.q_proj"
    },
    "model.layers.9.self_attn.q_proj": {
        "snr": 0.2011336237192154,
        "type": "self_attn.q_proj"
    },
    "model.layers.10.self_attn.q_proj": {
        "snr": 0.1982448250055313,
        "type": "self_attn.q_proj"
    },
    "model.layers.11.self_attn.q_proj": {
        "snr": 0.22880099713802338,
        "type": "self_attn.q_proj"
    },
    "model.layers.12.self_attn.q_proj": {
        "snr": 0.22898294031620026,
        "type": "self_attn.q_proj"
    },
    "model.layers.13.self_attn.q_proj": {
        "snr": 0.21394900977611542,
        "type": "self_attn.q_proj"
    },
    "model.layers.14.self_attn.q_proj": {
        "snr": 0.24130398035049438,
        "type": "self_attn.q_proj"
    },
    "model.layers.15.self_attn.q_proj": {
        "snr": 0.21905161440372467,
        "type": "self_attn.q_proj"
    },
    "model.layers.16.self_attn.q_proj": {
        "snr": 0.2319282442331314,
        "type": "self_attn.q_proj"
    },
    "model.layers.17.self_attn.q_proj": {
        "snr": 0.24004821479320526,
        "type": "self_attn.q_proj"
    },
    "model.layers.18.self_attn.q_proj": {
        "snr": 0.22754515707492828,
        "type": "self_attn.q_proj"
    },
    "model.layers.19.self_attn.q_proj": {
        "snr": 0.2086794078350067,
        "type": "self_attn.q_proj"
    },
    "model.layers.20.self_attn.q_proj": {
        "snr": 0.2290779948234558,
        "type": "self_attn.q_proj"
    },
    "model.layers.21.self_attn.q_proj": {
        "snr": 0.250373899936676,
        "type": "self_attn.q_proj"
    },
    "model.layers.22.self_attn.q_proj": {
        "snr": 0.23474709689617157,
        "type": "self_attn.q_proj"
    },
    "model.layers.23.self_attn.q_proj": {
        "snr": 0.20302507281303406,
        "type": "self_attn.q_proj"
    },
    "model.layers.24.self_attn.q_proj": {
        "snr": 0.21992310881614685,
        "type": "self_attn.q_proj"
    },
    "model.layers.25.self_attn.q_proj": {
        "snr": 0.2120121270418167,
        "type": "self_attn.q_proj"
    },
    "model.layers.26.self_attn.q_proj": {
        "snr": 0.24161922931671143,
        "type": "self_attn.q_proj"
    },
    "model.layers.27.self_attn.q_proj": {
        "snr": 0.12337693572044373,
        "type": "self_attn.q_proj"
    },
    "model.layers.0.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.1.self_attn.v_proj": {
        "snr": 231.07347106933594,
        "type": "self_attn.v_proj"
    },
    "model.layers.2.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.3.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.4.self_attn.v_proj": {
        "snr": 22.34870719909668,
        "type": "self_attn.v_proj"
    },
    "model.layers.5.self_attn.v_proj": {
        "snr": 246.30386352539062,
        "type": "self_attn.v_proj"
    },
    "model.layers.6.self_attn.v_proj": {
        "snr": 499.5611572265625,
        "type": "self_attn.v_proj"
    },
    "model.layers.7.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.8.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.9.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.10.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.11.self_attn.v_proj": {
        "snr": 69.09609985351562,
        "type": "self_attn.v_proj"
    },
    "model.layers.12.self_attn.v_proj": {
        "snr": 983.3341674804688,
        "type": "self_attn.v_proj"
    },
    "model.layers.13.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.14.self_attn.v_proj": {
        "snr": 64.04925537109375,
        "type": "self_attn.v_proj"
    },
    "model.layers.15.self_attn.v_proj": {
        "snr": 28.41021728515625,
        "type": "self_attn.v_proj"
    },
    "model.layers.16.self_attn.v_proj": {
        "snr": 724.2736206054688,
        "type": "self_attn.v_proj"
    },
    "model.layers.17.self_attn.v_proj": {
        "snr": 63.35670852661133,
        "type": "self_attn.v_proj"
    },
    "model.layers.18.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.19.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.20.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.21.self_attn.v_proj": {
        "snr": Infinity,
        "type": "self_attn.v_proj"
    },
    "model.layers.22.self_attn.v_proj": {
        "snr": 238.2569122314453,
        "type": "self_attn.v_proj"
    },
    "model.layers.23.self_attn.v_proj": {
        "snr": 111.78319549560547,
        "type": "self_attn.v_proj"
    },
    "model.layers.24.self_attn.v_proj": {
        "snr": 687.0054931640625,
        "type": "self_attn.v_proj"
    },
    "model.layers.25.self_attn.v_proj": {
        "snr": 565.3272705078125,
        "type": "self_attn.v_proj"
    },
    "model.layers.26.self_attn.v_proj": {
        "snr": 4.064513683319092,
        "type": "self_attn.v_proj"
    },
    "model.layers.27.self_attn.v_proj": {
        "snr": 4.335177421569824,
        "type": "self_attn.v_proj"
    }
}


================================================
FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_google-gemma-2-2b.json
================================================
{
    "model.layers.0.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.1.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.2.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.3.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.4.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.5.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.6.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.7.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.8.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.9.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.10.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.11.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.12.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.13.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.14.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.15.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.16.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.17.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.18.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.19.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.20.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.21.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.22.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.23.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.24.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.25.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "lm_head": {
        "snr": 4.538210391998291,
        "type": "lm_head"
    },
    "model.layers.0.mlp.down_proj": {
        "snr": 7.746472358703613,
        "type": "mlp.down_proj"
    },
    "model.layers.1.mlp.down_proj": {
        "snr": 4.3358893394470215,
        "type": "mlp.down_proj"
    },
    "model.layers.2.mlp.down_proj": {
        "snr": 26.88057518005371,
        "type": "mlp.down_proj"
    },
    "model.layers.3.mlp.down_proj": {
        "snr": 8.699942588806152,
        "type": "mlp.down_proj"
    },
    "model.layers.4.mlp.down_proj": {
        "snr": 32.808380126953125,
        "type": "mlp.down_proj"
    },
    "model.layers.5.mlp.down_proj": {
        "snr": 10.831522941589355,
        "type": "mlp.down_proj"
    },
    "model.layers.6.mlp.down_proj": {
        "snr": 18.843679428100586,
        "type": "mlp.down_proj"
    },
    "model.layers.7.mlp.down_proj": {
        "snr": 9.348078727722168,
        "type": "mlp.down_proj"
    },
    "model.layers.8.mlp.down_proj": {
        "snr": 7.061270236968994,
        "type": "mlp.down_proj"
    },
    "model.layers.9.mlp.down_proj": {
        "snr": 5.454320907592773,
        "type": "mlp.down_proj"
    },
    "model.layers.10.mlp.down_proj": {
        "snr": 7.386133193969727,
        "type": "mlp.down_proj"
    },
    "model.layers.11.mlp.down_proj": {
        "snr": 6.648562908172607,
        "type": "mlp.down_proj"
    },
    "model.layers.12.mlp.down_proj": {
        "snr": 5.853652477264404,
        "type": "mlp.down_proj"
    },
    "model.layers.13.mlp.down_proj": {
        "snr": 8.570493698120117,
        "type": "mlp.down_proj"
    },
    "model.layers.14.mlp.down_proj": {
        "snr": 13.120837211608887,
        "type": "mlp.down_proj"
    },
    "model.layers.15.mlp.down_proj": {
        "snr": 14.780969619750977,
        "type": "mlp.down_proj"
    },
    "model.layers.16.mlp.down_proj": {
        "snr": 6.953134059906006,
        "type": "mlp.down_proj"
    },
    "model.layers.17.mlp.down_proj": {
        "snr": 12.589436531066895,
        "type": "mlp.down_proj"
    },
    "model.layers.18.mlp.down_proj": {
        "snr": 8.844094276428223,
        "type": "mlp.down_proj"
    },
    "model.layers.19.mlp.down_proj": {
        "snr": 7.598869800567627,
        "type": "mlp.down_proj"
    },
    "model.layers.20.mlp.down_proj": {
        "snr": 11.293925285339355,
        "type": "mlp.down_proj"
    },
    "model.layers.21.mlp.down_proj": {
        "snr": 9.384604454040527,
        "type": "mlp.down_proj"
    },
    "model.layers.22.mlp.down_proj": {
        "snr": 12.12533187866211,
        "type": "mlp.down_proj"
    },
    "model.layers.23.mlp.down_proj": {
        "snr": 11.217570304870605,
        "type": "mlp.down_proj"
    },
    "model.layers.24.mlp.down_proj": {
        "snr": 14.197714805603027,
        "type": "mlp.down_proj"
    },
    "model.layers.25.mlp.down_proj": {
        "snr": 12.449926376342773,
        "type": "mlp.down_proj"
    },
    "model.layers.0.mlp.gate_proj": {
        "snr": 16.885862350463867,
        "type": "mlp.gate_proj"
    },
    "model.layers.1.mlp.gate_proj": {
        "snr": 23.410266876220703,
        "type": "mlp.gate_proj"
    },
    "model.layers.2.mlp.gate_proj": {
        "snr": 22.57662582397461,
        "type": "mlp.gate_proj"
    },
    "model.layers.3.mlp.gate_proj": {
        "snr": 17.29996681213379,
        "type": "mlp.gate_proj"
    },
    "model.layers.4.mlp.gate_proj": {
        "snr": 11.718637466430664,
        "type": "mlp.gate_proj"
    },
    "model.layers.5.mlp.gate_proj": {
        "snr": 6.376136779785156,
        "type": "mlp.gate_proj"
    },
    "model.layers.6.mlp.gate_proj": {
        "snr": 6.794021129608154,
        "type": "mlp.gate_proj"
    },
    "model.layers.7.mlp.gate_proj": {
        "snr": 3.2425343990325928,
        "type": "mlp.gate_proj"
    },
    "model.layers.8.mlp.gate_proj": {
        "snr": 2.368421792984009,
        "type": "mlp.gate_proj"
    },
    "model.layers.9.mlp.gate_proj": {
        "snr": 3.3193087577819824,
        "type": "mlp.gate_proj"
    },
    "model.layers.10.mlp.gate_proj": {
        "snr": 3.9515960216522217,
        "type": "mlp.gate_proj"
    },
    "model.layers.11.mlp.gate_proj": {
        "snr": 3.2761318683624268,
        "type": "mlp.gate_proj"
    },
    "model.layers.12.mlp.gate_proj": {
        "snr": 4.026322841644287,
        "type": "mlp.gate_proj"
    },
    "model.layers.13.mlp.gate_proj": {
        "snr": 3.415473699569702,
        "type": "mlp.gate_proj"
    },
    "model.layers.14.mlp.gate_proj": {
        "snr": 3.3418092727661133,
        "type": "mlp.gate_proj"
    },
    "model.layers.15.mlp.gate_proj": {
        "snr": 3.6233012676239014,
        "type": "mlp.gate_proj"
    },
    "model.layers.16.mlp.gate_proj": {
        "snr": 3.2199010848999023,
        "type": "mlp.gate_proj"
    },
    "model.layers.17.mlp.gate_proj": {
        "snr": 3.6848936080932617,
        "type": "mlp.gate_proj"
    },
    "model.layers.18.mlp.gate_proj": {
        "snr": 3.4439642429351807,
        "type": "mlp.gate_proj"
    },
    "model.layers.19.mlp.gate_proj": {
        "snr": 3.7366604804992676,
        "type": "mlp.gate_proj"
    },
    "model.layers.20.mlp.gate_proj": {
        "snr": 4.262336254119873,
        "type": "mlp.gate_proj"
    },
    "model.layers.21.mlp.gate_proj": {
        "snr": 4.333253860473633,
        "type": "mlp.gate_proj"
    },
    "model.layers.22.mlp.gate_proj": {
        "snr": 3.640247344970703,
        "type": "mlp.gate_proj"
    },
    "model.layers.23.mlp.gate_proj": {
        "snr": 4.2978034019470215,
        "type": "mlp.gate_proj"
    },
    "model.layers.24.mlp.gate_proj": {
        "snr": 4.339972496032715,
        "type": "mlp.gate_proj"
    },
    "model.layers.25.mlp.gate_proj": {
        "snr": 3.8502564430236816,
        "type": "mlp.gate_proj"
    },
    "model.layers.0.mlp.up_proj": {
        "snr": 28.129924774169922,
        "type": "mlp.up_proj"
    },
    "model.layers.1.mlp.up_proj": {
        "snr": 41.49960708618164,
        "type": "mlp.up_proj"
    },
    "model.layers.2.mlp.up_proj": {
        "snr": 125.47801971435547,
        "type": "mlp.up_proj"
    },
    "model.layers.3.mlp.up_proj": {
        "snr": 119.93355560302734,
        "type": "mlp.up_proj"
    },
    "model.layers.4.mlp.up_proj": {
        "snr": 162.62631225585938,
        "type": "mlp.up_proj"
    },
    "model.layers.5.mlp.up_proj": {
        "snr": 32.36909484863281,
        "type": "mlp.up_proj"
    },
    "model.layers.6.mlp.up_proj": {
        "snr": 49.10078430175781,
        "type": "mlp.up_proj"
    },
    "model.layers.7.mlp.up_proj": {
        "snr": 28.541580200195312,
        "type": "mlp.up_proj"
    },
    "model.layers.8.mlp.up_proj": {
        "snr": 14.764090538024902,
        "type": "mlp.up_proj"
    },
    "model.layers.9.mlp.up_proj": {
        "snr": 16.5697078704834,
        "type": "mlp.up_proj"
    },
    "model.layers.10.mlp.up_proj": {
        "snr": 19.26059913635254,
        "type": "mlp.up_proj"
    },
    "model.layers.11.mlp.up_proj": {
        "snr": 15.082040786743164,
        "type": "mlp.up_proj"
    },
    "model.layers.12.mlp.up_proj": {
        "snr": 15.5792875289917,
        "type": "mlp.up_proj"
    },
    "model.layers.13.mlp.up_proj": {
        "snr": 9.84595012664795,
        "type": "mlp.up_proj"
    },
    "model.layers.14.mlp.up_proj": {
        "snr": 11.506875991821289,
        "type": "mlp.up_proj"
    },
    "model.layers.15.mlp.up_proj": {
        "snr": 21.507600784301758,
        "type": "mlp.up_proj"
    },
    "model.layers.16.mlp.up_proj": {
        "snr": 15.110466957092285,
        "type": "mlp.up_proj"
    },
    "model.layers.17.mlp.up_proj": {
        "snr": 27.062183380126953,
        "type": "mlp.up_proj"
    },
    "model.layers.18.mlp.up_proj": {
        "snr": 16.40383529663086,
        "type": "mlp.up_proj"
    },
    "model.layers.19.mlp.up_proj": {
        "snr": 13.117464065551758,
        "type": "mlp.up_proj"
    },
    "model.layers.20.mlp.up_proj": {
        "snr": 11.393353462219238,
        "type": "mlp.up_proj"
    },
    "model.layers.21.mlp.up_proj": {
        "snr": 10.791608810424805,
        "type": "mlp.up_proj"
    },
    "model.layers.22.mlp.up_proj": {
        "snr": 7.512388706207275,
        "type": "mlp.up_proj"
    },
    "model.layers.23.mlp.up_proj": {
        "snr": 9.889434814453125,
        "type": "mlp.up_proj"
    },
    "model.layers.24.mlp.up_proj": {
        "snr": 7.587779521942139,
        "type": "mlp.up_proj"
    },
    "model.layers.25.mlp.up_proj": {
        "snr": 4.561068058013916,
        "type": "mlp.up_proj"
    },
    "model.embed_tokens": {
        "snr": 4.538210391998291,
        "type": "model.embed_tokens"
    },
    "model.norm": {
        "snr": Infinity,
        "type": "model.norm"
    },
    "model.layers.0.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.1.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.2.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.3.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.4.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.5.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.6.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.7.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.8.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.9.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.10.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.11.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.12.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.13.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.14.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.15.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.16.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.17.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.18.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.19.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.20.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.21.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.22.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.23.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.24.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.25.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.0.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.1.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.2.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.3.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.4.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.5.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.6.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.7.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.8.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.9.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.10.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.11.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.12.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.13.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.14.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.15.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.16.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.17.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.18.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.19.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.20.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.21.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.22.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.23.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.24.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.25.post_feedforward_layernorm": {
        "snr": Infinity,
        "type": "post_feedforward_layernorm"
    },
    "model.layers.0.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.1.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.2.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.3.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.4.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.5.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.6.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.7.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.8.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.9.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.10.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.11.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.12.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.13.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.14.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.15.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.16.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.17.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.18.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.19.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.20.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.21.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.22.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.23.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.24.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.25.pre_feedforward_layernorm": {
        "snr": Infinity,
        "type": "pre_feedforward_layernorm"
    },
    "model.layers.0.self_attn.k_proj": {
        "snr": 0.5685535073280334,
        "type": "self_attn.k_proj"
    },
    "model.layers.1.self_attn.k_proj": {
        "snr": 1.060130000114441,
        "type": "self_attn.k_proj"
    },
    "model.layers.2.self_attn.k_proj": {
        "snr": 1.0735561847686768,
        "type": "self_attn.k_proj"
    },
    "model.layers.3.self_attn.k_proj": {
        "snr": 1.0217311382293701,
        "type": "self_attn.k_proj"
    },
    "model.layers.4.self_attn.k_proj": {
        "snr": 0.9687430262565613,
        "type": "self_attn.k_proj"
    },
    "model.layers.5.self_attn.k_proj": {
        "snr": 0.8411160111427307,
        "type": "self_attn.k_proj"
    },
    "model.layers.6.self_attn.k_proj": {
        "snr": 0.936741054058075,
        "type": "self_attn.k_proj"
    },
    "model.layers.7.self_attn.k_proj": {
        "snr": 0.7236003279685974,
        "type": "self_attn.k_proj"
    },
    "model.layers.8.self_attn.k_proj": {
        "snr": 0.9032857418060303,
        "type": "self_attn.k_proj"
    },
    "model.layers.9.self_attn.k_proj": {
        "snr": 0.7513307929039001,
        "type": "self_attn.k_proj"
    },
    "model.layers.10.self_attn.k_proj": {
        "snr": 0.6875415444374084,
        "type": "self_attn.k_proj"
    },
    "model.layers.11.self_attn.k_proj": {
        "snr": 0.6611058712005615,
        "type": "self_attn.k_proj"
    },
    "model.layers.12.self_attn.k_proj": {
        "snr": 0.8023670315742493,
        "type": "self_attn.k_proj"
    },
    "model.layers.13.self_attn.k_proj": {
        "snr": 0.7188767194747925,
        "type": "self_attn.k_proj"
    },
    "model.layers.14.self_attn.k_proj": {
        "snr": 0.7930117249488831,
        "type": "self_attn.k_proj"
    },
    "model.layers.15.self_attn.k_proj": {
        "snr": 0.9076258540153503,
        "type": "self_attn.k_proj"
    },
    "model.layers.16.self_attn.k_proj": {
        "snr": 0.7295113801956177,
        "type": "self_attn.k_proj"
    },
    "model.layers.17.self_attn.k_proj": {
        "snr": 0.898467481136322,
        "type": "self_attn.k_proj"
    },
    "model.layers.18.self_attn.k_proj": {
        "snr": 0.9652048945426941,
        "type": "self_attn.k_proj"
    },
    "model.layers.19.self_attn.k_proj": {
        "snr": 0.9855819344520569,
        "type": "self_attn.k_proj"
    },
    "model.layers.20.self_attn.k_proj": {
        "snr": 1.2863355875015259,
        "type": "self_attn.k_proj"
    },
    "model.layers.21.self_attn.k_proj": {
        "snr": 1.116607904434204,
        "type": "self_attn.k_proj"
    },
    "model.layers.22.self_attn.k_proj": {
        "snr": 0.7438228130340576,
        "type": "self_attn.k_proj"
    },
    "model.layers.23.self_attn.k_proj": {
        "snr": 0.8499895334243774,
        "type": "self_attn.k_proj"
    },
    "model.layers.24.self_attn.k_proj": {
        "snr": 0.7764042019844055,
        "type": "self_attn.k_proj"
    },
    "model.layers.25.self_attn.k_proj": {
        "snr": 0.7127887606620789,
        "type": "self_attn.k_proj"
    },
    "model.layers.0.self_attn.o_proj": {
        "snr": 0.2556447386741638,
        "type": "self_attn.o_proj"
    },
    "model.layers.1.self_attn.o_proj": {
        "snr": 0.2930974066257477,
        "type": "self_attn.o_proj"
    },
    "model.layers.2.self_attn.o_proj": {
        "snr": 0.27571651339530945,
        "type": "self_attn.o_proj"
    },
    "model.layers.3.self_attn.o_proj": {
        "snr": 0.280631959438324,
        "type": "self_attn.o_proj"
    },
    "model.layers.4.self_attn.o_proj": {
        "snr": 0.2958097755908966,
        "type": "self_attn.o_proj"
    },
    "model.layers.5.self_attn.o_proj": {
        "snr": 0.3072899580001831,
        "type": "self_attn.o_proj"
    },
    "model.layers.6.self_attn.o_proj": {
        "snr": 0.31374114751815796,
        "type": "self_attn.o_proj"
    },
    "model.layers.7.self_attn.o_proj": {
        "snr": 0.2903076410293579,
        "type": "self_attn.o_proj"
    },
    "model.layers.8.self_attn.o_proj": {
        "snr": 0.2625811696052551,
        "type": "self_attn.o_proj"
    },
    "model.layers.9.self_attn.o_proj": {
        "snr": 0.2306082546710968,
        "type": "self_attn.o_proj"
    },
    "model.layers.10.self_attn.o_proj": {
        "snr": 0.24869701266288757,
        "type": "self_attn.o_proj"
    },
    "model.layers.11.self_attn.o_proj": {
        "snr": 0.2556127905845642,
        "type": "self_attn.o_proj"
    },
    "model.layers.12.self_attn.o_proj": {
        "snr": 0.28926730155944824,
        "type": "self_attn.o_proj"
    },
    "model.layers.13.self_attn.o_proj": {
        "snr": 0.25355643033981323,
        "type": "self_attn.o_proj"
    },
    "model.layers.14.self_attn.o_proj": {
        "snr": 0.23122912645339966,
        "type": "self_attn.o_proj"
    },
    "model.layers.15.self_attn.o_proj": {
        "snr": 0.28772857785224915,
        "type": "self_attn.o_proj"
    },
    "model.layers.16.self_attn.o_proj": {
        "snr": 0.22682352364063263,
        "type": "self_attn.o_proj"
    },
    "model.layers.17.self_attn.o_proj": {
        "snr": 0.2558597922325134,
        "type": "self_attn.o_proj"
    },
    "model.layers.18.self_attn.o_proj": {
        "snr": 0.1773315966129303,
        "type": "self_attn.o_proj"
    },
    "model.layers.19.self_attn.o_proj": {
        "snr": 0.2106105089187622,
        "type": "self_attn.o_proj"
    },
    "model.layers.20.self_attn.o_proj": {
        "snr": 0.2008877396583557,
        "type": "self_attn.o_proj"
    },
    "model.layers.21.self_attn.o_proj": {
        "snr": 0.1973956972360611,
        "type": "self_attn.o_proj"
    },
    "model.layers.22.self_attn.o_proj": {
        "snr": 0.25533634424209595,
        "type": "self_attn.o_proj"
    },
    "model.layers.23.self_attn.o_proj": {
        "snr": 0.20066529512405396,
        "type": "self_attn.o_proj"
    },
    "model.layers.24.self_attn.o_proj": {
        "snr": 0.18342143297195435,
        "type": "self_attn.o_proj"
    },
    "model.layers.25.self_attn.o_proj": {
        "snr": 0.3224162459373474,
        "type": "self_attn.o_proj"
    },
    "model.layers.0.self_attn.q_proj": {
        "snr": 0.2074502408504486,
        "type": "self_attn.q_proj"
    },
    "model.layers.1.self_attn.q_proj": {
        "snr": 0.33233126997947693,
        "type": "self_attn.q_proj"
    },
    "model.layers.2.self_attn.q_proj": {
        "snr": 0.3586291968822479,
        "type": "self_attn.q_proj"
    },
    "model.layers.3.self_attn.q_proj": {
        "snr": 0.2850974202156067,
        "type": "self_attn.q_proj"
    },
    "model.layers.4.self_attn.q_proj": {
        "snr": 0.37816473841667175,
        "type": "self_attn.q_proj"
    },
    "model.layers.5.self_attn.q_proj": {
        "snr": 0.31616899371147156,
        "type": "self_attn.q_proj"
    },
    "model.layers.6.self_attn.q_proj": {
        "snr": 0.4988365173339844,
        "type": "self_attn.q_proj"
    },
    "model.layers.7.self_attn.q_proj": {
        "snr": 0.4238639175891876,
        "type": "self_attn.q_proj"
    },
    "model.layers.8.self_attn.q_proj": {
        "snr": 0.2674674689769745,
        "type": "self_attn.q_proj"
    },
    "model.layers.9.self_attn.q_proj": {
        "snr": 0.34524214267730713,
        "type": "self_attn.q_proj"
    },
    "model.layers.10.self_attn.q_proj": {
        "snr": 0.4472109377384186,
        "type": "self_attn.q_proj"
    },
    "model.layers.11.self_attn.q_proj": {
        "snr": 0.41363632678985596,
        "type": "self_attn.q_proj"
    },
    "model.layers.12.self_attn.q_proj": {
        "snr": 0.44623735547065735,
        "type": "self_attn.q_proj"
    },
    "model.layers.13.self_attn.q_proj": {
        "snr": 0.4404333531856537,
        "type": "self_attn.q_proj"
    },
    "model.layers.14.self_attn.q_proj": {
        "snr": 0.5200268626213074,
        "type": "self_attn.q_proj"
    },
    "model.layers.15.self_attn.q_proj": {
        "snr": 0.4320363700389862,
        "type": "self_attn.q_proj"
    },
    "model.layers.16.self_attn.q_proj": {
        "snr": 0.46235284209251404,
        "type": "self_attn.q_proj"
    },
    "model.layers.17.self_attn.q_proj": {
        "snr": 0.47477203607559204,
        "type": "self_attn.q_proj"
    },
    "model.layers.18.self_attn.q_proj": {
        "snr": 0.4001321494579315,
        "type": "self_attn.q_proj"
    },
    "model.layers.19.self_attn.q_proj": {
        "snr": 0.42365774512290955,
        "type": "self_attn.q_proj"
    },
    "model.layers.20.self_attn.q_proj": {
        "snr": 0.37057873606681824,
        "type": "self_attn.q_proj"
    },
    "model.layers.21.self_attn.q_proj": {
        "snr": 0.3990235924720764,
        "type": "self_attn.q_proj"
    },
    "model.layers.22.self_attn.q_proj": {
        "snr": 0.35094162821769714,
        "type": "self_attn.q_proj"
    },
    "model.layers.23.self_attn.q_proj": {
        "snr": 0.35721710324287415,
        "type": "self_attn.q_proj"
    },
    "model.layers.24.self_attn.q_proj": {
        "snr": 0.2812618315219879,
        "type": "self_attn.q_proj"
    },
    "model.layers.25.self_attn.q_proj": {
        "snr": 0.19463211297988892,
        "type": "self_attn.q_proj"
    },
    "model.layers.0.self_attn.v_proj": {
        "snr": 1.3365743160247803,
        "type": "self_attn.v_proj"
    },
    "model.layers.1.self_attn.v_proj": {
        "snr": 2.402009963989258,
        "type": "self_attn.v_proj"
    },
    "model.layers.2.self_attn.v_proj": {
        "snr": 3.8695859909057617,
        "type": "self_attn.v_proj"
    },
    "model.layers.3.self_attn.v_proj": {
        "snr": 4.117948055267334,
        "type": "self_attn.v_proj"
    },
    "model.layers.4.self_attn.v_proj": {
        "snr": 5.651231288909912,
        "type": "self_attn.v_proj"
    },
    "model.layers.5.self_attn.v_proj": {
        "snr": 2.720799446105957,
        "type": "self_attn.v_proj"
    },
    "model.layers.6.self_attn.v_proj": {
        "snr": 1.4446897506713867,
        "type": "self_attn.v_proj"
    },
    "model.layers.7.self_attn.v_proj": {
        "snr": 4.497112274169922,
        "type": "self_attn.v_proj"
    },
    "model.layers.8.self_attn.v_proj": {
        "snr": 1.7241870164871216,
        "type": "self_attn.v_proj"
    },
    "model.layers.9.self_attn.v_proj": {
        "snr": 1.7104988098144531,
        "type": "self_attn.v_proj"
    },
    "model.layers.10.self_attn.v_proj": {
        "snr": 1.4231206178665161,
        "type": "self_attn.v_proj"
    },
    "model.layers.11.self_attn.v_proj": {
        "snr": 2.1643989086151123,
        "type": "self_attn.v_proj"
    },
    "model.layers.12.self_attn.v_proj": {
        "snr": 1.5254249572753906,
        "type": "self_attn.v_proj"
    },
    "model.layers.13.self_attn.v_proj": {
        "snr": 2.3788745403289795,
        "type": "self_attn.v_proj"
    },
    "model.layers.14.self_attn.v_proj": {
        "snr": 3.4155967235565186,
        "type": "self_attn.v_proj"
    },
    "model.layers.15.self_attn.v_proj": {
        "snr": 4.623549938201904,
        "type": "self_attn.v_proj"
    },
    "model.layers.16.self_attn.v_proj": {
        "snr": 1.5291141271591187,
        "type": "self_attn.v_proj"
    },
    "model.layers.17.self_attn.v_proj": {
        "snr": 3.9934189319610596,
        "type": "self_attn.v_proj"
    },
    "model.layers.18.self_attn.v_proj": {
        "snr": 9.035382270812988,
        "type": "self_attn.v_proj"
    },
    "model.layers.19.self_attn.v_proj": {
        "snr": 5.8578925132751465,
        "type": "self_attn.v_proj"
    },
    "model.layers.20.self_attn.v_proj": {
        "snr": 3.759958505630493,
        "type": "self_attn.v_proj"
    },
    "model.layers.21.self_attn.v_proj": {
        "snr": 4.558528900146484,
        "type": "self_attn.v_proj"
    },
    "model.layers.22.self_attn.v_proj": {
        "snr": 0.9163281917572021,
        "type": "self_attn.v_proj"
    },
    "model.layers.23.self_attn.v_proj": {
        "snr": 2.564377546310425,
        "type": "self_attn.v_proj"
    },
    "model.layers.24.self_attn.v_proj": {
        "snr": 3.689103841781616,
        "type": "self_attn.v_proj"
    },
    "model.layers.25.self_attn.v_proj": {
        "snr": 5.6444854736328125,
        "type": "self_attn.v_proj"
    }
}


================================================
FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-1B-Instruct.json
================================================
{
    "model.layers.0.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.1.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.2.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.3.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.4.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.5.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.6.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.7.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.8.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.9.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.10.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.11.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.12.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.13.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.14.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.15.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "lm_head": {
        "snr": Infinity,
        "type": "lm_head"
    },
    "model.layers.0.mlp.down_proj": {
        "snr": 70.0594253540039,
        "type": "mlp.down_proj"
    },
    "model.layers.1.mlp.down_proj": {
        "snr": 11.135851860046387,
        "type": "mlp.down_proj"
    },
    "model.layers.2.mlp.down_proj": {
        "snr": 7.035482883453369,
        "type": "mlp.down_proj"
    },
    "model.layers.3.mlp.down_proj": {
        "snr": 6.422532081604004,
        "type": "mlp.down_proj"
    },
    "model.layers.4.mlp.down_proj": {
        "snr": 5.748020172119141,
        "type": "mlp.down_proj"
    },
    "model.layers.5.mlp.down_proj": {
        "snr": 3.885556697845459,
        "type": "mlp.down_proj"
    },
    "model.layers.6.mlp.down_proj": {
        "snr": 3.4336745738983154,
        "type": "mlp.down_proj"
    },
    "model.layers.7.mlp.down_proj": {
        "snr": 2.791595935821533,
        "type": "mlp.down_proj"
    },
    "model.layers.8.mlp.down_proj": {
        "snr": 5.36277961730957,
        "type": "mlp.down_proj"
    },
    "model.layers.9.mlp.down_proj": {
        "snr": 4.459208011627197,
        "type": "mlp.down_proj"
    },
    "model.layers.10.mlp.down_proj": {
        "snr": 6.272170066833496,
        "type": "mlp.down_proj"
    },
    "model.layers.11.mlp.down_proj": {
        "snr": 5.264761447906494,
        "type": "mlp.down_proj"
    },
    "model.layers.12.mlp.down_proj": {
        "snr": 4.324735641479492,
        "type": "mlp.down_proj"
    },
    "model.layers.13.mlp.down_proj": {
        "snr": 3.878648042678833,
        "type": "mlp.down_proj"
    },
    "model.layers.14.mlp.down_proj": {
        "snr": 2.9773054122924805,
        "type": "mlp.down_proj"
    },
    "model.layers.15.mlp.down_proj": {
        "snr": 4.471445560455322,
        "type": "mlp.down_proj"
    },
    "model.layers.0.mlp.gate_proj": {
        "snr": 25.227100372314453,
        "type": "mlp.gate_proj"
    },
    "model.layers.1.mlp.gate_proj": {
        "snr": 6.58299446105957,
        "type": "mlp.gate_proj"
    },
    "model.layers.2.mlp.gate_proj": {
        "snr": 3.4688243865966797,
        "type": "mlp.gate_proj"
    },
    "model.layers.3.mlp.gate_proj": {
        "snr": 1.555246114730835,
        "type": "mlp.gate_proj"
    },
    "model.layers.4.mlp.gate_proj": {
        "snr": 0.7770601511001587,
        "type": "mlp.gate_proj"
    },
    "model.layers.5.mlp.gate_proj": {
        "snr": 0.6239906549453735,
        "type": "mlp.gate_proj"
    },
    "model.layers.6.mlp.gate_proj": {
        "snr": 0.6440379023551941,
        "type": "mlp.gate_proj"
    },
    "model.layers.7.mlp.gate_proj": {
        "snr": 0.5120116472244263,
        "type": "mlp.gate_proj"
    },
    "model.layers.8.mlp.gate_proj": {
        "snr": 0.6544050574302673,
        "type": "mlp.gate_proj"
    },
    "model.layers.9.mlp.gate_proj": {
        "snr": 0.5381016731262207,
        "type": "mlp.gate_proj"
    },
    "model.layers.10.mlp.gate_proj": {
        "snr": 0.622873842716217,
        "type": "mlp.gate_proj"
    },
    "model.layers.11.mlp.gate_proj": {
        "snr": 0.9361700415611267,
        "type": "mlp.gate_proj"
    },
    "model.layers.12.mlp.gate_proj": {
        "snr": 1.475605845451355,
        "type": "mlp.gate_proj"
    },
    "model.layers.13.mlp.gate_proj": {
        "snr": 1.608325719833374,
        "type": "mlp.gate_proj"
    },
    "model.layers.14.mlp.gate_proj": {
        "snr": 1.0720024108886719,
        "type": "mlp.gate_proj"
    },
    "model.layers.15.mlp.gate_proj": {
        "snr": 0.7111338973045349,
        "type": "mlp.gate_proj"
    },
    "model.layers.0.mlp.up_proj": {
        "snr": 28.431896209716797,
        "type": "mlp.up_proj"
    },
    "model.layers.1.mlp.up_proj": {
        "snr": 15.546019554138184,
        "type": "mlp.up_proj"
    },
    "model.layers.2.mlp.up_proj": {
        "snr": 23.048023223876953,
        "type": "mlp.up_proj"
    },
    "model.layers.3.mlp.up_proj": {
        "snr": 25.790977478027344,
        "type": "mlp.up_proj"
    },
    "model.layers.4.mlp.up_proj": {
        "snr": 18.552549362182617,
        "type": "mlp.up_proj"
    },
    "model.layers.5.mlp.up_proj": {
        "snr": 8.85106372833252,
        "type": "mlp.up_proj"
    },
    "model.layers.6.mlp.up_proj": {
        "snr": 10.653799057006836,
        "type": "mlp.up_proj"
    },
    "model.layers.7.mlp.up_proj": {
        "snr": 7.365357875823975,
        "type": "mlp.up_proj"
    },
    "model.layers.8.mlp.up_proj": {
        "snr": 11.98373794555664,
        "type": "mlp.up_proj"
    },
    "model.layers.9.mlp.up_proj": {
        "snr": 8.04493236541748,
        "type": "mlp.up_proj"
    },
    "model.layers.10.mlp.up_proj": {
        "snr": 8.523039817810059,
        "type": "mlp.up_proj"
    },
    "model.layers.11.mlp.up_proj": {
        "snr": 5.381742477416992,
        "type": "mlp.up_proj"
    },
    "model.layers.12.mlp.up_proj": {
        "snr": 3.9845118522644043,
        "type": "mlp.up_proj"
    },
    "model.layers.13.mlp.up_proj": {
        "snr": 3.4893221855163574,
        "type": "mlp.up_proj"
    },
    "model.layers.14.mlp.up_proj": {
        "snr": 1.764201045036316,
        "type": "mlp.up_proj"
    },
    "model.layers.15.mlp.up_proj": {
        "snr": 0.9730708599090576,
        "type": "mlp.up_proj"
    },
    "model.embed_tokens": {
        "snr": Infinity,
        "type": "model.embed_tokens"
    },
    "model.norm": {
        "snr": Infinity,
        "type": "model.norm"
    },
    "model.layers.0.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.1.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.2.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.3.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.4.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.5.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.6.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.7.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.8.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.9.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.10.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.11.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.12.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.13.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.14.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.15.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.0.self_attn.k_proj": {
        "snr": 0.11727584153413773,
        "type": "self_attn.k_proj"
    },
    "model.layers.1.self_attn.k_proj": {
        "snr": 0.24786807596683502,
        "type": "self_attn.k_proj"
    },
    "model.layers.2.self_attn.k_proj": {
        "snr": 0.36378130316734314,
        "type": "self_attn.k_proj"
    },
    "model.layers.3.self_attn.k_proj": {
        "snr": 0.2983120381832123,
        "type": "self_attn.k_proj"
    },
    "model.layers.4.self_attn.k_proj": {
        "snr": 0.33789733052253723,
        "type": "self_attn.k_proj"
    },
    "model.layers.5.self_attn.k_proj": {
        "snr": 0.29155924916267395,
        "type": "self_attn.k_proj"
    },
    "model.layers.6.self_attn.k_proj": {
        "snr": 0.2537297010421753,
        "type": "self_attn.k_proj"
    },
    "model.layers.7.self_attn.k_proj": {
        "snr": 0.28204113245010376,
        "type": "self_attn.k_proj"
    },
    "model.layers.8.self_attn.k_proj": {
        "snr": 0.2776711583137512,
        "type": "self_attn.k_proj"
    },
    "model.layers.9.self_attn.k_proj": {
        "snr": 0.2927376627922058,
        "type": "self_attn.k_proj"
    },
    "model.layers.10.self_attn.k_proj": {
        "snr": 0.31486213207244873,
        "type": "self_attn.k_proj"
    },
    "model.layers.11.self_attn.k_proj": {
        "snr": 0.32363659143447876,
        "type": "self_attn.k_proj"
    },
    "model.layers.12.self_attn.k_proj": {
        "snr": 0.31382912397384644,
        "type": "self_attn.k_proj"
    },
    "model.layers.13.self_attn.k_proj": {
        "snr": 0.4635234773159027,
        "type": "self_attn.k_proj"
    },
    "model.layers.14.self_attn.k_proj": {
        "snr": 0.25379249453544617,
        "type": "self_attn.k_proj"
    },
    "model.layers.15.self_attn.k_proj": {
        "snr": 0.2628238797187805,
        "type": "self_attn.k_proj"
    },
    "model.layers.0.self_attn.o_proj": {
        "snr": 0.27602291107177734,
        "type": "self_attn.o_proj"
    },
    "model.layers.1.self_attn.o_proj": {
        "snr": 0.2149604707956314,
        "type": "self_attn.o_proj"
    },
    "model.layers.2.self_attn.o_proj": {
        "snr": 0.2540294826030731,
        "type": "self_attn.o_proj"
    },
    "model.layers.3.self_attn.o_proj": {
        "snr": 0.27978822588920593,
        "type": "self_attn.o_proj"
    },
    "model.layers.4.self_attn.o_proj": {
        "snr": 0.3121289908885956,
        "type": "self_attn.o_proj"
    },
    "model.layers.5.self_attn.o_proj": {
        "snr": 0.35037684440612793,
        "type": "self_attn.o_proj"
    },
    "model.layers.6.self_attn.o_proj": {
        "snr": 0.366205096244812,
        "type": "self_attn.o_proj"
    },
    "model.layers.7.self_attn.o_proj": {
        "snr": 0.3692712187767029,
        "type": "self_attn.o_proj"
    },
    "model.layers.8.self_attn.o_proj": {
        "snr": 0.3301038146018982,
        "type": "self_attn.o_proj"
    },
    "model.layers.9.self_attn.o_proj": {
        "snr": 0.3003396987915039,
        "type": "self_attn.o_proj"
    },
    "model.layers.10.self_attn.o_proj": {
        "snr": 0.30804169178009033,
        "type": "self_attn.o_proj"
    },
    "model.layers.11.self_attn.o_proj": {
        "snr": 0.28501132130622864,
        "type": "self_attn.o_proj"
    },
    "model.layers.12.self_attn.o_proj": {
        "snr": 0.2171541005373001,
        "type": "self_attn.o_proj"
    },
    "model.layers.13.self_attn.o_proj": {
        "snr": 0.19183959066867828,
        "type": "self_attn.o_proj"
    },
    "model.layers.14.self_attn.o_proj": {
        "snr": 0.19215913116931915,
        "type": "self_attn.o_proj"
    },
    "model.layers.15.self_attn.o_proj": {
        "snr": 0.25486502051353455,
        "type": "self_attn.o_proj"
    },
    "model.layers.0.self_attn.q_proj": {
        "snr": 0.03850084915757179,
        "type": "self_attn.q_proj"
    },
    "model.layers.1.self_attn.q_proj": {
        "snr": 0.0713055431842804,
        "type": "self_attn.q_proj"
    },
    "model.layers.2.self_attn.q_proj": {
        "snr": 0.07948919385671616,
        "type": "self_attn.q_proj"
    },
    "model.layers.3.self_attn.q_proj": {
        "snr": 0.08047746121883392,
        "type": "self_attn.q_proj"
    },
    "model.layers.4.self_attn.q_proj": {
        "snr": 0.0852593332529068,
        "type": "self_attn.q_proj"
    },
    "model.layers.5.self_attn.q_proj": {
        "snr": 0.09794823825359344,
        "type": "self_attn.q_proj"
    },
    "model.layers.6.self_attn.q_proj": {
        "snr": 0.09627152234315872,
        "type": "self_attn.q_proj"
    },
    "model.layers.7.self_attn.q_proj": {
        "snr": 0.11065381020307541,
        "type": "self_attn.q_proj"
    },
    "model.layers.8.self_attn.q_proj": {
        "snr": 0.12031875550746918,
        "type": "self_attn.q_proj"
    },
    "model.layers.9.self_attn.q_proj": {
        "snr": 0.09804573655128479,
        "type": "self_attn.q_proj"
    },
    "model.layers.10.self_attn.q_proj": {
        "snr": 0.10897502303123474,
        "type": "self_attn.q_proj"
    },
    "model.layers.11.self_attn.q_proj": {
        "snr": 0.09267337620258331,
        "type": "self_attn.q_proj"
    },
    "model.layers.12.self_attn.q_proj": {
        "snr": 0.08803492039442062,
        "type": "self_attn.q_proj"
    },
    "model.layers.13.self_attn.q_proj": {
        "snr": 0.0902542844414711,
        "type": "self_attn.q_proj"
    },
    "model.layers.14.self_attn.q_proj": {
        "snr": 0.10154066979885101,
        "type": "self_attn.q_proj"
    },
    "model.layers.15.self_attn.q_proj": {
        "snr": 0.09083802253007889,
        "type": "self_attn.q_proj"
    },
    "model.layers.0.self_attn.v_proj": {
        "snr": 2.842210054397583,
        "type": "self_attn.v_proj"
    },
    "model.layers.1.self_attn.v_proj": {
        "snr": 10.59461498260498,
        "type": "self_attn.v_proj"
    },
    "model.layers.2.self_attn.v_proj": {
        "snr": 8.993025779724121,
        "type": "self_attn.v_proj"
    },
    "model.layers.3.self_attn.v_proj": {
        "snr": 62.567787170410156,
        "type": "self_attn.v_proj"
    },
    "model.layers.4.self_attn.v_proj": {
        "snr": 23.80082893371582,
        "type": "self_attn.v_proj"
    },
    "model.layers.5.self_attn.v_proj": {
        "snr": 7.957369804382324,
        "type": "self_attn.v_proj"
    },
    "model.layers.6.self_attn.v_proj": {
        "snr": 12.01815414428711,
        "type": "self_attn.v_proj"
    },
    "model.layers.7.self_attn.v_proj": {
        "snr": 5.095500469207764,
        "type": "self_attn.v_proj"
    },
    "model.layers.8.self_attn.v_proj": {
        "snr": 11.719332695007324,
        "type": "self_attn.v_proj"
    },
    "model.layers.9.self_attn.v_proj": {
        "snr": 555.0869750976562,
        "type": "self_attn.v_proj"
    },
    "model.layers.10.self_attn.v_proj": {
        "snr": 22.95538330078125,
        "type": "self_attn.v_proj"
    },
    "model.layers.11.self_attn.v_proj": {
        "snr": 30.042158126831055,
        "type": "self_attn.v_proj"
    },
    "model.layers.12.self_attn.v_proj": {
        "snr": 9.577271461486816,
        "type": "self_attn.v_proj"
    },
    "model.layers.13.self_attn.v_proj": {
        "snr": 18.176361083984375,
        "type": "self_attn.v_proj"
    },
    "model.layers.14.self_attn.v_proj": {
        "snr": 1.5695856809616089,
        "type": "self_attn.v_proj"
    },
    "model.layers.15.self_attn.v_proj": {
        "snr": 2.7235565185546875,
        "type": "self_attn.v_proj"
    }
}


================================================
FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-1B.json
================================================
{
    "model.layers.0.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.1.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.2.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.3.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.4.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.5.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.6.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.7.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.8.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.9.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.10.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.11.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.12.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.13.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.14.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.15.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "lm_head": {
        "snr": Infinity,
        "type": "lm_head"
    },
    "model.layers.0.mlp.down_proj": {
        "snr": 57.09797286987305,
        "type": "mlp.down_proj"
    },
    "model.layers.1.mlp.down_proj": {
        "snr": 9.538983345031738,
        "type": "mlp.down_proj"
    },
    "model.layers.2.mlp.down_proj": {
        "snr": 6.227016925811768,
        "type": "mlp.down_proj"
    },
    "model.layers.3.mlp.down_proj": {
        "snr": 5.660686492919922,
        "type": "mlp.down_proj"
    },
    "model.layers.4.mlp.down_proj": {
        "snr": 5.178432464599609,
        "type": "mlp.down_proj"
    },
    "model.layers.5.mlp.down_proj": {
        "snr": 3.5638349056243896,
        "type": "mlp.down_proj"
    },
    "model.layers.6.mlp.down_proj": {
        "snr": 3.0918056964874268,
        "type": "mlp.down_proj"
    },
    "model.layers.7.mlp.down_proj": {
        "snr": 2.456392288208008,
        "type": "mlp.down_proj"
    },
    "model.layers.8.mlp.down_proj": {
        "snr": 4.525328636169434,
        "type": "mlp.down_proj"
    },
    "model.layers.9.mlp.down_proj": {
        "snr": 3.9409055709838867,
        "type": "mlp.down_proj"
    },
    "model.layers.10.mlp.down_proj": {
        "snr": 5.447249412536621,
        "type": "mlp.down_proj"
    },
    "model.layers.11.mlp.down_proj": {
        "snr": 4.807600975036621,
        "type": "mlp.down_proj"
    },
    "model.layers.12.mlp.down_proj": {
        "snr": 3.915374517440796,
        "type": "mlp.down_proj"
    },
    "model.layers.13.mlp.down_proj": {
        "snr": 3.4820363521575928,
        "type": "mlp.down_proj"
    },
    "model.layers.14.mlp.down_proj": {
        "snr": 2.6045074462890625,
        "type": "mlp.down_proj"
    },
    "model.layers.15.mlp.down_proj": {
        "snr": 3.7237701416015625,
        "type": "mlp.down_proj"
    },
    "model.layers.0.mlp.gate_proj": {
        "snr": 22.160131454467773,
        "type": "mlp.gate_proj"
    },
    "model.layers.1.mlp.gate_proj": {
        "snr": 6.072206020355225,
        "type": "mlp.gate_proj"
    },
    "model.layers.2.mlp.gate_proj": {
        "snr": 3.2467362880706787,
        "type": "mlp.gate_proj"
    },
    "model.layers.3.mlp.gate_proj": {
        "snr": 1.4111896753311157,
        "type": "mlp.gate_proj"
    },
    "model.layers.4.mlp.gate_proj": {
        "snr": 0.7405938506126404,
        "type": "mlp.gate_proj"
    },
    "model.layers.5.mlp.gate_proj": {
        "snr": 0.5916463136672974,
        "type": "mlp.gate_proj"
    },
    "model.layers.6.mlp.gate_proj": {
        "snr": 0.6149423718452454,
        "type": "mlp.gate_proj"
    },
    "model.layers.7.mlp.gate_proj": {
        "snr": 0.48369669914245605,
        "type": "mlp.gate_proj"
    },
    "model.layers.8.mlp.gate_proj": {
        "snr": 0.6047574877738953,
        "type": "mlp.gate_proj"
    },
    "model.layers.9.mlp.gate_proj": {
        "snr": 0.5092479586601257,
        "type": "mlp.gate_proj"
    },
    "model.layers.10.mlp.gate_proj": {
        "snr": 0.5999670624732971,
        "type": "mlp.gate_proj"
    },
    "model.layers.11.mlp.gate_proj": {
        "snr": 0.8980127573013306,
        "type": "mlp.gate_proj"
    },
    "model.layers.12.mlp.gate_proj": {
        "snr": 1.4252448081970215,
        "type": "mlp.gate_proj"
    },
    "model.layers.13.mlp.gate_proj": {
        "snr": 1.509937047958374,
        "type": "mlp.gate_proj"
    },
    "model.layers.14.mlp.gate_proj": {
        "snr": 1.0066585540771484,
        "type": "mlp.gate_proj"
    },
    "model.layers.15.mlp.gate_proj": {
        "snr": 0.6413647532463074,
        "type": "mlp.gate_proj"
    },
    "model.layers.0.mlp.up_proj": {
        "snr": 26.08852195739746,
        "type": "mlp.up_proj"
    },
    "model.layers.1.mlp.up_proj": {
        "snr": 13.382951736450195,
        "type": "mlp.up_proj"
    },
    "model.layers.2.mlp.up_proj": {
        "snr": 20.088768005371094,
        "type": "mlp.up_proj"
    },
    "model.layers.3.mlp.up_proj": {
        "snr": 23.0632381439209,
        "type": "mlp.up_proj"
    },
    "model.layers.4.mlp.up_proj": {
        "snr": 16.07433319091797,
        "type": "mlp.up_proj"
    },
    "model.layers.5.mlp.up_proj": {
        "snr": 8.00507640838623,
        "type": "mlp.up_proj"
    },
    "model.layers.6.mlp.up_proj": {
        "snr": 9.538354873657227,
        "type": "mlp.up_proj"
    },
    "model.layers.7.mlp.up_proj": {
        "snr": 6.286602973937988,
        "type": "mlp.up_proj"
    },
    "model.layers.8.mlp.up_proj": {
        "snr": 10.092820167541504,
        "type": "mlp.up_proj"
    },
    "model.layers.9.mlp.up_proj": {
        "snr": 7.193963527679443,
        "type": "mlp.up_proj"
    },
    "model.layers.10.mlp.up_proj": {
        "snr": 7.320116996765137,
        "type": "mlp.up_proj"
    },
    "model.layers.11.mlp.up_proj": {
        "snr": 4.8728532791137695,
        "type": "mlp.up_proj"
    },
    "model.layers.12.mlp.up_proj": {
        "snr": 3.596583366394043,
        "type": "mlp.up_proj"
    },
    "model.layers.13.mlp.up_proj": {
        "snr": 3.166161298751831,
        "type": "mlp.up_proj"
    },
    "model.layers.14.mlp.up_proj": {
        "snr": 1.5600818395614624,
        "type": "mlp.up_proj"
    },
    "model.layers.15.mlp.up_proj": {
        "snr": 0.8726214170455933,
        "type": "mlp.up_proj"
    },
    "model.embed_tokens": {
        "snr": Infinity,
        "type": "model.embed_tokens"
    },
    "model.norm": {
        "snr": Infinity,
        "type": "model.norm"
    },
    "model.layers.0.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.1.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.2.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.3.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.4.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.5.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.6.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.7.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.8.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.9.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.10.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.11.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.12.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.13.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.14.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.15.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.0.self_attn.k_proj": {
        "snr": 0.1154392883181572,
        "type": "self_attn.k_proj"
    },
    "model.layers.1.self_attn.k_proj": {
        "snr": 0.24299409985542297,
        "type": "self_attn.k_proj"
    },
    "model.layers.2.self_attn.k_proj": {
        "snr": 0.3624322712421417,
        "type": "self_attn.k_proj"
    },
    "model.layers.3.self_attn.k_proj": {
        "snr": 0.29509487748146057,
        "type": "self_attn.k_proj"
    },
    "model.layers.4.self_attn.k_proj": {
        "snr": 0.32953736186027527,
        "type": "self_attn.k_proj"
    },
    "model.layers.5.self_attn.k_proj": {
        "snr": 0.2908833622932434,
        "type": "self_attn.k_proj"
    },
    "model.layers.6.self_attn.k_proj": {
        "snr": 0.2488437294960022,
        "type": "self_attn.k_proj"
    },
    "model.layers.7.self_attn.k_proj": {
        "snr": 0.27847856283187866,
        "type": "self_attn.k_proj"
    },
    "model.layers.8.self_attn.k_proj": {
        "snr": 0.27143892645835876,
        "type": "self_attn.k_proj"
    },
    "model.layers.9.self_attn.k_proj": {
        "snr": 0.28804272413253784,
        "type": "self_attn.k_proj"
    },
    "model.layers.10.self_attn.k_proj": {
        "snr": 0.31197959184646606,
        "type": "self_attn.k_proj"
    },
    "model.layers.11.self_attn.k_proj": {
        "snr": 0.3203586935997009,
        "type": "self_attn.k_proj"
    },
    "model.layers.12.self_attn.k_proj": {
        "snr": 0.30905747413635254,
        "type": "self_attn.k_proj"
    },
    "model.layers.13.self_attn.k_proj": {
        "snr": 0.46828722953796387,
        "type": "self_attn.k_proj"
    },
    "model.layers.14.self_attn.k_proj": {
        "snr": 0.24205778539180756,
        "type": "self_attn.k_proj"
    },
    "model.layers.15.self_attn.k_proj": {
        "snr": 0.2559327781200409,
        "type": "self_attn.k_proj"
    },
    "model.layers.0.self_attn.o_proj": {
        "snr": 0.2638678550720215,
        "type": "self_attn.o_proj"
    },
    "model.layers.1.self_attn.o_proj": {
        "snr": 0.21109595894813538,
        "type": "self_attn.o_proj"
    },
    "model.layers.2.self_attn.o_proj": {
        "snr": 0.24751724302768707,
        "type": "self_attn.o_proj"
    },
    "model.layers.3.self_attn.o_proj": {
        "snr": 0.2728094160556793,
        "type": "self_attn.o_proj"
    },
    "model.layers.4.self_attn.o_proj": {
        "snr": 0.3001374304294586,
        "type": "self_attn.o_proj"
    },
    "model.layers.5.self_attn.o_proj": {
        "snr": 0.33903488516807556,
        "type": "self_attn.o_proj"
    },
    "model.layers.6.self_attn.o_proj": {
        "snr": 0.3530929982662201,
        "type": "self_attn.o_proj"
    },
    "model.layers.7.self_attn.o_proj": {
        "snr": 0.36753255128860474,
        "type": "self_attn.o_proj"
    },
    "model.layers.8.self_attn.o_proj": {
        "snr": 0.3373180329799652,
        "type": "self_attn.o_proj"
    },
    "model.layers.9.self_attn.o_proj": {
        "snr": 0.2970578670501709,
        "type": "self_attn.o_proj"
    },
    "model.layers.10.self_attn.o_proj": {
        "snr": 0.3076324760913849,
        "type": "self_attn.o_proj"
    },
    "model.layers.11.self_attn.o_proj": {
        "snr": 0.2766900658607483,
        "type": "self_attn.o_proj"
    },
    "model.layers.12.self_attn.o_proj": {
        "snr": 0.20973259210586548,
        "type": "self_attn.o_proj"
    },
    "model.layers.13.self_attn.o_proj": {
        "snr": 0.18185566365718842,
        "type": "self_attn.o_proj"
    },
    "model.layers.14.self_attn.o_proj": {
        "snr": 0.18329747021198273,
        "type": "self_attn.o_proj"
    },
    "model.layers.15.self_attn.o_proj": {
        "snr": 0.2437991499900818,
        "type": "self_attn.o_proj"
    },
    "model.layers.0.self_attn.q_proj": {
        "snr": 0.038040731102228165,
        "type": "self_attn.q_proj"
    },
    "model.layers.1.self_attn.q_proj": {
        "snr": 0.0707998052239418,
        "type": "self_attn.q_proj"
    },
    "model.layers.2.self_attn.q_proj": {
        "snr": 0.0787411704659462,
        "type": "self_attn.q_proj"
    },
    "model.layers.3.self_attn.q_proj": {
        "snr": 0.08089710026979446,
        "type": "self_attn.q_proj"
    },
    "model.layers.4.self_attn.q_proj": {
        "snr": 0.08591937273740768,
        "type": "self_attn.q_proj"
    },
    "model.layers.5.self_attn.q_proj": {
        "snr": 0.09852176159620285,
        "type": "self_attn.q_proj"
    },
    "model.layers.6.self_attn.q_proj": {
        "snr": 0.09690654277801514,
        "type": "self_attn.q_proj"
    },
    "model.layers.7.self_attn.q_proj": {
        "snr": 0.11181341856718063,
        "type": "self_attn.q_proj"
    },
    "model.layers.8.self_attn.q_proj": {
        "snr": 0.12042108923196793,
        "type": "self_attn.q_proj"
    },
    "model.layers.9.self_attn.q_proj": {
        "snr": 0.09799323976039886,
        "type": "self_attn.q_proj"
    },
    "model.layers.10.self_attn.q_proj": {
        "snr": 0.10901063680648804,
        "type": "self_attn.q_proj"
    },
    "model.layers.11.self_attn.q_proj": {
        "snr": 0.09307146072387695,
        "type": "self_attn.q_proj"
    },
    "model.layers.12.self_attn.q_proj": {
        "snr": 0.0880950540304184,
        "type": "self_attn.q_proj"
    },
    "model.layers.13.self_attn.q_proj": {
        "snr": 0.08886399120092392,
        "type": "self_attn.q_proj"
    },
    "model.layers.14.self_attn.q_proj": {
        "snr": 0.09955056011676788,
        "type": "self_attn.q_proj"
    },
    "model.layers.15.self_attn.q_proj": {
        "snr": 0.08929339051246643,
        "type": "self_attn.q_proj"
    },
    "model.layers.0.self_attn.v_proj": {
        "snr": 2.5501928329467773,
        "type": "self_attn.v_proj"
    },
    "model.layers.1.self_attn.v_proj": {
        "snr": 9.449499130249023,
        "type": "self_attn.v_proj"
    },
    "model.layers.2.self_attn.v_proj": {
        "snr": 7.9920830726623535,
        "type": "self_attn.v_proj"
    },
    "model.layers.3.self_attn.v_proj": {
        "snr": 50.69462585449219,
        "type": "self_attn.v_proj"
    },
    "model.layers.4.self_attn.v_proj": {
        "snr": 19.083511352539062,
        "type": "self_attn.v_proj"
    },
    "model.layers.5.self_attn.v_proj": {
        "snr": 7.21597146987915,
        "type": "self_attn.v_proj"
    },
    "model.layers.6.self_attn.v_proj": {
        "snr": 11.27744197845459,
        "type": "self_attn.v_proj"
    },
    "model.layers.7.self_attn.v_proj": {
        "snr": 4.579711437225342,
        "type": "self_attn.v_proj"
    },
    "model.layers.8.self_attn.v_proj": {
        "snr": 10.940719604492188,
        "type": "self_attn.v_proj"
    },
    "model.layers.9.self_attn.v_proj": {
        "snr": 553.4417724609375,
        "type": "self_attn.v_proj"
    },
    "model.layers.10.self_attn.v_proj": {
        "snr": 20.59434700012207,
        "type": "self_attn.v_proj"
    },
    "model.layers.11.self_attn.v_proj": {
        "snr": 26.636865615844727,
        "type": "self_attn.v_proj"
    },
    "model.layers.12.self_attn.v_proj": {
        "snr": 8.614749908447266,
        "type": "self_attn.v_proj"
    },
    "model.layers.13.self_attn.v_proj": {
        "snr": 17.722007751464844,
        "type": "self_attn.v_proj"
    },
    "model.layers.14.self_attn.v_proj": {
        "snr": 1.48500657081604,
        "type": "self_attn.v_proj"
    },
    "model.layers.15.self_attn.v_proj": {
        "snr": 2.5776851177215576,
        "type": "self_attn.v_proj"
    }
}


================================================
FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-3B-Instruct.json
================================================
{
    "model.layers.0.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.1.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.2.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.3.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.4.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.5.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.6.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.7.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.8.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.9.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.10.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.11.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.12.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.13.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.14.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.15.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.16.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.17.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.18.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.19.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.20.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.21.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.22.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.23.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.24.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.25.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.26.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.27.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "lm_head": {
        "snr": Infinity,
        "type": "lm_head"
    },
    "model.layers.0.mlp.down_proj": {
        "snr": 2.306217670440674,
        "type": "mlp.down_proj"
    },
    "model.layers.1.mlp.down_proj": {
        "snr": 2.2327167987823486,
        "type": "mlp.down_proj"
    },
    "model.layers.2.mlp.down_proj": {
        "snr": 1.4501516819000244,
        "type": "mlp.down_proj"
    },
    "model.layers.3.mlp.down_proj": {
        "snr": 1.363667607307434,
        "type": "mlp.down_proj"
    },
    "model.layers.4.mlp.down_proj": {
        "snr": 1.4520279169082642,
        "type": "mlp.down_proj"
    },
    "model.layers.5.mlp.down_proj": {
        "snr": 1.4664665460586548,
        "type": "mlp.down_proj"
    },
    "model.layers.6.mlp.down_proj": {
        "snr": 1.4122329950332642,
        "type": "mlp.down_proj"
    },
    "model.layers.7.mlp.down_proj": {
        "snr": 1.0504299402236938,
        "type": "mlp.down_proj"
    },
    "model.layers.8.mlp.down_proj": {
        "snr": 0.9837537407875061,
        "type": "mlp.down_proj"
    },
    "model.layers.9.mlp.down_proj": {
        "snr": 0.8659006357192993,
        "type": "mlp.down_proj"
    },
    "model.layers.10.mlp.down_proj": {
        "snr": 0.7936406135559082,
        "type": "mlp.down_proj"
    },
    "model.layers.11.mlp.down_proj": {
        "snr": 0.9000886678695679,
        "type": "mlp.down_proj"
    },
    "model.layers.12.mlp.down_proj": {
        "snr": 1.1559213399887085,
        "type": "mlp.down_proj"
    },
    "model.layers.13.mlp.down_proj": {
        "snr": 1.3054672479629517,
        "type": "mlp.down_proj"
    },
    "model.layers.14.mlp.down_proj": {
        "snr": 1.196791410446167,
        "type": "mlp.down_proj"
    },
    "model.layers.15.mlp.down_proj": {
        "snr": 1.3163655996322632,
        "type": "mlp.down_proj"
    },
    "model.layers.16.mlp.down_proj": {
        "snr": 1.3388997316360474,
        "type": "mlp.down_proj"
    },
    "model.layers.17.mlp.down_proj": {
        "snr": 1.592497706413269,
        "type": "mlp.down_proj"
    },
    "model.layers.18.mlp.down_proj": {
        "snr": 1.5399079322814941,
        "type": "mlp.down_proj"
    },
    "model.layers.19.mlp.down_proj": {
        "snr": 1.5683293342590332,
        "type": "mlp.down_proj"
    },
    "model.layers.20.mlp.down_proj": {
        "snr": 1.4739630222320557,
        "type": "mlp.down_proj"
    },
    "model.layers.21.mlp.down_proj": {
        "snr": 1.2608393430709839,
        "type": "mlp.down_proj"
    },
    "model.layers.22.mlp.down_proj": {
        "snr": 1.2087301015853882,
        "type": "mlp.down_proj"
    },
    "model.layers.23.mlp.down_proj": {
        "snr": 1.1851829290390015,
        "type": "mlp.down_proj"
    },
    "model.layers.24.mlp.down_proj": {
        "snr": 1.0537594556808472,
        "type": "mlp.down_proj"
    },
    "model.layers.25.mlp.down_proj": {
        "snr": 1.1649317741394043,
        "type": "mlp.down_proj"
    },
    "model.layers.26.mlp.down_proj": {
        "snr": 1.2376821041107178,
        "type": "mlp.down_proj"
    },
    "model.layers.27.mlp.down_proj": {
        "snr": 1.147771954536438,
        "type": "mlp.down_proj"
    },
    "model.layers.0.mlp.gate_proj": {
        "snr": 0.9385462999343872,
        "type": "mlp.gate_proj"
    },
    "model.layers.1.mlp.gate_proj": {
        "snr": 0.8528683185577393,
        "type": "mlp.gate_proj"
    },
    "model.layers.2.mlp.gate_proj": {
        "snr": 0.761657178401947,
        "type": "mlp.gate_proj"
    },
    "model.layers.3.mlp.gate_proj": {
        "snr": 0.6598325371742249,
        "type": "mlp.gate_proj"
    },
    "model.layers.4.mlp.gate_proj": {
        "snr": 0.44578588008880615,
        "type": "mlp.gate_proj"
    },
    "model.layers.5.mlp.gate_proj": {
        "snr": 0.4053060710430145,
        "type": "mlp.gate_proj"
    },
    "model.layers.6.mlp.gate_proj": {
        "snr": 0.3588462769985199,
        "type": "mlp.gate_proj"
    },
    "model.layers.7.mlp.gate_proj": {
        "snr": 0.35667839646339417,
        "type": "mlp.gate_proj"
    },
    "model.layers.8.mlp.gate_proj": {
        "snr": 0.3106202781200409,
        "type": "mlp.gate_proj"
    },
    "model.layers.9.mlp.gate_proj": {
        "snr": 0.2821919322013855,
        "type": "mlp.gate_proj"
    },
    "model.layers.10.mlp.gate_proj": {
        "snr": 0.29143741726875305,
        "type": "mlp.gate_proj"
    },
    "model.layers.11.mlp.gate_proj": {
        "snr": 0.29830989241600037,
        "type": "mlp.gate_proj"
    },
    "model.layers.12.mlp.gate_proj": {
        "snr": 0.2862427532672882,
        "type": "mlp.gate_proj"
    },
    "model.layers.13.mlp.gate_proj": {
        "snr": 0.2797018587589264,
        "type": "mlp.gate_proj"
    },
    "model.layers.14.mlp.gate_proj": {
        "snr": 0.2679217755794525,
        "type": "mlp.gate_proj"
    },
    "model.layers.15.mlp.gate_proj": {
        "snr": 0.2782425880432129,
        "type": "mlp.gate_proj"
    },
    "model.layers.16.mlp.gate_proj": {
        "snr": 0.3503592610359192,
        "type": "mlp.gate_proj"
    },
    "model.layers.17.mlp.gate_proj": {
        "snr": 0.3968559205532074,
        "type": "mlp.gate_proj"
    },
    "model.layers.18.mlp.gate_proj": {
        "snr": 0.4318574070930481,
        "type": "mlp.gate_proj"
    },
    "model.layers.19.mlp.gate_proj": {
        "snr": 0.4693693220615387,
        "type": "mlp.gate_proj"
    },
    "model.layers.20.mlp.gate_proj": {
        "snr": 0.5051979422569275,
        "type": "mlp.gate_proj"
    },
    "model.layers.21.mlp.gate_proj": {
        "snr": 0.5675955414772034,
        "type": "mlp.gate_proj"
    },
    "model.layers.22.mlp.gate_proj": {
        "snr": 0.5861843824386597,
        "type": "mlp.gate_proj"
    },
    "model.layers.23.mlp.gate_proj": {
        "snr": 0.4759417772293091,
        "type": "mlp.gate_proj"
    },
    "model.layers.24.mlp.gate_proj": {
        "snr": 0.38529056310653687,
        "type": "mlp.gate_proj"
    },
    "model.layers.25.mlp.gate_proj": {
        "snr": 0.3180919587612152,
        "type": "mlp.gate_proj"
    },
    "model.layers.26.mlp.gate_proj": {
        "snr": 0.2695689797401428,
        "type": "mlp.gate_proj"
    },
    "model.layers.27.mlp.gate_proj": {
        "snr": 0.21765239536762238,
        "type": "mlp.gate_proj"
    },
    "model.layers.0.mlp.up_proj": {
        "snr": 1.4919718503952026,
        "type": "mlp.up_proj"
    },
    "model.layers.1.mlp.up_proj": {
        "snr": 1.7983858585357666,
        "type": "mlp.up_proj"
    },
    "model.layers.2.mlp.up_proj": {
        "snr": 2.1709094047546387,
        "type": "mlp.up_proj"
    },
    "model.layers.3.mlp.up_proj": {
        "snr": 2.751326560974121,
        "type": "mlp.up_proj"
    },
    "model.layers.4.mlp.up_proj": {
        "snr": 3.063521385192871,
        "type": "mlp.up_proj"
    },
    "model.layers.5.mlp.up_proj": {
        "snr": 2.4026951789855957,
        "type": "mlp.up_proj"
    },
    "model.layers.6.mlp.up_proj": {
        "snr": 2.3890223503112793,
        "type": "mlp.up_proj"
    },
    "model.layers.7.mlp.up_proj": {
        "snr": 2.3861353397369385,
        "type": "mlp.up_proj"
    },
    "model.layers.8.mlp.up_proj": {
        "snr": 2.0745043754577637,
        "type": "mlp.up_proj"
    },
    "model.layers.9.mlp.up_proj": {
        "snr": 1.8550645112991333,
        "type": "mlp.up_proj"
    },
    "model.layers.10.mlp.up_proj": {
        "snr": 1.6184496879577637,
        "type": "mlp.up_proj"
    },
    "model.layers.11.mlp.up_proj": {
        "snr": 1.9287559986114502,
        "type": "mlp.up_proj"
    },
    "model.layers.12.mlp.up_proj": {
        "snr": 1.7427546977996826,
        "type": "mlp.up_proj"
    },
    "model.layers.13.mlp.up_proj": {
        "snr": 1.9872609376907349,
        "type": "mlp.up_proj"
    },
    "model.layers.14.mlp.up_proj": {
        "snr": 2.0224087238311768,
        "type": "mlp.up_proj"
    },
    "model.layers.15.mlp.up_proj": {
        "snr": 1.7851638793945312,
        "type": "mlp.up_proj"
    },
    "model.layers.16.mlp.up_proj": {
        "snr": 1.7160604000091553,
        "type": "mlp.up_proj"
    },
    "model.layers.17.mlp.up_proj": {
        "snr": 1.6870195865631104,
        "type": "mlp.up_proj"
    },
    "model.layers.18.mlp.up_proj": {
        "snr": 1.6585396528244019,
        "type": "mlp.up_proj"
    },
    "model.layers.19.mlp.up_proj": {
        "snr": 1.5509096384048462,
        "type": "mlp.up_proj"
    },
    "model.layers.20.mlp.up_proj": {
        "snr": 1.4310423135757446,
        "type": "mlp.up_proj"
    },
    "model.layers.21.mlp.up_proj": {
        "snr": 1.5009464025497437,
        "type": "mlp.up_proj"
    },
    "model.layers.22.mlp.up_proj": {
        "snr": 1.4866929054260254,
        "type": "mlp.up_proj"
    },
    "model.layers.23.mlp.up_proj": {
        "snr": 1.332513689994812,
        "type": "mlp.up_proj"
    },
    "model.layers.24.mlp.up_proj": {
        "snr": 1.073512077331543,
        "type": "mlp.up_proj"
    },
    "model.layers.25.mlp.up_proj": {
        "snr": 0.7472100257873535,
        "type": "mlp.up_proj"
    },
    "model.layers.26.mlp.up_proj": {
        "snr": 0.4880162179470062,
        "type": "mlp.up_proj"
    },
    "model.layers.27.mlp.up_proj": {
        "snr": 0.2527681589126587,
        "type": "mlp.up_proj"
    },
    "model.embed_tokens": {
        "snr": Infinity,
        "type": "model.embed_tokens"
    },
    "model.norm": {
        "snr": Infinity,
        "type": "model.norm"
    },
    "model.layers.0.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.1.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.2.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.3.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.4.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.5.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.6.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.7.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.8.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.9.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.10.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.11.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.12.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.13.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.14.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.15.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.16.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.17.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.18.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.19.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.20.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.21.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.22.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.23.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.24.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.25.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.26.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.27.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.0.self_attn.k_proj": {
        "snr": 0.08262510597705841,
        "type": "self_attn.k_proj"
    },
    "model.layers.1.self_attn.k_proj": {
        "snr": 0.1441459059715271,
        "type": "self_attn.k_proj"
    },
    "model.layers.2.self_attn.k_proj": {
        "snr": 0.21418076753616333,
        "type": "self_attn.k_proj"
    },
    "model.layers.3.self_attn.k_proj": {
        "snr": 0.22496014833450317,
        "type": "self_attn.k_proj"
    },
    "model.layers.4.self_attn.k_proj": {
        "snr": 0.23101305961608887,
        "type": "self_attn.k_proj"
    },
    "model.layers.5.self_attn.k_proj": {
        "snr": 0.23644132912158966,
        "type": "self_attn.k_proj"
    },
    "model.layers.6.self_attn.k_proj": {
        "snr": 0.23666173219680786,
        "type": "self_attn.k_proj"
    },
    "model.layers.7.self_attn.k_proj": {
        "snr": 0.19791515171527863,
        "type": "self_attn.k_proj"
    },
    "model.layers.8.self_attn.k_proj": {
        "snr": 0.22062039375305176,
        "type": "self_attn.k_proj"
    },
    "model.layers.9.self_attn.k_proj": {
        "snr": 0.21218444406986237,
        "type": "self_attn.k_proj"
    },
    "model.layers.10.self_attn.k_proj": {
        "snr": 0.24218571186065674,
        "type": "self_attn.k_proj"
    },
    "model.layers.11.self_attn.k_proj": {
        "snr": 0.21870514750480652,
        "type": "self_attn.k_proj"
    },
    "model.layers.12.self_attn.k_proj": {
        "snr": 0.22160987555980682,
        "type": "self_attn.k_proj"
    },
    "model.layers.13.self_attn.k_proj": {
        "snr": 0.22726823389530182,
        "type": "self_attn.k_proj"
    },
    "model.layers.14.self_attn.k_proj": {
        "snr": 0.20256873965263367,
        "type": "self_attn.k_proj"
    },
    "model.layers.15.self_attn.k_proj": {
        "snr": 0.24100735783576965,
        "type": "self_attn.k_proj"
    },
    "model.layers.16.self_attn.k_proj": {
        "snr": 0.23794010281562805,
        "type": "self_attn.k_proj"
    },
    "model.layers.17.self_attn.k_proj": {
        "snr": 0.2913324534893036,
        "type": "self_attn.k_proj"
    },
    "model.layers.18.self_attn.k_proj": {
        "snr": 0.28093472123146057,
        "type": "self_attn.k_proj"
    },
    "model.layers.19.self_attn.k_proj": {
        "snr": 0.31062793731689453,
        "type": "self_attn.k_proj"
    },
    "model.layers.20.self_attn.k_proj": {
        "snr": 0.2942160367965698,
        "type": "self_attn.k_proj"
    },
    "model.layers.21.self_attn.k_proj": {
        "snr": 0.28014805912971497,
        "type": "self_attn.k_proj"
    },
    "model.layers.22.self_attn.k_proj": {
        "snr": 0.3512437045574188,
        "type": "self_attn.k_proj"
    },
    "model.layers.23.self_attn.k_proj": {
        "snr": 0.2837671637535095,
        "type": "self_attn.k_proj"
    },
    "model.layers.24.self_attn.k_proj": {
        "snr": 0.2960015535354614,
        "type": "self_attn.k_proj"
    },
    "model.layers.25.self_attn.k_proj": {
        "snr": 0.5086414813995361,
        "type": "self_attn.k_proj"
    },
    "model.layers.26.self_attn.k_proj": {
        "snr": 0.24054698646068573,
        "type": "self_attn.k_proj"
    },
    "model.layers.27.self_attn.k_proj": {
        "snr": 0.247616246342659,
        "type": "self_attn.k_proj"
    },
    "model.layers.0.self_attn.o_proj": {
        "snr": 0.18390265107154846,
        "type": "self_attn.o_proj"
    },
    "model.layers.1.self_attn.o_proj": {
        "snr": 0.14759540557861328,
        "type": "self_attn.o_proj"
    },
    "model.layers.2.self_attn.o_proj": {
        "snr": 0.15726515650749207,
        "type": "self_attn.o_proj"
    },
    "model.layers.3.self_attn.o_proj": {
        "snr": 0.16903570294380188,
        "type": "self_attn.o_proj"
    },
    "model.layers.4.self_attn.o_proj": {
        "snr": 0.17953157424926758,
        "type": "self_attn.o_proj"
    },
    "model.layers.5.self_attn.o_proj": {
        "snr": 0.2351229190826416,
        "type": "self_attn.o_proj"
    },
    "model.layers.6.self_attn.o_proj": {
        "snr": 0.22804339230060577,
        "type": "self_attn.o_proj"
    },
    "model.layers.7.self_attn.o_proj": {
        "snr": 0.24786025285720825,
        "type": "self_attn.o_proj"
    },
    "model.layers.8.self_attn.o_proj": {
        "snr": 0.21847976744174957,
        "type": "self_attn.o_proj"
    },
    "model.layers.9.self_attn.o_proj": {
        "snr": 0.2092437595129013,
        "type": "self_attn.o_proj"
    },
    "model.layers.10.self_attn.o_proj": {
        "snr": 0.23278094828128815,
        "type": "self_attn.o_proj"
    },
    "model.layers.11.self_attn.o_proj": {
        "snr": 0.20468176901340485,
        "type": "self_attn.o_proj"
    },
    "model.layers.12.self_attn.o_proj": {
        "snr": 0.2353818416595459,
        "type": "self_attn.o_proj"
    },
    "model.layers.13.self_attn.o_proj": {
        "snr": 0.2702614367008209,
        "type": "self_attn.o_proj"
    },
    "model.layers.14.self_attn.o_proj": {
        "snr": 0.19177420437335968,
        "type": "self_attn.o_proj"
    },
    "model.layers.15.self_attn.o_proj": {
        "snr": 0.18293911218643188,
        "type": "self_attn.o_proj"
    },
    "model.layers.16.self_attn.o_proj": {
        "snr": 0.20286045968532562,
        "type": "self_attn.o_proj"
    },
    "model.layers.17.self_attn.o_proj": {
        "snr": 0.20763878524303436,
        "type": "self_attn.o_proj"
    },
    "model.layers.18.self_attn.o_proj": {
        "snr": 0.190629780292511,
        "type": "self_attn.o_proj"
    },
    "model.layers.19.self_attn.o_proj": {
        "snr": 0.22044304013252258,
        "type": "self_attn.o_proj"
    },
    "model.layers.20.self_attn.o_proj": {
        "snr": 0.21491236984729767,
        "type": "self_attn.o_proj"
    },
    "model.layers.21.self_attn.o_proj": {
        "snr": 0.23289704322814941,
        "type": "self_attn.o_proj"
    },
    "model.layers.22.self_attn.o_proj": {
        "snr": 0.21457163989543915,
        "type": "self_attn.o_proj"
    },
    "model.layers.23.self_attn.o_proj": {
        "snr": 0.1949365884065628,
        "type": "self_attn.o_proj"
    },
    "model.layers.24.self_attn.o_proj": {
        "snr": 0.1606779545545578,
        "type": "self_attn.o_proj"
    },
    "model.layers.25.self_attn.o_proj": {
        "snr": 0.13892440497875214,
        "type": "self_attn.o_proj"
    },
    "model.layers.26.self_attn.o_proj": {
        "snr": 0.1407029926776886,
        "type": "self_attn.o_proj"
    },
    "model.layers.27.self_attn.o_proj": {
        "snr": 0.16027599573135376,
        "type": "self_attn.o_proj"
    },
    "model.layers.0.self_attn.q_proj": {
        "snr": 0.0534212663769722,
        "type": "self_attn.q_proj"
    },
    "model.layers.1.self_attn.q_proj": {
        "snr": 0.06873775273561478,
        "type": "self_attn.q_proj"
    },
    "model.layers.2.self_attn.q_proj": {
        "snr": 0.07522258907556534,
        "type": "self_attn.q_proj"
    },
    "model.layers.3.self_attn.q_proj": {
        "snr": 0.06616844981908798,
        "type": "self_attn.q_proj"
    },
    "model.layers.4.self_attn.q_proj": {
        "snr": 0.06809444725513458,
        "type": "self_attn.q_proj"
    },
    "model.layers.5.self_attn.q_proj": {
        "snr": 0.0758095383644104,
        "type": "self_attn.q_proj"
    },
    "model.layers.6.self_attn.q_proj": {
        "snr": 0.07800278812646866,
        "type": "self_attn.q_proj"
    },
    "model.layers.7.self_attn.q_proj": {
        "snr": 0.07535763084888458,
        "type": "self_attn.q_proj"
    },
    "model.layers.8.self_attn.q_proj": {
        "snr": 0.09488166123628616,
        "type": "self_attn.q_proj"
    },
    "model.layers.9.self_attn.q_proj": {
        "snr": 0.09709945321083069,
        "type": "self_attn.q_proj"
    },
    "model.layers.10.self_attn.q_proj": {
        "snr": 0.09381720423698425,
        "type": "self_attn.q_proj"
    },
    "model.layers.11.self_attn.q_proj": {
        "snr": 0.08205580711364746,
        "type": "self_attn.q_proj"
    },
    "model.layers.12.self_attn.q_proj": {
        "snr": 0.10723169893026352,
        "type": "self_attn.q_proj"
    },
    "model.layers.13.self_attn.q_proj": {
        "snr": 0.10166660696268082,
        "type": "self_attn.q_proj"
    },
    "model.layers.14.self_attn.q_proj": {
        "snr": 0.08822792023420334,
        "type": "self_attn.q_proj"
    },
    "model.layers.15.self_attn.q_proj": {
        "snr": 0.0814041867852211,
        "type": "self_attn.q_proj"
    },
    "model.layers.16.self_attn.q_proj": {
        "snr": 0.07586681097745895,
        "type": "self_attn.q_proj"
    },
    "model.layers.17.self_attn.q_proj": {
        "snr": 0.07040166854858398,
        "type": "self_attn.q_proj"
    },
    "model.layers.18.self_attn.q_proj": {
        "snr": 0.0728282704949379,
        "type": "self_attn.q_proj"
    },
    "model.layers.19.self_attn.q_proj": {
        "snr": 0.06912193447351456,
        "type": "self_attn.q_proj"
    },
    "model.layers.20.self_attn.q_proj": {
        "snr": 0.06646180897951126,
        "type": "self_attn.q_proj"
    },
    "model.layers.21.self_attn.q_proj": {
        "snr": 0.06960278004407883,
        "type": "self_attn.q_proj"
    },
    "model.layers.22.self_attn.q_proj": {
        "snr": 0.06566876918077469,
        "type": "self_attn.q_proj"
    },
    "model.layers.23.self_attn.q_proj": {
        "snr": 0.07412787526845932,
        "type": "self_attn.q_proj"
    },
    "model.layers.24.self_attn.q_proj": {
        "snr": 0.07131384313106537,
        "type": "self_attn.q_proj"
    },
    "model.layers.25.self_attn.q_proj": {
        "snr": 0.07768437266349792,
        "type": "self_attn.q_proj"
    },
    "model.layers.26.self_attn.q_proj": {
        "snr": 0.0809575766324997,
        "type": "self_attn.q_proj"
    },
    "model.layers.27.self_attn.q_proj": {
        "snr": 0.06796683371067047,
        "type": "self_attn.q_proj"
    },
    "model.layers.0.self_attn.v_proj": {
        "snr": 1.4029983282089233,
        "type": "self_attn.v_proj"
    },
    "model.layers.1.self_attn.v_proj": {
        "snr": 3.123720169067383,
        "type": "self_attn.v_proj"
    },
    "model.layers.2.self_attn.v_proj": {
        "snr": 2.4177253246307373,
        "type": "self_attn.v_proj"
    },
    "model.layers.3.self_attn.v_proj": {
        "snr": 5.588768005371094,
        "type": "self_attn.v_proj"
    },
    "model.layers.4.self_attn.v_proj": {
        "snr": 4.395562648773193,
        "type": "self_attn.v_proj"
    },
    "model.layers.5.self_attn.v_proj": {
        "snr": 3.2982685565948486,
        "type": "self_attn.v_proj"
    },
    "model.layers.6.self_attn.v_proj": {
        "snr": 3.2798449993133545,
        "type": "self_attn.v_proj"
    },
    "model.layers.7.self_attn.v_proj": {
        "snr": 2.109200954437256,
        "type": "self_attn.v_proj"
    },
    "model.layers.8.self_attn.v_proj": {
        "snr": 3.229325532913208,
        "type": "self_attn.v_proj"
    },
    "model.layers.9.self_attn.v_proj": {
        "snr": 1.7349927425384521,
        "type": "self_attn.v_proj"
    },
    "model.layers.10.self_attn.v_proj": {
        "snr": 1.5926740169525146,
        "type": "self_attn.v_proj"
    },
    "model.layers.11.self_attn.v_proj": {
        "snr": 1.9097802639007568,
        "type": "self_attn.v_proj"
    },
    "model.layers.12.self_attn.v_proj": {
        "snr": 2.5654332637786865,
        "type": "self_attn.v_proj"
    },
    "model.layers.13.self_attn.v_proj": {
        "snr": 3.536489963531494,
        "type": "self_attn.v_proj"
    },
    "model.layers.14.self_attn.v_proj": {
        "snr": 8.366667747497559,
        "type": "self_attn.v_proj"
    },
    "model.layers.15.self_attn.v_proj": {
        "snr": 7.348303318023682,
        "type": "self_attn.v_proj"
    },
    "model.layers.16.self_attn.v_proj": {
        "snr": 2.815748691558838,
        "type": "self_attn.v_proj"
    },
    "model.layers.17.self_attn.v_proj": {
        "snr": 4.048776149749756,
        "type": "self_attn.v_proj"
    },
    "model.layers.18.self_attn.v_proj": {
        "snr": 4.426101207733154,
        "type": "self_attn.v_proj"
    },
    "model.layers.19.self_attn.v_proj": {
        "snr": 7.098501682281494,
        "type": "self_attn.v_proj"
    },
    "model.layers.20.self_attn.v_proj": {
        "snr": 3.700288772583008,
        "type": "self_attn.v_proj"
    },
    "model.layers.21.self_attn.v_proj": {
        "snr": 2.1859049797058105,
        "type": "self_attn.v_proj"
    },
    "model.layers.22.self_attn.v_proj": {
        "snr": 3.6953284740448,
        "type": "self_attn.v_proj"
    },
    "model.layers.23.self_attn.v_proj": {
        "snr": 11.148802757263184,
        "type": "self_attn.v_proj"
    },
    "model.layers.24.self_attn.v_proj": {
        "snr": 2.4171905517578125,
        "type": "self_attn.v_proj"
    },
    "model.layers.25.self_attn.v_proj": {
        "snr": 4.404144287109375,
        "type": "self_attn.v_proj"
    },
    "model.layers.26.self_attn.v_proj": {
        "snr": 2.340604782104492,
        "type": "self_attn.v_proj"
    },
    "model.layers.27.self_attn.v_proj": {
        "snr": 3.284160614013672,
        "type": "self_attn.v_proj"
    }
}


================================================
FILE: src/axolotl/integrations/spectrum/model_snr_results/snr_results_meta-llama-Llama-3.2-3B.json
================================================
{
    "model.layers.0.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.1.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.2.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.3.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.4.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.5.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.6.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.7.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.8.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.9.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.10.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.11.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.12.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.13.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.14.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.15.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.16.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.17.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.18.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.19.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.20.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.21.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.22.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.23.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.24.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.25.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.26.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "model.layers.27.input_layernorm": {
        "snr": Infinity,
        "type": "input_layernorm"
    },
    "lm_head": {
        "snr": Infinity,
        "type": "lm_head"
    },
    "model.layers.0.mlp.down_proj": {
        "snr": 2.364603281021118,
        "type": "mlp.down_proj"
    },
    "model.layers.1.mlp.down_proj": {
        "snr": 2.229910373687744,
        "type": "mlp.down_proj"
    },
    "model.layers.2.mlp.down_proj": {
        "snr": 1.4312117099761963,
        "type": "mlp.down_proj"
    },
    "model.layers.3.mlp.down_proj": {
        "snr": 1.3216407299041748,
        "type": "mlp.down_proj"
    },
    "model.layers.4.mlp.down_proj": {
        "snr": 1.4183496236801147,
        "type": "mlp.down_proj"
    },
    "model.layers.5.mlp.down_proj": {
        "snr": 1.4453660249710083,
        "type": "mlp.down_proj"
    },
    "model.layers.6.mlp.down_proj": {
        "snr": 1.4030662775039673,
        "type": "mlp.down_proj"
    },
    "model.layers.7.mlp.down_proj": {
        "snr": 1.042332649230957,
        "type": "mlp.down_proj"
    },
    "model.layers.8.mlp.down_proj": {
        "snr": 0.9530982375144958,
        "type": "mlp.down_proj"
    },
    "model.layers.9.mlp.down_proj": {
        "snr": 0.849862277507782,
        "type": "mlp.down_proj"
    },
    "model.layers.10.mlp.down_proj": {
        "snr": 0.7704945206642151,
        "type": "mlp.down_proj"
    },
    "model.layers.11.mlp.down_proj": {
        "snr": 0.8871145844459534,
        "type": "mlp.down_proj"
    },
    "model.layers.12.mlp.down_proj": {
        "snr": 1.1408143043518066,
        "type": "mlp.down_proj"
    },
    "model.layers.13.mlp.down_proj": {
        "snr": 1.2769343852996826,
        "type": "mlp.down_proj"
    },
    "model.layers.14.mlp.down_proj": {
        "snr": 1.1703068017959595,
        "type": "mlp.down_proj"
    },
    "model.layers.15.mlp.down_proj": {
        "snr": 1.2794467210769653,
        "type": "mlp.down_proj"
    },
    "model.layers.16.mlp.down_proj": {
        "snr": 1.3154453039169312,
        "type": "mlp.down_proj"
    },
    "model.layers.17.mlp.down_proj": {
        "snr": 1.5596749782562256,
        "type": "mlp.down_proj"
    },
    "model.layers.18.mlp.down_proj": {
        "snr": 1.4949405193328857,
        "type": "mlp.down_proj"
    },
    "model.layers.19.mlp.down_proj": {
        "snr": 1.5329173803329468,
        "type": "mlp.down_proj"
    },
    "model.layers.20.mlp.down_proj": {
        "snr": 1.4396660327911377,
        "type": "mlp.down_proj"
    },
    "model.layers.21.mlp.down_proj": {
        "snr": 1.217085838317871,
        "type": "mlp.down_proj"
    },
    "model.layers.22.mlp.down_proj": {
        "snr": 1.150472640991211,
        "type": "mlp.down_proj"
    },
    "model.layers.23.mlp.down_proj": {
        "snr": 1.1166225671768188,
        "type": "mlp.down_proj"
    },
    "model.layers.24.mlp.down_proj": {
        "snr": 0.9966591000556946,
        "type": "mlp.down_proj"
    },
    "model.layers.25.mlp.down_proj": {
        "snr": 1.0938347578048706,
        "type": "mlp.down_proj"
    },
    "model.layers.26.mlp.down_proj": {
        "snr": 1.1505423784255981,
        "type": "mlp.down_proj"
    },
    "model.layers.27.mlp.down_proj": {
        "snr": 1.1156749725341797,
        "type": "mlp.down_proj"
    },
    "model.layers.0.mlp.gate_proj": {
        "snr": 0.9329171776771545,
        "type": "mlp.gate_proj"
    },
    "model.layers.1.mlp.gate_proj": {
        "snr": 0.8513413667678833,
        "type": "mlp.gate_proj"
    },
    "model.layers.2.mlp.gate_proj": {
        "snr": 0.7584061026573181,
        "type": "mlp.gate_proj"
    },
    "model.layers.3.mlp.gate_proj": {
        "snr": 0.65835040807724,
        "type": "mlp.gate_proj"
    },
    "model.layers.4.mlp.gate_proj": {
        "snr": 0.436420738697052,
        "type": "mlp.gate_proj"
    },
    "model.layers.5.mlp.gate_proj": {
        "snr": 0.39712461829185486,
        "type": "mlp.gate_proj"
    },
    "model.layers.6.mlp.gate_proj": {
        "snr": 0.3530206084251404,
        "type": "mlp.gate_proj"
    },
    "model.layers.7.mlp.gate_proj": {
        "snr": 0.34982794523239136,
        "type": "mlp.gate_proj"
    },
    "model.layers.8.mlp.gate_proj": {
        "snr": 0.30338960886001587,
        "type": "mlp.gate_proj"
    },
    "model.layers.9.mlp.gate_proj": {
        "snr": 0.27569833397865295,
        "type": "mlp.gate_proj"
    },
    "model.layers.10.mlp.gate_proj": {
        "snr": 0.28934162855148315,
        "type": "mlp.gate_proj"
    },
    "model.layers.11.mlp.gate_proj": {
        "snr": 0.2929173707962036,
        "type": "mlp.gate_proj"
    },
    "model.layers.12.mlp.gate_proj": {
        "snr": 0.28263387084007263,
        "type": "mlp.gate_proj"
    },
    "model.layers.13.mlp.gate_proj": {
        "snr": 0.27778616547584534,
        "type": "mlp.gate_proj"
    },
    "model.layers.14.mlp.gate_proj": {
        "snr": 0.26527827978134155,
        "type": "mlp.gate_proj"
    },
    "model.layers.15.mlp.gate_proj": {
        "snr": 0.27635642886161804,
        "type": "mlp.gate_proj"
    },
    "model.layers.16.mlp.gate_proj": {
        "snr": 0.35072311758995056,
        "type": "mlp.gate_proj"
    },
    "model.layers.17.mlp.gate_proj": {
        "snr": 0.4002636671066284,
        "type": "mlp.gate_proj"
    },
    "model.layers.18.mlp.gate_proj": {
        "snr": 0.4319891333580017,
        "type": "mlp.gate_proj"
    },
    "model.layers.19.mlp.gate_proj": {
        "snr": 0.47527065873146057,
        "type": "mlp.gate_proj"
    },
    "model.layers.20.mlp.gate_proj": {
        "snr": 0.5112077593803406,
        "type": "mlp.gate_proj"
    },
    "model.layers.21.mlp.gate_proj": {
        "snr": 0.5749644637107849,
        "type": "mlp.gate_proj"
    },
    "model.layers.22.mlp.gate_proj": {
        "snr": 0.5967603921890259,
        "type": "mlp.gate_proj"
    },
    "model.layers.23.mlp.gate_proj": {
        "snr": 0.48045310378074646,
        "type": "mlp.gate_proj"
    },
    "model.layers.24.mlp.gate_proj": {
        "snr": 0.3838970363140106,
        "type": "mlp.gate_proj"
    },
    "model.layers.25.mlp.gate_proj": {
        "snr": 0.3108249604701996,
        "type": "mlp.gate_proj"
    },
    "model.layers.26.mlp.gate_proj": {
        "snr": 0.26704445481300354,
        "type": "mlp.gate_proj"
    },
    "model.layers.27.mlp.gate_proj": {
        "snr": 0.20953254401683807,
        "type": "mlp.gate_proj"
    },
    "model.layers.0.mlp.up_proj": {
        "snr": 1.5084924697875977,
        "type": "mlp.up_proj"
    },
    "model.layers.1.mlp.up_proj": {
        "snr": 1.7789595127105713,
        "type": "mlp.up_proj"
    },
    "model.layers.2.mlp.up_proj": {
        "snr": 2.1431775093078613,
        "type": "mlp.up_proj"
    },
    "model.layers.3.mlp.up_proj": {
        "snr": 2.762744903564453,
        "type": "mlp.up_proj"
    },
    "model.layers.4.mlp.up_proj": {
        "snr": 3.0324745178222656,
        "type": "mlp.up_proj"
    },
    "model.layers.5.mlp.up_proj": {
        "snr": 2.3884809017181396,
        "type": "mlp.up_proj"
    },
    "model.layers.6.mlp.up_proj": {
        "snr": 2.388005256652832,
        "type": "mlp.up_proj"
    },
    "model.layers.7.mlp.up_proj": {
        "snr": 2.339340925216675,
        "type": "mlp.up_proj"
    },
    "model.layers.8.mlp.up_proj": {
        "snr": 2.0497021675109863,
        "type": "mlp.up_proj"
    },
    "model.layers.9.mlp.up_proj": {
        "snr": 1.822119116783142,
        "type": "mlp.up_proj"
    },
    "model.layers.10.mlp.up_proj": {
        "snr": 1.600373387336731,
        "type": "mlp.up_proj"
    },
    "model.layers.11.mlp.up_proj": {
        "snr": 1.9298171997070312,
        "type": "mlp.up_proj"
    },
    "model.layers.12.mlp.up_proj": {
        "snr": 1.728783369064331,
        "type": "mlp.up_proj"
    },
    "model.layers.13.mlp.up_proj": {
        "snr": 1.965298056602478,
        "type": "mlp.up_proj"
    },
    "model.layers.14.mlp.up_proj": {
        "snr": 2.023681640625,
        "type": "mlp.up_proj"
    },
    "model.layers.15.mlp.up_proj": {
        "snr": 1.7721818685531616,
        "type": "mlp.up_proj"
    },
    "model.layers.16.mlp.up_proj": {
        "snr": 1.7068361043930054,
        "type": "mlp.up_proj"
    },
    "model.layers.17.mlp.up_proj": {
        "snr": 1.6673219203948975,
        "type": "mlp.up_proj"
    },
    "model.layers.18.mlp.up_proj": {
        "snr": 1.6240718364715576,
        "type": "mlp.up_proj"
    },
    "model.layers.19.mlp.up_proj": {
        "snr": 1.5169662237167358,
        "type": "mlp.up_proj"
    },
    "model.layers.20.mlp.up_proj": {
        "snr": 1.4018198251724243,
        "type": "mlp.up_proj"
    },
    "model.layers.21.mlp.up_proj": {
        "snr": 1.4556466341018677,
        "type": "mlp.up_proj"
    },
    "model.layers.22.mlp.up_proj": {
        "snr": 1.4304454326629639,
        "type": "mlp.up_proj"
    },
    "model.layers.23.mlp.up_proj": {
        "snr": 1.2785290479660034,
        "type": "mlp.up_proj"
    },
    "model.layers.24.mlp.up_proj": {
        "snr": 1.023495078086853,
        "type": "mlp.up_proj"
    },
    "model.layers.25.mlp.up_proj": {
        "snr": 0.6992124915122986,
        "type": "mlp.up_proj"
    },
    "model.layers.26.mlp.up_proj": {
        "snr": 0.4549211859703064,
        "type": "mlp.up_proj"
    },
    "model.layers.27.mlp.up_proj": {
        "snr": 0.23889905214309692,
        "type": "mlp.up_proj"
    },
    "model.embed_tokens": {
        "snr": Infinity,
        "type": "model.embed_tokens"
    },
    "model.norm": {
        "snr": Infinity,
        "type": "model.norm"
    },
    "model.layers.0.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.1.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.2.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.3.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.4.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.5.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.6.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.7.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.8.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.9.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.10.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.11.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.12.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.13.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.14.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.15.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.16.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.17.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.18.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.19.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.20.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.21.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.22.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.23.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.24.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.25.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.26.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.27.post_attention_layernorm": {
        "snr": Infinity,
        "type": "post_attention_layernorm"
    },
    "model.layers.0.self_attn.k_proj": {
        "snr": 0.08150045573711395,
        "type": "self_attn.k_proj"
    },
    "model.layers.1.self_attn.k_proj": {
        "snr": 0.1428358554840088,
        "type": "self_attn.k_proj"
    },
    "model.layers.2.self_attn.k_proj": {
        "snr": 0.2096949815750122,
        "type": "self_attn.k_proj"
    },
    "model.layers.3.self_attn.k_proj": {
        "snr": 0.22633400559425354,
        "type": "self_attn.k_proj"
    },
    "model.layers.4.self_attn.k_proj": {
        "snr": 0.2293967455625534,
        "type": "self_attn.k_proj"
    },
    "model.layers.5.self_attn.k_proj": {
        "snr": 0.23336802423000336,
        "type": "self_attn.k_proj"
    },
    "model.layers.6.self_attn.k_proj": {
        "snr": 0.23429904878139496,
        "type": "self_attn.k_proj"
    },
    "model.layers.7.self_attn.k_proj": {
        "snr": 0.19610290229320526,
        "type": "self_attn.k_proj"
    },
    "model.layers.8.self_attn.k_proj": {
        "snr": 0.2163258045911789,
        "type": "self_attn.k_proj"
    },
    "model.layers.9.self_attn.k_proj": {
        "snr": 0.21039333939552307,
        "type": "self_attn.k_proj"
    },
    "model.layers.10.self_attn.k_proj": {
        "snr": 0.23533931374549866,
        "type": "self_attn.k_proj"
    },
    "model.layers.11.self_attn.k_proj": {
        "snr": 0.21457058191299438,
        "type": "self_attn.k_proj"
    },
    "model.layers.12.self_attn.k_proj": {
        "snr": 0.21686571836471558,
        "type": "self_attn.k_proj"
    },
    "model.layers.13.self_attn.k_proj": {
        "snr": 0.22398065030574799,
        "type": "self_attn.k_proj"
    },
    "model.layers.14.self_attn.k_proj": {
        "snr": 0.20160657167434692,
        "type": "self_attn.k_proj"
    },
    "model.layers.15.self_attn.k_proj": {
        "snr": 0.23705022037029266,
        "type": "self_attn.k_proj"
    },
    "model.layers.16.self_attn.k_proj": {
        "snr": 0.23254962265491486,
        "type": "self_attn.k_proj"
    },
    "model.layers.17.self_attn.k_proj": {
        "snr": 0.2892642617225647,
        "type": "self_attn.k_proj"
    },
    "model.layers.18.self_attn.k_proj": {
        "snr": 0.27587130665779114,
        "type": "self_attn.k_proj"
    },
    "model.layers.19.self_attn.k_proj": {
        "snr": 0.30891212821006775,
        "type": "self_attn.k_proj"
    },
    "model.layers.20.self_attn.k_proj": {
        "snr": 0.28997519612312317,
        "type": "self_attn.k_proj"
    },
    "model.layers.21.self_attn.k_proj": {
        "snr": 0.27534863352775574,
        "type": "self_attn.k_proj"
    },
    "model.layers.22.self_attn.k_proj": {
        "snr": 0.35139667987823486,
        "type": "self_attn.k_proj"
    },
    "model.layers.23.self_attn.k_proj": {
        "snr": 0.2773109972476959,
        "type": "self_attn.k_proj"
    },
    "model.layers.24.self_attn.k_proj": {
        "snr": 0.2853511571884155,
        "type": "self_attn.k_proj"
    },
    "model.layers.25.self_attn.k_proj": {
        "snr": 0.5030262470245361,
        "type": "self_attn.k_proj"
    },
    "model.layers.26.self_attn.k_proj": {
        "snr": 0.2317112237215042,
        "type": "self_attn.k_proj"
    },
    "model.layers.27.self_attn.k_proj": {
        "snr": 0.24419328570365906,
        "type": "self_attn.k_proj"
    },
    "model.layers.0.self_attn.o_proj": {
        "snr": 0.17767645418643951,
        "type": "self_attn.o_proj"
    },
    "model.layers.1.self_attn.o_proj": {
        "snr": 0.14102177321910858,
        "type": "self_attn.o_proj"
    },
    "model.layers.2.self_attn.o_proj": {
        "snr": 0.1523692011833191,
        "type": "self_attn.o_proj"
    },
    "model.layers.3.self_attn.o_proj": {
        "snr": 0.16522075235843658,
        "type": "self_attn.o_proj"
    },
    "model.layers.4.self_attn.o_proj": {
        "snr": 0.17483487725257874,
        "type": "self_attn.o_proj"
    },
    "model.layers.5.self_attn.o_proj": {
        "snr": 0.227921262383461,
        "type": "self_attn.o_proj"
    },
    "model.layers.6.self_attn.o_proj": {
        "snr": 0.2196175903081894,
        "type": "self_attn.o_proj"
    },
    "model.layers.7.self_attn.o_proj": {
        "snr": 0.24270132184028625,
        "type": "self_attn.o_proj"
    },
    "model.layers.8.self_attn.o_proj": {
        "snr": 0.2118290364742279,
        "type": "self_attn.o_proj"
    },
    "model.layers.9.self_attn.o_proj": {
        "snr": 0.20525991916656494,
        "type": "self_attn.o_proj"
    },
    "model.layers.10.self_attn.o_proj": {
        "snr": 0.22847208380699158,
        "type": "self_attn.o_proj"
    },
    "model.layers.11.self_attn.o_proj": {
        "snr": 0.19665324687957764,
        "type": "self_attn.o_proj"
    },
    "model.layers.12.self_attn.o_proj": {
        "snr": 0.23233532905578613,
        "type": "self_attn.o_proj"
    },
    "model.layers.13.self_attn.o_proj": {
        "snr": 0.2624332308769226,
        "type": "self_attn.o_proj"
    },
    "model.layers.14.self_attn.o_proj": {
        "snr": 0.1868327558040619,
        "type": "self_attn.o_proj"
    },
    "model.layers.15.self_attn.o_proj": {
        "snr": 0.17706255614757538,
        "type": "self_attn.o_proj"
    },
    "model.layers.16.self_attn.o_proj": {
        "snr": 0.19422705471515656,
        "type": "self_attn.o_proj"
    },
    "model.layers.17.self_attn.o_proj": {
        "snr": 0.2000615894794464,
        "type": "self_attn.o_proj"
    },
    "model.layers.18.self_attn.o_proj": {
        "snr": 0.1874573826789856,
        "type": "self_attn.o_proj"
    },
    "model.layers.19.self_attn.o_proj": {
        "snr": 0.21297843754291534,
        "type": "self_attn.o_proj"
    },
    "model.layers.20.self_attn.o_proj": {
        "snr": 0.2100859135389328,
        "type": "self_attn.o_proj"
    },
    "model.layers.21.self_attn.o_proj": {
        "snr": 0.22561520338058472,
        "type": "self_attn.o_proj"
    },
    "model.layers.22.self_attn.o_proj": {
        "snr": 0.20994484424591064,
        "type": "self_attn.o_proj"
    },
    "model.layers.23.self_attn.o_proj": {
        "snr": 0.18978221714496613,
        "type": "self_attn.o_proj"
    },
    "model.layers.24.self_attn.o_proj": {
        "snr": 0.1571759581565857,
        "type": "self_attn.o_proj"
    },
    "model.layers.25.self_attn.o_proj": {
        "snr": 0.1349896937608719,
        "type": "self_attn.o_proj"
    },
    "model.layers.26.self_attn.o_proj": {
        "snr": 0.1368866115808487,
        "type": "self_attn.o_proj"
    },
    "model.layers.27.self_attn.o_proj": {
        "snr": 0.1571887582540512,
        "type": "self_attn.o_proj"
    },
    "model.layers.0.self_attn.q_proj": {
        "snr": 0.05295897275209427,
        "type": "self_attn.q_proj"
    },
    "model.layers.1.self_attn.q_proj": {
        "snr": 0.06835605204105377,
        "type": "self_attn.q_proj"
    },
    "model.layers.2.self_attn.q_proj": {
        "snr": 0.0746372863650322,
        "type": "self_attn.q_proj"
    },
    "model.layers.3.self_attn.q_proj": {
        "snr": 0.06615085154771805,
        "type": "self_attn.q_proj"
    },
    "model.layers.4.self_attn.q_proj": {
        "snr": 0.06788161396980286,
        "type": "self_attn.q_proj"
    },
    "model.layers.5.self_attn.q_proj": {
        "snr": 0.07514483481645584,
        "type": "self_attn.q_proj"
    },
    "model.layers.6.self_attn.q_proj": {
        "snr": 0.07777862250804901,
        "type": "self_attn.q_proj"
    },
    "model.layers.7.self_attn.q_proj": {
        "snr": 0.07534090429544449,
        "type": "self_attn.q_proj"
    },
    "model.layers.8.self_attn.q_proj": {
        "snr": 0.09494179487228394,
        "type": "self_attn.q_proj"
    },
    "model.layers.9.self_attn.q_proj": {
        "snr": 0.09699037671089172,
        "type": "self_attn.q_proj"
    },
    "model.layers.10.self_attn.q_proj": {
        "snr": 0.09426294267177582,
        "type": "self_attn.q_proj"
    },
    "model.layers.11.self_attn.q_proj": {
        "snr": 0.08260341733694077,
        "type": "self_attn.q_proj"
    },
    "model.layers.12.self_attn.q_proj": {
        "snr": 0.10650420933961868,
        "type": "self_attn.q_proj"
    },
    "model.layers.13.self_attn.q_proj": {
        "snr": 0.10250870138406754,
        "type": "self_attn.q_proj"
    },
    "model.layers.14.self_attn.q_proj": {
        "snr": 0.08775162696838379,
        "type": "self_attn.q_proj"
    },
    "model.layers.15.self_attn.q_proj": {
        "snr": 0.08071447163820267,
        "type": "self_attn.q_proj"
    },
    "model.layers.16.self_attn.q_proj": {
        "snr": 0.07530857622623444,
        "type": "self_attn.q_proj"
    },
    "model.layers.17.self_attn.q_proj": {
        "snr": 0.06964966654777527,
        "type": "self_attn.q_proj"
    },
    "model.layers.18.self_attn.q_proj": {
        "snr": 0.07150755077600479,
        "type": "self_attn.q_proj"
    },
    "model.layers.19.self_attn.q_proj": {
        "snr": 0.0676807165145874,
        "type": "self_attn.q_proj"
    },
    "model.layers.20.self_attn.q_proj": {
        "snr": 0.06511317938566208,
        "type": "self_attn.q_proj"
    },
    "model.layers.21.self_attn.q_proj": {
        "snr": 0.06773187220096588,
        "type": "self_attn.q_proj"
    },
    "model.layers.22.self_attn.q_proj": {
        "snr": 0.06400436162948608,
        "type": "self_attn.q_proj"
    },
    "model.layers.23.self_attn.q_proj": {
        "snr": 0.0726117342710495,
        "type": "self_attn.q_proj"
    },
    "model.layers.24.self_attn.q_proj": {
        "snr": 0.06882446259260178,
        "type": "self_attn.q_proj"
    },
    "model.layers.25.self_attn.q_proj": {
        "snr": 0.07506493479013443,
        "type": "self_attn.q_proj"
    },
    "model.layers.26.self_attn.q_proj": {
        "snr": 0.07797915488481522,
        "type": "self_attn.q_proj"
    },
    "model.layers.27.self_attn.q_proj": {
        "snr": 0.06680692732334137,
        "type": "self_attn.q_proj"
    },
    "model.layers.0.self_attn.v_proj": {
        "snr": 1.326789379119873,
        "type": "self_attn.v_proj"
    },
    "model.layers.1.self_attn.v_proj": {
        "snr": 3.043806791305542,
        "type": "self_attn.v_proj"
    },
    "model.layers.2.self_attn.v_proj": {
        "snr": 2.295107841491699,
        "type": "self_attn.v_proj"
    },
    "model.layers.3.self_attn.v_proj": {
        "snr": 5.2584614753723145,
        "type": "self_attn.v_proj"
    },
    "model.layers.4.self_attn.v_proj": {
        "snr": 4.038785934448242,
        "type": "self_attn.v_proj"
    },
    "model.layers.5.self_attn.v_proj": {
        "snr": 3.0907773971557617,
        "type": "self_attn.v_proj"
    },
    "model.layers.6.self_attn.v_proj": {
        "snr": 3.114994525909424,
        "type": "self_attn.v_proj"
    },
    "model.layers.7.self_attn.v_proj": {
        "snr": 1.9747973680496216,
        "type": "self_attn.v_proj"
    },
    "model.layers.8.self_attn.v_proj": {
        "snr": 3.0469374656677246,
        "type": "self_attn.v_proj"
    },
    "model.layers.9.self_attn.v_proj": {
        "snr": 1.602966547012329,
        "type": "self_attn.v_proj"
    },
    "model.layers.10.self_attn.v_proj": {
        "snr": 1.489019513130188,
        "type": "self_attn.v_proj"
    },
    "model.layers.11.self_attn.v_proj": {
        "snr": 1.7490826845169067,
        "type": "self_attn.v_proj"
    },
    "model.layers.12.self_attn.v_proj": {
        "snr": 2.451310396194458,
        "type": "self_attn.v_proj"
    },
    "model.layers.13.self_attn.v_proj": {
        "snr": 3.250821590423584,
        "type": "self_attn.v_proj"
    },
    "model.layers.14.self_attn.v_proj": {
        "snr": 7.944663047790527,
        "type": "self_attn.v_proj"
    },
    "model.layers.15.self_attn.v_proj": {
        "snr": 7.013208389282227,
        "type": "self_attn.v_proj"
    },
    "model.layers.16.self_attn.v_proj": {
        "snr": 2.68644118309021,
        "type": "self_attn.v_proj"
    },
    "model.layers.17.self_attn.v_proj": {
        "snr": 3.9063122272491455,
        "type": "self_attn.v_proj"
    },
    "model.layers.18.self_attn.v_proj": {
        "snr": 4.1816816329956055,
        "type": "self_attn.v_proj"
    },
    "model.layers.19.self_attn.v_proj": {
        "snr": 6.794488906860352,
        "type": "self_attn.v_proj"
    },
    "model.layers.20.self_attn.v_proj": {
        "snr": 3.401334285736084,
        "type": "self_attn.v_proj"
    },
    "model.layers.21.self_attn.v_proj": {
        "snr": 2.051994562149048,
        "type": "self_attn.v_proj"
    },
    "model.layers.22.self_attn.v_proj": {
        "snr": 3.614379405975342,
        "type": "self_attn.v_proj"
    },
    "model.layers.23.self_attn.v_proj": {
        "snr": 11.180968284606934,
        "type": "self_attn.v_proj"
    },
    "model.layers.24.self_attn.v_proj": {
        "snr": 2.3629775047302246,
        "type": "self_attn.v_proj"
    },
    "model.layers.25.self_attn.v_proj": {
        "snr": 4.137593746185303,
        "type": "self_attn.v_proj"
    },
    "model.layers.26.self_attn.v_proj": {
        "snr": 2.3465518951416016,
        "type": "self_attn.v_proj"
    },
    "model.layers.27.self_attn.v_proj": {
        "snr": 3.10064697265625,
        "type": "self_attn.v_proj"
    }
}


================================================
FILE: src/axolotl/integrations/swanlab/README.md
================================================
# SwanLab Integration for Axolotl

SwanLab is an open-source, lightweight AI experiment tracking and visualization tool that provides a platform for tracking, recording, comparing, and collaborating on experiments.

This integration enables seamless experiment tracking and visualization of Axolotl training runs using SwanLab.

## Features

- 📊 **Automatic Metrics Logging**: Training loss, learning rate, and other metrics are automatically logged
- 🎯 **Hyperparameter Tracking**: Model configuration and training parameters are tracked
- 📈 **Real-time Visualization**: Monitor training progress in real-time through SwanLab dashboard
- ☁️ **Cloud & Local Support**: Works in both cloud-synced and offline modes
- 🔄 **Experiment Comparison**: Compare multiple training runs easily
- 🤝 **Team Collaboration**: Share experiments with team members
- 🎭 **RLHF Completion Logging**: Automatically log model outputs during DPO/KTO/ORPO/GRPO training for qualitative analysis
- ⚡ **Performance Profiling**: Built-in profiling decorators to measure and optimize training performance
- 🔔 **Lark Notifications**: Send real-time training updates to team chat (Feishu/Lark integration)

## Installation

```bash
pip install swanlab
```

## Quick Start

### 1. Register for SwanLab (Optional for cloud mode)

If you want to use cloud sync features, register at [https://swanlab.cn](https://swanlab.cn) to get your API key.

### 2. Configure Axolotl Config File

Add SwanLab configuration to your Axolotl YAML config:

```yaml
# Enable SwanLab plugin
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

# SwanLab configuration
use_swanlab: true
swanlab_project: my-llm-project
swanlab_experiment_name: qwen-finetune-v1
swanlab_mode: cloud  # Options: cloud, local, offline, disabled
swanlab_workspace: my-team  # Optional: organization name
swanlab_api_key: YOUR_API_KEY  # Optional: can also use env var SWANLAB_API_KEY
```

### 3. Run Training

```bash
# Set API key via environment variable (recommended)
export SWANLAB_API_KEY=your-api-key-here

# Or login once
swanlab login

# Run training as usual
accelerate launch -m axolotl.cli.train your-config.yaml
```

## Configuration Options

### Basic Configuration

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `use_swanlab` | bool | `false` | Enable SwanLab tracking |
| `swanlab_project` | str | `None` | Project name (required) |
| `swanlab_experiment_name` | str | `None` | Experiment name |
| `swanlab_description` | str | `None` | Experiment description |
| `swanlab_mode` | str | `cloud` | Sync mode: `cloud`, `local`, `offline`, `disabled` |

### Advanced Configuration

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `swanlab_workspace` | str | `None` | Workspace/organization name |
| `swanlab_api_key` | str | `None` | API key (prefer env var) |
| `swanlab_web_host` | str | `None` | Private deployment web host |
| `swanlab_api_host` | str | `None` | Private deployment API host |
| `swanlab_log_model` | bool | `false` | Log model checkpoints (coming soon) |
| `swanlab_lark_webhook_url` | str | `None` | Lark (Feishu) webhook URL for team notifications |
| `swanlab_lark_secret` | str | `None` | Lark webhook HMAC secret for authentication |
| `swanlab_log_completions` | bool | `true` | Enable RLHF completion table logging (DPO/KTO/ORPO/GRPO) |
| `swanlab_completion_log_interval` | int | `100` | Steps between completion logging |
| `swanlab_completion_max_buffer` | int | `128` | Max completions to buffer (memory bound) |

## Configuration Examples

### Example 1: Basic Cloud Sync

```yaml
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

use_swanlab: true
swanlab_project: llama-finetune
swanlab_experiment_name: llama-3-8b-instruct-v1
swanlab_mode: cloud
```

### Example 2: Offline/Local Mode

```yaml
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

use_swanlab: true
swanlab_project: local-experiments
swanlab_experiment_name: test-run-1
swanlab_mode: local  # or 'offline'
```

### Example 3: Team Workspace

```yaml
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

use_swanlab: true
swanlab_project: research-project
swanlab_experiment_name: experiment-42
swanlab_workspace: my-research-team
swanlab_mode: cloud
```

### Example 4: Private Deployment

```yaml
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

use_swanlab: true
swanlab_project: internal-project
swanlab_experiment_name: secure-training
swanlab_mode: cloud
swanlab_web_host: https://swanlab.yourcompany.com
swanlab_api_host: https://api.swanlab.yourcompany.com
```

## Team Notifications with Lark (Feishu)

SwanLab supports sending real-time training notifications to your team chat via Lark (Feishu), ByteDance's enterprise collaboration platform. This is especially useful for:
- **Production training monitoring**: Get alerts when training starts, completes, or encounters errors
- **Team collaboration**: Keep your ML team informed about long-running experiments
- **Multi-timezone teams**: Team members can check training progress without being online

### Prerequisites

1. **Lark Bot Setup**: Create a custom bot in your Lark group chat
2. **Webhook URL**: Get the webhook URL from your Lark bot settings
3. **HMAC Secret** (recommended): Enable signature verification in your Lark bot for security

For detailed Lark bot setup instructions, see [Lark Custom Bot Documentation](https://open.feishu.cn/document/ukTMukTMukTM/ucTM5YjL3ETO24yNxkjN).

### Example 5: Basic Lark Notifications

Send training notifications to a Lark group chat:

```yaml
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

use_swanlab: true
swanlab_project: production-training
swanlab_experiment_name: llama-3-finetune-v2
swanlab_mode: cloud

# Lark notification (basic, no HMAC verification)
swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx
```

**Note**: This configuration will work, but you'll see a security warning recommending HMAC secret configuration.

### Example 6: Lark Notifications with HMAC Security (Recommended)

For production use, enable HMAC signature verification:

```yaml
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

use_swanlab: true
swanlab_project: production-training
swanlab_experiment_name: llama-3-finetune-v2
swanlab_mode: cloud

# Lark notification with HMAC authentication
swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx
swanlab_lark_secret: your-webhook-secret-key
```

**Why HMAC secret matters**:
- Prevents unauthorized parties from sending fake notifications to your Lark group
- Ensures notifications genuinely come from your training jobs
- Required for production deployments with sensitive training data

### Example 7: Team Workspace + Lark Notifications

Combine team workspace collaboration with Lark notifications:

```yaml
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

use_swanlab: true
swanlab_project: research-project
swanlab_experiment_name: multimodal-experiment-42
swanlab_workspace: ml-research-team
swanlab_mode: cloud

# Notify team via Lark when training starts/completes
swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxx
swanlab_lark_secret: your-webhook-secret-key
```

### What Notifications Are Sent?

SwanLab's Lark integration sends notifications for key training events:
- **Training Start**: When your experiment begins
- **Training Complete**: When training finishes successfully
- **Training Errors**: If training crashes or encounters critical errors
- **Metric Milestones**: Configurable alerts for metric thresholds (if configured in SwanLab)

Each notification includes:
- Experiment name and project
- Training status
- Key metrics (loss, learning rate)
- Direct link to SwanLab dashboard

### Lark Configuration Validation

The plugin validates your Lark configuration at startup:

#### ✅ Valid Configurations

```yaml
# Option 1: No Lark (default)
use_swanlab: true
swanlab_project: my-project
# No swanlab_lark_webhook_url → Lark disabled, no warnings

# Option 2: Lark with HMAC secret (recommended)
use_swanlab: true
swanlab_project: my-project
swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxx
swanlab_lark_secret: your-secret
# ✅ Logs: "Registered Lark notification callback with HMAC authentication"

# Option 3: Lark without secret (works but not recommended)
use_swanlab: true
swanlab_project: my-project
swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxx
# ⚠️ Logs: "Registered Lark notification callback (no HMAC secret)"
# ⚠️ Warning: "Lark webhook has no secret configured. For production use, set 'swanlab_lark_secret'..."
```

### Security Best Practices

1. **Always use HMAC secret in production**:
   ```yaml
   swanlab_lark_webhook_url: https://open.feishu.cn/...
   swanlab_lark_secret: your-secret-key  # ✅ Add this!
   ```

2. **Store secrets in environment variables** (even better):
   ```yaml
   # In your training script/environment
   export SWANLAB_LARK_WEBHOOK_URL="https://open.feishu.cn/..."
   export SWANLAB_LARK_SECRET="your-secret-key"
   ```

   Then in config:
   ```yaml
   # SwanLab plugin will auto-detect environment variables
   use_swanlab: true
   swanlab_project: my-project
   # Lark URL and secret read from env vars
   ```

3. **Rotate webhook secrets periodically**: Update your Lark bot's secret every 90 days

4. **Use separate webhooks for dev/prod**: Don't mix development and production notifications

### Distributed Training

Lark notifications are automatically deduplicated in distributed training:
- Only **rank 0** sends notifications
- Other GPU ranks skip Lark registration
- Prevents duplicate messages in multi-GPU training

```bash
# Running on 4 GPUs
torchrun --nproc_per_node=4 -m axolotl.cli.train config.yml

# Expected logs:
# [Rank 0] Registered Lark notification callback with HMAC authentication
# [Rank 1-3] (no Lark registration messages)
```

## RLHF Completion Table Logging

For RLHF (Reinforcement Learning from Human Feedback) training methods like DPO, KTO, ORPO, and GRPO, SwanLab can log model completions (prompts, chosen/rejected responses, rewards) to a visual table for qualitative analysis. This helps you:

- **Inspect model behavior**: See actual model outputs during training
- **Debug preference learning**: Compare chosen vs rejected responses
- **Track reward patterns**: Monitor how rewards evolve over training
- **Share examples with team**: Visual tables in SwanLab dashboard

### Features

- ✅ **Automatic detection**: Works with DPO, KTO, ORPO, GRPO trainers
- ✅ **Memory-safe buffering**: Bounded buffer prevents memory leaks in long training runs
- ✅ **Periodic logging**: Configurable logging interval to reduce overhead
- ✅ **Rich visualization**: SwanLab tables show prompts, responses, and metrics side-by-side

### Configuration

| Parameter | Type | Default | Description |
|-----------|------|---------|-------------|
| `swanlab_log_completions` | bool | `true` | Enable completion logging for RLHF trainers |
| `swanlab_completion_log_interval` | int | `100` | Log completions to SwanLab every N training steps |
| `swanlab_completion_max_buffer` | int | `128` | Maximum completions to buffer (memory bound) |

### Example: DPO Training with Completion Logging

```yaml
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

use_swanlab: true
swanlab_project: dpo-training
swanlab_experiment_name: llama-3-dpo-v1
swanlab_mode: cloud

# RLHF completion logging (enabled by default)
swanlab_log_completions: true
swanlab_completion_log_interval: 100  # Log every 100 steps
swanlab_completion_max_buffer: 128    # Keep last 128 completions

# DPO-specific config
rl: dpo
datasets:
  - path: /path/to/preference_dataset
    type: chatml.intel
```

### Example: Disable Completion Logging

If you're doing a quick test run or don't need completion tables:

```yaml
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

use_swanlab: true
swanlab_project: dpo-training

# Disable completion logging
swanlab_log_completions: false
```

### Supported RLHF Trainers

The completion logging callback automatically activates for these trainer types:

- **DPO (Direct Preference Optimization)**: Logs prompts, chosen, rejected, reward_diff
- **KTO (Kahneman-Tversky Optimization)**: Logs prompts, completions, labels, rewards
- **ORPO (Odds Ratio Preference Optimization)**: Logs prompts, chosen, rejected, log_odds_ratio
- **GRPO (Group Relative Policy Optimization)**: Logs prompts, completions, rewards, advantages
- **CPO (Constrained Policy Optimization)**: Logs prompts, chosen, rejected

For non-RLHF trainers (standard supervised fine-tuning), the completion callback is automatically skipped.

### How It Works

1. **Auto-detection**: Plugin detects trainer type at initialization
2. **Buffering**: Completions are buffered in memory (up to `swanlab_completion_max_buffer`)
3. **Periodic logging**: Every `swanlab_completion_log_interval` steps, buffer is logged to SwanLab
4. **Memory safety**: Old completions are automatically dropped when buffer is full (uses `collections.deque`)
5. **Final flush**: Remaining completions are logged when training completes

### Viewing Completion Tables

After training starts, you can view completion tables in your SwanLab dashboard:

1. Navigate to your experiment in SwanLab
2. Look for the "rlhf_completions" table in the metrics panel
3. The table shows:
   - **step**: Training step when completion was generated
   - **prompt**: Input prompt
   - **chosen**: Preferred response (DPO/ORPO)
   - **rejected**: Non-preferred response (DPO/ORPO)
   - **completion**: Model output (KTO/GRPO)
   - **reward_diff/reward**: Reward metrics
   - Trainer-specific metrics (e.g., log_odds_ratio for ORPO)

### Memory Management

The completion buffer is **memory-bounded** to prevent memory leaks:

```python
# Internal implementation uses deque with maxlen
from collections import deque

buffer = deque(maxlen=128)  # Old completions automatically dropped
```

**Memory usage estimate**:
- Average completion: ~500 characters (prompt + responses)
- Buffer size 128: ~64 KB (negligible)
- Buffer size 1024: ~512 KB (still small)

**Recommendation**: Default buffer size (128) works well for most cases. Increase to 512-1024 only if you need to review more historical completions.

### Performance Impact

Completion logging has minimal overhead:

- **Buffering**: O(1) append operation, negligible CPU/memory
- **Logging**: Only happens every N steps (default: 100)
- **Network**: SwanLab batches table uploads efficiently

**Expected overhead**: < 0.5% per training step

### Troubleshooting

#### Completions not appearing in SwanLab

**Cause**: Trainer may not be logging completion data in the expected format.

**Diagnostic steps**:
1. Check trainer type detection in logs:
   ```text
   INFO: SwanLab RLHF completion logging enabled for DPOTrainer (type: dpo)
   ```
2. Verify your trainer is an RLHF trainer (DPO/KTO/ORPO/GRPO)
3. Check if trainer logs completion data (this depends on TRL version)

**Note**: The current implementation expects trainers to log completion data in the `logs` dict during `on_log()` callback. Some TRL trainers may not expose this data by default. You may need to patch the trainer to expose completions.

#### Buffer fills up too quickly

**Cause**: High logging frequency with small buffer size.

**Solution**: Increase buffer size or logging interval:
```yaml
swanlab_completion_log_interval: 200  # Log less frequently
swanlab_completion_max_buffer: 512    # Larger buffer
```

#### Memory usage growing over time

**Cause**: Buffer should be bounded, so this indicates a bug.

**Solution**:
1. Verify `swanlab_completion_max_buffer` is set
2. Check SwanLab version is up to date
3. Report issue with memory profiling data

## Performance Profiling

SwanLab integration includes profiling utilities to measure and log execution time of trainer methods. This helps you:

- **Identify bottlenecks**: Find slow operations in your training loop
- **Optimize performance**: Track improvements after optimization changes
- **Monitor distributed training**: See per-rank timing differences
- **Debug hangs**: Detect methods that take unexpectedly long

### Features

- ✅ **Zero-config profiling**: Automatic timing of key trainer methods
- ✅ **Decorator-based**: Easy to add profiling to custom methods with `@swanlab_profile`
- ✅ **Context manager**: Fine-grained profiling with `swanlab_profiling_context()`
- ✅ **Advanced filtering**: `ProfilingConfig` for throttling and minimum duration thresholds
- ✅ **Exception-safe**: Logs duration even if function raises an exception

### Basic Usage: Decorator

Add profiling to any trainer method with the `@swanlab_profile` decorator:

```python
from axolotl.integrations.swanlab.profiling import swanlab_profile

class MyCustomTrainer(AxolotlTrainer):
    @swanlab_profile
    def training_step(self, model, inputs):
        # Your training step logic
        return super().training_step(model, inputs)

    @swanlab_profile
    def prediction_step(self, model, inputs, prediction_loss_only):
        # Your prediction logic
        return super().prediction_step(model, inputs, prediction_loss_only)
```

The decorator automatically:
1. Measures execution time with high-precision timer
2. Logs to SwanLab as `profiling/Time taken: ClassName.method_name`
3. Only logs if SwanLab is enabled (`use_swanlab: true`)
4. Gracefully handles exceptions (logs duration, then re-raises)

### Advanced Usage: Context Manager

For fine-grained profiling within a method:

```python
from axolotl.integrations.swanlab.profiling import swanlab_profiling_context

class MyTrainer(AxolotlTrainer):
    def complex_training_step(self, model, inputs):
        # Profile just the forward pass
        with swanlab_profiling_context(self, "forward_pass"):
            outputs = model(**inputs)

        # Profile just the backward pass
        with swanlab_profiling_context(self, "backward_pass"):
            loss = outputs.loss
            loss.backward()

        return outputs
```

### Advanced Usage: ProfilingConfig

Filter and throttle profiling logs with `ProfilingConfig`:

```python
from axolotl.integrations.swanlab.profiling import (
    swanlab_profiling_context_advanced,
    ProfilingConfig,
)

# Create custom profiling config
profiling_config = ProfilingConfig(
    enabled=True,
    min_duration_ms=1.0,    # Only log if duration > 1ms
    log_interval=10,        # Log every 10th call
)

class MyTrainer(AxolotlTrainer):
    def frequently_called_method(self, data):
        with swanlab_profiling_context_advanced(
            self,
            "frequent_op",
            config=profiling_config
        ):
            # This only logs every 10th call, and only if it takes > 1ms
            result = expensive_computation(data)
        return result
```

**ProfilingConfig Parameters**:
- `enabled`: Enable/disable profiling globally (default: `True`)
- `min_duration_ms`: Minimum duration to log in milliseconds (default: `0.1`)
- `log_interval`: Log every Nth function call (default: `1` = log all)

**Use cases**:
- **High-frequency methods**: Use `log_interval=100` to reduce logging overhead
- **Filter noise**: Use `min_duration_ms=1.0` to skip very fast operations
- **Debugging**: Use `log_interval=1, min_duration_ms=0.0` to log everything

### Viewing Profiling Metrics

In your SwanLab dashboard, profiling metrics appear under the "profiling" namespace:

```text
profiling/Time taken: AxolotlTrainer.training_step
profiling/Time taken: AxolotlTrainer.prediction_step
profiling/Time taken: MyTrainer.forward_pass
profiling/Time taken: MyTrainer.backward_pass
```

You can:
- **Track over time**: See if methods get faster/slower during training
- **Compare runs**: Compare profiling metrics across experiments
- **Identify regressions**: Detect if a code change slowed down training

### Configuration in Axolotl Config

Profiling is automatically enabled when SwanLab is enabled. No additional config needed:

```yaml
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

use_swanlab: true
swanlab_project: my-project

# Profiling is automatically enabled
# Add @swanlab_profile decorators to your custom trainer methods
```

To disable profiling while keeping SwanLab enabled:

```python
# In your custom trainer code
from axolotl.integrations.swanlab.profiling import DEFAULT_PROFILING_CONFIG

# Disable profiling globally
DEFAULT_PROFILING_CONFIG.enabled = False
```

### Performance Impact

- **Decorator overhead**: ~2-5 microseconds per call (negligible)
- **Context manager overhead**: ~1-3 microseconds (negligible)
- **Logging overhead**: Only when SwanLab is enabled and method duration exceeds threshold
- **Network overhead**: SwanLab batches metrics efficiently

**Expected overhead**: < 0.1% per training step (effectively zero)

### Best Practices

1. **Profile bottlenecks first**: Start by profiling suspected slow operations
2. **Use min_duration_ms**: Filter out fast operations (< 1ms) to reduce noise
3. **Throttle high-frequency calls**: Use `log_interval` for methods called > 100 times/step
4. **Profile across runs**: Compare profiling metrics before/after optimization
5. **Monitor distributed training**: Check for rank-specific slowdowns

### Example: Complete Profiling Setup

```python
from axolotl.integrations.swanlab.profiling import (
    swanlab_profile,
    swanlab_profiling_context,
    ProfilingConfig,
)

class OptimizedTrainer(AxolotlTrainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        # Custom profiling config for high-frequency operations
        self.fast_op_config = ProfilingConfig(
            enabled=True,
            min_duration_ms=0.5,
            log_interval=50,
        )

    @swanlab_profile
    def training_step(self, model, inputs):
        """Main training step - always profile."""
        return super().training_step(model, inputs)

    @swanlab_profile
    def compute_loss(self, model, inputs, return_outputs=False):
        """Loss computation - always profile."""
        return super().compute_loss(model, inputs, return_outputs)

    def _prepare_inputs(self, inputs):
        """High-frequency operation - throttled profiling."""
        with swanlab_profiling_context_advanced(
            self,
            "prepare_inputs",
            config=self.fast_op_config,
        ):
            return super()._prepare_inputs(inputs)
```

### Troubleshooting

#### Profiling metrics not appearing in SwanLab

**Cause**: SwanLab is not enabled or not initialized.

**Solution**:
```yaml
# Ensure SwanLab is enabled
use_swanlab: true
swanlab_project: my-project
```

Check logs for:
```text
INFO: SwanLab initialized for project: my-project
```

#### Too many profiling metrics cluttering dashboard

**Cause**: Profiling every function call for high-frequency operations.

**Solution**: Use `ProfilingConfig` with throttling:
```python
config = ProfilingConfig(
    min_duration_ms=1.0,    # Skip fast ops
    log_interval=100,       # Log every 100th call
)
```

#### Profiling overhead impacting training speed

**Cause**: Profiling itself should have negligible overhead (< 0.1%). If you see > 1% slowdown, this indicates a bug.

**Solution**:
1. Disable profiling temporarily to confirm:
   ```python
   DEFAULT_PROFILING_CONFIG.enabled = False
   ```
2. Report issue with profiling data and trainer details

#### Profiling shows inconsistent timing

**Cause**: Normal variation due to GPU warmup, data loading, or system load.

**Solution**:
- Ignore first few steps (warmup period)
- Look at average/median timing over many steps
- Use `log_interval` to reduce noise from individual outliers

## Complete Config Example

Here's a complete example integrating SwanLab with your RVQ-Alpha training:

```yaml
base_model: /path/to/your/model
model_type: Qwen2ForCausalLM

# SwanLab Integration
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin
  - axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin

use_swanlab: true
swanlab_project: RVQ-Alpha-Training
swanlab_experiment_name: Qwen2.5-7B-MetaQA-Perturb-P020
swanlab_description: "Training on MetaQA and Perturbation datasets with NEW-RVQ encoding"
swanlab_mode: cloud
swanlab_workspace: single-cell-genomics

# Training configuration
sequence_len: 32768
micro_batch_size: 1
gradient_accumulation_steps: 1
num_epochs: 2
learning_rate: 2e-5
optimizer: adamw_torch_fused

# Datasets
datasets:
  - path: /path/to/dataset
    type: chat_template

# Output
output_dir: ./outputs
```

## Modes Explained

### `cloud` Mode (Default)
- Syncs experiments to SwanLab cloud in real-time
- Requires API key and internet connection
- Best for: Team collaboration, remote monitoring

### `local` Mode
- Saves experiments locally only
- No cloud sync
- Best for: Local development, air-gapped environments

### `offline` Mode
- Saves metadata locally
- Can sync to cloud later using `swanlab sync`
- Best for: Unstable internet, sync later

### `disabled` Mode
- Turns off SwanLab completely
- No logging or tracking
- Best for: Debugging, testing

## Configuration Validation & Conflict Detection

SwanLab integration includes comprehensive validation and conflict detection to help you catch configuration errors early and avoid performance issues.

### Required Fields Validation

The plugin validates your configuration at startup and provides clear error messages with solutions:

#### Missing Project Name

```yaml
# ❌ INVALID: use_swanlab enabled but no project
use_swanlab: true
# Error: SwanLab enabled but 'swanlab_project' is not set.
```

**Solution**:
```yaml
# ✅ VALID: Provide project name
use_swanlab: true
swanlab_project: my-project
```

#### Invalid Mode

```yaml
# ❌ INVALID: Unknown mode
use_swanlab: true
swanlab_project: my-project
swanlab_mode: invalid-mode
# Error: Invalid swanlab_mode: 'invalid-mode'. Valid options: cloud, local, offline, disabled
```

**Solution**:
```yaml
# ✅ VALID: Use one of the valid modes
use_swanlab: true
swanlab_project: my-project
swanlab_mode: cloud  # or: local, offline, disabled
```

#### Empty Project Name

```yaml
# ❌ INVALID: Empty string project name
use_swanlab: true
swanlab_project: ""
# Error: swanlab_project cannot be an empty string.
```

**Solution**:
```yaml
# ✅ VALID: Provide non-empty project name
use_swanlab: true
swanlab_project: my-project
```

### Cloud Mode API Key Warning

When using `cloud` mode without an API key, you'll receive a warning with multiple solutions:

```yaml
use_swanlab: true
swanlab_project: my-project
swanlab_mode: cloud
# No API key set
# Warning: SwanLab cloud mode enabled but no API key found.
```

**Solutions**:
1. Set environment variable: `export SWANLAB_API_KEY=your-api-key`
2. Add to config (less secure): `swanlab_api_key: your-api-key`
3. Run `swanlab login` before training
4. Use `swanlab_mode: local` for offline tracking

### Multi-Logger Performance Warnings

Using multiple logging tools simultaneously (SwanLab + WandB + MLflow + Comet) can impact training performance:

#### Two Loggers - Warning

```yaml
use_swanlab: true
swanlab_project: my-project

use_wandb: true
wandb_project: my-project

# Warning: Multiple logging tools enabled: SwanLab, WandB
# Expected overhead: ~3.0% per training step.
```

**Impact**:
- Performance overhead: ~1-2% per logger (cumulative)
- Increased memory usage
- Longer training time per step
- Potential config/callback conflicts

**Recommendations**:
- Choose ONE primary logging tool for production training
- Use multiple loggers only for:
  - Migration period (transitioning between tools)
  - Short comparison runs
  - Debugging specific tool issues
- Monitor system resources (CPU, memory) during training

#### Three+ Loggers - Error-Level Warning

```yaml
use_swanlab: true
swanlab_project: my-project

use_wandb: true
wandb_project: my-project

use_mlflow: true
mlflow_tracking_uri: http://localhost:5000

# ERROR: 3 logging tools enabled simultaneously!
# Expected overhead: ~4.5% per training step.
# STRONGLY RECOMMEND: Disable all but ONE logging tool
```

**Why This Matters**:
- With 3 loggers: ~4-5% overhead per step → significant slowdown over long training
- Example: 10,000 steps at 2s/step → ~400-500 seconds extra (6-8 minutes)
- Memory overhead scales with number of loggers
- Rare edge cases with callback ordering conflicts

### Auto-Enable Logic

For convenience, SwanLab will auto-enable if you specify a project without setting `use_swanlab`:

```yaml
# This configuration:
swanlab_project: my-project

# Automatically becomes:
use_swanlab: true
swanlab_project: my-project
```

### Distributed Training Detection

In distributed training scenarios (multi-GPU), the plugin automatically detects and reports:

```yaml
use_swanlab: true
swanlab_project: my-project
swanlab_mode: cloud

# When running with torchrun --nproc_per_node=4:
# Info: Distributed training detected (world_size=4)
# Info: SwanLab mode: cloud
# Info: Only rank 0 will initialize SwanLab
# Info: Other ranks will skip SwanLab to avoid conflicts
```

**Why Only Rank 0**:
- Avoids duplicate experiment runs
- Reduces network/cloud API overhead on worker ranks
- Prevents race conditions in metric logging

## Authentication

### Method 1: Environment Variable (Recommended)
```bash
export SWANLAB_API_KEY=your-api-key-here
```

### Method 2: Login Command
```bash
swanlab login
# Enter your API key when prompted
```

### Method 3: Config File
```yaml
swanlab_api_key: your-api-key-here
```

## What Gets Logged?

### Automatically Logged Metrics
- Training loss
- Learning rate
- Gradient norm
- Training steps
- Epoch progress

### Automatically Logged Config
- Model configuration (base_model, model_type)
- Training hyperparameters (learning_rate, batch_size, etc.)
- Optimizer settings
- Parallelization settings (FSDP, DeepSpeed, Context Parallel)
- Axolotl configuration file
- DeepSpeed configuration (if used)

## Viewing Your Experiments

### Cloud Mode
Visit [https://swanlab.cn](https://swanlab.cn) and navigate to your project to view:
- Real-time training metrics
- Hyperparameter comparison
- System resource usage
- Configuration files

### Local Mode
```bash
# Start local dashboard
swanlab watch ./swanlog

# Open browser to http://localhost:5092
```

## Integration with Existing Tools

SwanLab can work alongside other tracking tools:

```yaml
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin

# Use both SwanLab and Wandb
use_swanlab: true
swanlab_project: my-project

use_wandb: true
wandb_project: my-project
```

## Troubleshooting

### Configuration Errors

#### Error: "SwanLab enabled but 'swanlab_project' is not set"

**Cause**: You enabled SwanLab (`use_swanlab: true`) but forgot to specify a project name.

**Solution**:
```yaml
use_swanlab: true
swanlab_project: my-project  # Add this line
```

#### Error: "Invalid swanlab_mode: 'xxx'"

**Cause**: You provided an invalid mode value.

**Solution**: Use one of the valid modes:
```yaml
swanlab_mode: cloud     # or: local, offline, disabled
```

#### Error: "swanlab_project cannot be an empty string"

**Cause**: You set `swanlab_project: ""` (empty string).

**Solution**: Either provide a valid name or remove the field:
```yaml
# Option 1: Provide valid name
swanlab_project: my-project

# Option 2: Remove the field entirely
# swanlab_project: ""  <- Remove this line
```

### Import Errors

#### Error: "SwanLab is not installed"

**Cause**: SwanLab package is not installed in your environment.

**Solution**:
```bash
pip install swanlab
# or
pip install swanlab>=0.3.0
```

### Performance Issues

#### Warning: "Multiple logging tools enabled"

**Cause**: You have multiple experiment tracking tools enabled (e.g., SwanLab + WandB + MLflow).

**Impact**: ~1-2% performance overhead per logger, cumulative.

**Solution**: For production training, disable all but one logger:
```yaml
# Option 1: Keep only SwanLab
use_swanlab: true
swanlab_project: my-project
use_wandb: false      # Disable others
use_mlflow: false

# Option 2: Keep only WandB
use_swanlab: false
use_wandb: true
wandb_project: my-project
```

**Exception**: Multiple loggers are acceptable for:
- Short comparison runs (< 100 steps)
- Migration testing between logging tools
- Debugging logger-specific issues

### Distributed Training Issues

#### SwanLab creates duplicate runs in multi-GPU training

**Cause**: All ranks are initializing SwanLab instead of just rank 0.

**Expected Behavior**: The plugin automatically ensures only rank 0 initializes SwanLab. You should see:
```text
Info: Distributed training detected (world_size=4)
Info: Only rank 0 will initialize SwanLab
Info: Other ranks will skip SwanLab to avoid conflicts
```

**If you see duplicates**:
1. Check your plugin is loaded correctly
2. Verify you're using the latest SwanLab integration code
3. Check logs for initialization messages on all ranks

### SwanLab not logging metrics

**Solution**: Ensure SwanLab is initialized before training starts. The plugin automatically handles this in `pre_model_load`.

### API Key errors

**Solution**:
```bash
# Verify API key
echo $SWANLAB_API_KEY

# Re-login
swanlab login
```

### Cloud sync issues

**Solution**: Use `offline` mode and sync later:
```yaml
swanlab_mode: offline
```

Then sync when ready:
```bash
swanlab sync ./swanlog
```

### Plugin not loaded

**Solution**: Verify plugin path in config:
```yaml
plugins:
  - axolotl.integrations.swanlab.SwanLabPlugin  # Correct path
```

### Lark Notification Issues

#### Error: "Failed to import SwanLab Lark plugin"

**Cause**: Your SwanLab version doesn't include the Lark plugin (requires SwanLab >= 0.3.0).

**Solution**:
```bash
# Upgrade SwanLab to latest version
pip install --upgrade swanlab

# Or install specific version
pip install 'swanlab>=0.3.0'
```

#### Warning: "Lark webhook has no secret configured"

**Cause**: You provided `swanlab_lark_webhook_url` but no `swanlab_lark_secret`.

**Impact**: Lark notifications will work, but without HMAC authentication (security risk).

**Solution**: Add HMAC secret for production use:
```yaml
swanlab_lark_webhook_url: https://open.feishu.cn/open-apis/bot/v2/hook/xxx
swanlab_lark_secret: your-webhook-secret  # Add this line
```

**When it's OK to skip secret**:
- Local development and testing
- Internal networks with restricted access
- Non-sensitive training experiments

**When secret is required**:
- Production training jobs
- Training with proprietary data
- Multi-team shared Lark groups

#### Error: "Failed to register Lark callback"

**Cause**: Invalid webhook URL or network connectivity issues.

**Diagnostic steps**:
```bash
# 1. Test webhook URL manually
curl -X POST "YOUR_WEBHOOK_URL" \
  -H 'Content-Type: application/json' \
  -d '{"msg_type":"text","content":{"text":"Test from Axolotl"}}'

# 2. Check SwanLab version
pip show swanlab

# 3. Verify webhook URL format
# Should start with: https://open.feishu.cn/open-apis/bot/v2/hook/
```

**Solution**:
1. Verify webhook URL is correct (copy from Lark bot settings)
2. Check network connectivity to Lark API
3. Ensure webhook is not expired (Lark webhooks can expire)
4. Regenerate webhook URL in Lark bot settings if needed

#### Lark notifications not received

**Cause**: Multiple possible causes.

**Diagnostic checklist**:

1. **Check training logs** for Lark registration confirmation:
   ```text
   # Expected log message (rank 0 only):
   INFO: Registered Lark notification callback with HMAC authentication
   ```

2. **Verify webhook in Lark**: Test webhook manually (see above)

3. **Check distributed training**: Only rank 0 sends notifications
   ```bash
   # If running multi-GPU, check rank 0 logs specifically
   grep "Registered Lark" logs/rank_0.log
   ```

4. **Verify SwanLab is initialized**: Lark callback needs SwanLab to be running
   ```yaml
   use_swanlab: true  # Must be enabled
   swanlab_project: my-project  # Must be set
   ```

5. **Check Lark bot permissions**: Ensure bot is added to the target group chat

#### Duplicate Lark notifications in multi-GPU training

**Expected Behavior**: Should NOT happen - only rank 0 sends notifications.

**If you see duplicates**:
1. Check that all GPUs are using the same config file
2. Verify plugin is loaded correctly on all ranks
3. Check logs for unexpected Lark initialization on non-zero ranks
4. Ensure `RANK` or `LOCAL_RANK` environment variables are set correctly

**Solution**: This is a bug if it occurs. Report with:
- Full training command
- Logs from all ranks
- Config file

## Comparison: SwanLab vs WandB

| Feature | SwanLab | WandB |
|---------|---------|-------|
| Open Source | ✅ Yes | ❌ No |
| Self-Hosting | ✅ Easy | ⚠️ Complex |
| Free Tier | ✅ Generous | ⚠️ Limited |
| Chinese Support | ✅ Native | ⚠️ Limited |
| Offline Mode | ✅ Full support | ✅ Supported |
| Integration | 🆕 New | ✅ Mature |

## Advanced Usage

### Custom Logging

You can add custom metrics in your callbacks:

```python
import swanlab

# In your custom callback
swanlab.log({
    "custom_metric": value,
    "epoch": epoch_num
})
```

### Experiment Comparison

```bash
# Compare multiple experiments
swanlab compare run1 run2 run3
```

## Support

- **Documentation**: [https://docs.swanlab.cn](https://docs.swanlab.cn)
- **GitHub**: [https://github.com/SwanHubX/SwanLab](https://github.com/SwanHubX/SwanLab)
- **Issues**: Report bugs at [GitHub Issues](https://github.com/SwanHubX/SwanLab/issues)

## License

This integration follows the Axolotl Community License Agreement.

## Acknowledgements

This integration is built on top of:
- [SwanLab](https://github.com/SwanHubX/SwanLab) - Experiment tracking tool
- [Transformers](https://github.com/huggingface/transformers) - SwanLabCallback
- [Axolotl](https://github.com/axolotl-ai-cloud/axolotl) - Training framework


================================================
FILE: src/axolotl/integrations/swanlab/__init__.py
================================================
"""SwanLab integration plugin for Axolotl"""

from axolotl.integrations.swanlab.args import SwanLabConfig
from axolotl.integrations.swanlab.plugins import SwanLabPlugin

__all__ = ["SwanLabConfig", "SwanLabPlugin"]


================================================
FILE: src/axolotl/integrations/swanlab/args.py
================================================
"""SwanLab configuration arguments"""

from pydantic import BaseModel, Field, field_validator, model_validator


class SwanLabConfig(BaseModel):
    """SwanLab configuration subset"""

    use_swanlab: bool | None = Field(
        default=True,
        json_schema_extra={
            "description": "Enable SwanLab experiment tracking and visualization"
        },
    )
    swanlab_project: str | None = Field(
        default=None,
        json_schema_extra={"description": "Your SwanLab project name"},
    )
    swanlab_experiment_name: str | None = Field(
        default=None,
        json_schema_extra={"description": "Set the name of your SwanLab experiment"},
    )
    swanlab_description: str | None = Field(
        default=None,
        json_schema_extra={"description": "Description for your SwanLab experiment"},
    )
    swanlab_mode: str | None = Field(
        default=None,
        json_schema_extra={
            "description": '"cloud" to sync to SwanLab cloud, "local" for local only, "offline" to save metadata locally, "disabled" to turn off SwanLab'
        },
    )
    swanlab_workspace: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "SwanLab workspace name (organization or username)"
        },
    )
    swanlab_api_key: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "SwanLab API key for authentication. Can also be set via SWANLAB_API_KEY environment variable"
        },
    )
    swanlab_log_model: bool | None = Field(
        default=False,
        json_schema_extra={
            "description": "Whether to log model checkpoints to SwanLab (feature coming soon)"
        },
    )
    swanlab_web_host: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Web address for SwanLab cloud environment (for private deployment)"
        },
    )
    swanlab_api_host: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "API address for SwanLab cloud environment (for private deployment)"
        },
    )
    swanlab_lark_webhook_url: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Lark (Feishu) webhook URL for sending training notifications to team chat"
        },
    )
    swanlab_lark_secret: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Secret for Lark webhook HMAC signature authentication (optional)"
        },
    )
    swanlab_log_completions: bool | None = Field(
        default=True,
        json_schema_extra={
            "description": "Enable logging RLHF completions to SwanLab for qualitative analysis (DPO/KTO/ORPO/GRPO)"
        },
    )
    swanlab_completion_log_interval: int | None = Field(
        default=100,
        json_schema_extra={
            "description": "Number of training steps between completion table logging to SwanLab"
        },
    )
    swanlab_completion_max_buffer: int | None = Field(
        default=128,
        json_schema_extra={
            "description": "Maximum number of completions to buffer before logging (prevents memory leaks)"
        },
    )

    @field_validator("swanlab_mode")
    @classmethod
    def validate_swanlab_mode(cls, v):
        """Validate swanlab_mode is one of the allowed values."""
        if v is None:
            return v

        valid_modes = ["cloud", "local", "offline", "disabled"]
        if v not in valid_modes:
            raise ValueError(
                f"Invalid swanlab_mode: '{v}'.\n\n"
                f"Valid options: {', '.join(valid_modes)}\n\n"
                f"Examples:\n"
                f"  swanlab_mode: cloud     # Sync to SwanLab cloud\n"
                f"  swanlab_mode: local     # Local only, no cloud sync\n"
                f"  swanlab_mode: offline   # Save metadata locally\n"
                f"  swanlab_mode: disabled  # Turn off SwanLab\n"
            )
        return v

    @field_validator("swanlab_project")
    @classmethod
    def validate_swanlab_project(cls, v):
        """Validate swanlab_project is non-empty when provided."""
        if v is not None and isinstance(v, str) and len(v.strip()) == 0:
            raise ValueError(
                "swanlab_project cannot be an empty string.\n\n"
                "Either:\n"
                "  1. Provide a valid project name: swanlab_project: my-project\n"
                "  2. Remove the swanlab_project field entirely\n"
            )
        return v

    @model_validator(mode="after")
    def validate_swanlab_enabled_requires_project(self):
        """Validate that if use_swanlab is True, swanlab_project must be set."""
        if self.use_swanlab is True and not self.swanlab_project:
            raise ValueError(
                "SwanLab enabled (use_swanlab: true) but 'swanlab_project' is not set.\n\n"
                "Solutions:\n"
                "  1. Add 'swanlab_project: your-project-name' to your config\n"
                "  2. Set 'use_swanlab: false' to disable SwanLab\n\n"
                "Example:\n"
                "  use_swanlab: true\n"
                "  swanlab_project: my-llm-training\n"
            )
        return self


================================================
FILE: src/axolotl/integrations/swanlab/callbacks.py
================================================
"""SwanLab callbacks for Axolotl trainers.

This module provides HuggingFace Trainer callbacks for logging
RLHF completions to SwanLab.
"""

from transformers import (
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)

from axolotl.integrations.swanlab.completion_logger import CompletionLogger
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


class SwanLabRLHFCompletionCallback(TrainerCallback):
    """Callback for logging RLHF completions to SwanLab.

    This callback periodically logs model completions (prompts, chosen/rejected
    responses, rewards) to SwanLab during RLHF training for qualitative analysis.

    Supports DPO, KTO, ORPO, and GRPO trainers.

    Example usage:
        >>> callback = SwanLabRLHFCompletionCallback(
        ...     log_interval=100,  # Log every 100 steps
        ...     max_completions=128,  # Keep last 128 completions
        ... )
        >>> trainer.add_callback(callback)

    Attributes:
        logger: CompletionLogger instance
        log_interval: Number of steps between SwanLab logging
        trainer_type: Auto-detected trainer type (dpo/kto/orpo/grpo)
    """

    def __init__(
        self,
        log_interval: int = 100,
        max_completions: int = 128,
        table_name: str = "rlhf_completions",
    ):
        """Initialize SwanLab RLHF completion callback.

        Args:
            log_interval: Log to SwanLab every N steps. Default: 100
            max_completions: Maximum completions to buffer. Default: 128
            table_name: SwanLab table name. Default: "rlhf_completions"
        """
        super().__init__()
        self.logger = CompletionLogger(maxlen=max_completions)
        self.log_interval = log_interval
        self.table_name = table_name
        self.trainer_type: str | None = None  # Auto-detected
        self._last_logged_step = 0

    def on_init_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Detect trainer type on initialization."""
        trainer = kwargs.get("trainer")
        if trainer is not None:
            trainer_name = trainer.__class__.__name__
            if "DPO" in trainer_name:
                self.trainer_type = "dpo"
            elif "KTO" in trainer_name:
                self.trainer_type = "kto"
            elif "ORPO" in trainer_name:
                self.trainer_type = "orpo"
            elif "GRPO" in trainer_name:
                self.trainer_type = "grpo"
            else:
                self.trainer_type = "unknown"

            LOG.info(
                f"SwanLab RLHF completion logging enabled for {trainer_name} "
                f"(type: {self.trainer_type})"
            )

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: dict | None = None,
        **kwargs,
    ):
        """Capture completions from logs and buffer them.

        Different trainers log completions in different formats:
        - DPO: logs['dpo/chosen'], logs['dpo/rejected'], logs['dpo/reward_diff']
        - KTO: logs['kto/completion'], logs['kto/label'], logs['kto/reward']
        - ORPO: logs['orpo/chosen'], logs['orpo/rejected']
        - GRPO: logs['grpo/completion'], logs['grpo/reward']

        Note: This is a placeholder implementation. Actual log keys depend
        on the TRL trainer implementation. You may need to patch the trainers
        to expose completion data in logs.
        """
        if logs is None or self.trainer_type is None:
            return

        step = state.global_step

        # DPO completions
        if self.trainer_type == "dpo":
            if all(key in logs for key in ["dpo/prompt", "dpo/chosen", "dpo/rejected"]):
                self.logger.add_dpo_completion(
                    step=step,
                    prompt=logs.get("dpo/prompt", ""),
                    chosen=logs.get("dpo/chosen", ""),
                    rejected=logs.get("dpo/rejected", ""),
                    reward_diff=logs.get("dpo/reward_diff"),
                )

        # KTO completions
        elif self.trainer_type == "kto":
            if all(key in logs for key in ["kto/prompt", "kto/completion"]):
                self.logger.add_kto_completion(
                    step=step,
                    prompt=logs.get("kto/prompt", ""),
                    completion=logs.get("kto/completion", ""),
                    label=logs.get("kto/label", False),
                    reward=logs.get("kto/reward"),
                )

        # ORPO completions
        elif self.trainer_type == "orpo":
            if all(
                key in logs for key in ["orpo/prompt", "orpo/chosen", "orpo/rejected"]
            ):
                self.logger.add_orpo_completion(
                    step=step,
                    prompt=logs.get("orpo/prompt", ""),
                    chosen=logs.get("orpo/chosen", ""),
                    rejected=logs.get("orpo/rejected", ""),
                    log_odds_ratio=logs.get("orpo/log_odds_ratio"),
                )

        # GRPO completions
        elif self.trainer_type == "grpo":
            if all(key in logs for key in ["grpo/prompt", "grpo/completion"]):
                self.logger.add_grpo_completion(
                    step=step,
                    prompt=logs.get("grpo/prompt", ""),
                    completion=logs.get("grpo/completion", ""),
                    reward=logs.get("grpo/reward"),
                    advantage=logs.get("grpo/advantage"),
                )

        # Periodically log to SwanLab
        if step - self._last_logged_step >= self.log_interval:
            if len(self.logger) > 0:
                self.logger.log_to_swanlab(table_name=self.table_name)
                self.logger.clear()
                self._last_logged_step = step

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Log remaining completions at end of training."""
        if len(self.logger) > 0:
            LOG.info(
                f"Training complete, logging final {len(self.logger)} completions to SwanLab"
            )
            self.logger.log_to_swanlab(table_name=self.table_name)
            self._last_logged_step = state.global_step


================================================
FILE: src/axolotl/integrations/swanlab/completion_logger.py
================================================
"""SwanLab completion logger for RLHF/DPO/KTO/ORPO/GRPO training.

This module provides utilities for logging model completions during
preference training to SwanLab for qualitative analysis.
"""

from collections import deque
from collections.abc import Mapping
from typing import Any

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


class CompletionLogger:
    """Memory-bounded logger for RLHF completions.

    Stores prompts, completions, and rewards in fixed-size deques to prevent
    memory leaks during long training runs. Logs completion tables to SwanLab
    for qualitative analysis of model outputs.

    Example usage:
        >>> logger = CompletionLogger(maxlen=128)
        >>> logger.add_dpo_completion(
        ...     step=0,
        ...     prompt="What is AI?",
        ...     chosen="Artificial Intelligence is...",
        ...     rejected="AI means...",
        ...     reward_diff=0.5
        ... )
        >>> logger.log_to_swanlab()

    Attributes:
        maxlen: Maximum number of completions to store (older ones are dropped)
        data: Deque storing completion dictionaries
    """

    def __init__(self, maxlen: int = 128):
        """Initialize completion logger with bounded buffer.

        Args:
            maxlen: Maximum number of completions to store. When the buffer
                is full, oldest completions are automatically discarded.
                Default: 128 (sufficient for most RLHF runs without memory issues)
        """
        self.maxlen = maxlen
        self.data: deque[Mapping[str, Any]] = deque(maxlen=maxlen)

    def add_dpo_completion(
        self,
        step: int,
        prompt: str,
        chosen: str,
        rejected: str,
        reward_diff: float | None = None,
    ) -> None:
        """Add a DPO completion to the buffer.

        Args:
            step: Training step number
            prompt: Input prompt
            chosen: Chosen (preferred) completion
            rejected: Rejected (non-preferred) completion
            reward_diff: Reward difference (chosen - rejected), if available
        """
        entry = {
            "step": step,
            "prompt": prompt,
            "chosen": chosen,
            "rejected": rejected,
        }
        if reward_diff is not None:
            entry["reward_diff"] = reward_diff

        self.data.append(entry)

    def add_kto_completion(
        self,
        step: int,
        prompt: str,
        completion: str,
        label: bool,
        reward: float | None = None,
    ) -> None:
        """Add a KTO completion to the buffer.

        Args:
            step: Training step number
            prompt: Input prompt
            completion: Model-generated completion
            label: True if desirable, False if undesirable
            reward: Reward score, if available
        """
        entry = {
            "step": step,
            "prompt": prompt,
            "completion": completion,
            "label": "desirable" if label else "undesirable",
        }
        if reward is not None:
            entry["reward"] = reward

        self.data.append(entry)

    def add_orpo_completion(
        self,
        step: int,
        prompt: str,
        chosen: str,
        rejected: str,
        log_odds_ratio: float | None = None,
    ) -> None:
        """Add an ORPO completion to the buffer.

        Args:
            step: Training step number
            prompt: Input prompt
            chosen: Chosen (preferred) completion
            rejected: Rejected (non-preferred) completion
            log_odds_ratio: Log odds ratio between chosen and rejected
        """
        entry = {
            "step": step,
            "prompt": prompt,
            "chosen": chosen,
            "rejected": rejected,
        }
        if log_odds_ratio is not None:
            entry["log_odds_ratio"] = log_odds_ratio

        self.data.append(entry)

    def add_grpo_completion(
        self,
        step: int,
        prompt: str,
        completion: str,
        reward: float | None = None,
        advantage: float | None = None,
    ) -> None:
        """Add a GRPO completion to the buffer.

        Args:
            step: Training step number
            prompt: Input prompt
            completion: Model-generated completion
            reward: Reward score from reward model
            advantage: Advantage estimate (reward - baseline)
        """
        entry = {
            "step": step,
            "prompt": prompt,
            "completion": completion,
        }
        if reward is not None:
            entry["reward"] = reward
        if advantage is not None:
            entry["advantage"] = advantage

        self.data.append(entry)

    def log_to_swanlab(self, table_name: str = "completions") -> bool:
        """Log buffered completions to SwanLab as a table.

        Creates a SwanLab echarts Table with all buffered completions.
        Only logs if SwanLab is initialized and data is available.

        Args:
            table_name: Name of the table in SwanLab dashboard.
                Default: "completions"

        Returns:
            True if logging succeeded, False otherwise
        """
        if not self.data:
            LOG.debug("No completions to log to SwanLab")
            return False

        try:
            import swanlab

            if swanlab.get_run() is None:
                LOG.debug("SwanLab not initialized, skipping completion logging")
                return False

            # Convert deque to list of dicts
            completions = list(self.data)

            # Extract headers from first entry (all entries should have same structure)
            headers = list(completions[0].keys())

            # Build rows: each completion becomes one row
            rows = []
            for completion in completions:
                row = [completion.get(header, "") for header in headers]
                rows.append(row)

            # Log to SwanLab as echarts Table
            swanlab.log({table_name: swanlab.echarts.Table().add(headers, rows)})

            LOG.info(f"Logged {len(rows)} completions to SwanLab table '{table_name}'")
            return True

        except ImportError:
            LOG.warning(
                "SwanLab not installed, cannot log completions. "
                "Install with: pip install swanlab"
            )
            return False
        except Exception as err:  # pylint: disable=broad-except
            LOG.exception("Failed to log completions to SwanLab: %s", err)
            return False

    def clear(self) -> None:
        """Clear all buffered completions."""
        self.data.clear()

    def __len__(self) -> int:
        """Return number of buffered completions."""
        return len(self.data)

    def __repr__(self) -> str:
        """String representation showing buffer status."""
        return (
            f"CompletionLogger(maxlen={self.maxlen}, "
            f"buffered={len(self.data)}/{self.maxlen})"
        )


================================================
FILE: src/axolotl/integrations/swanlab/plugins.py
================================================
"""SwanLab Plugin for Axolotl"""

from __future__ import annotations

from typing import TYPE_CHECKING

from axolotl.integrations.base import BasePlugin
from axolotl.utils.logging import get_logger

if TYPE_CHECKING:
    from transformers import TrainerCallback

    from axolotl.utils.dict import DictDefault

LOG = get_logger(__name__)


class SwanLabPlugin(BasePlugin):
    """
    SwanLab integration plugin for Axolotl.

    Provides experiment tracking, visualization, and logging capabilities
    using SwanLab (https://swanlab.cn).

    Usage in config.yaml:
        plugins:
          - axolotl.integrations.swanlab.SwanLabPlugin

        use_swanlab: true
        swanlab_project: my-project
        swanlab_experiment_name: my-experiment
        swanlab_mode: cloud  # or 'local', 'offline', 'disabled'
    """

    def __init__(self):
        super().__init__()
        self.swanlab_initialized = False
        LOG.info("SwanLab plugin initialized")

    def get_input_args(self) -> str:
        """Returns the configuration model for SwanLab integration."""
        return "axolotl.integrations.swanlab.SwanLabConfig"

    def register(self, cfg: dict):
        """Register SwanLab plugin with configuration and conflict detection."""
        LOG.info("Registering SwanLab plugin")

        # === Conflict Detection: Required Fields ===

        # Check if SwanLab is enabled
        if cfg.get("use_swanlab"):
            # 1. Validate project name is set
            if not cfg.get("swanlab_project"):
                raise ValueError(
                    "SwanLab enabled but 'swanlab_project' is not set.\n\n"
                    "Solutions:\n"
                    "  1. Add 'swanlab_project: your-project-name' to your config\n"
                    "  2. Set 'use_swanlab: false' to disable SwanLab\n\n"
                    "See: src/axolotl/integrations/swanlab/README.md for examples"
                )

            # 2. Validate swanlab_mode value
            valid_modes = ["cloud", "local", "offline", "disabled"]
            mode = cfg.get("swanlab_mode")
            if mode and mode not in valid_modes:
                raise ValueError(
                    f"Invalid swanlab_mode: '{mode}'.\n\n"
                    f"Valid options: {', '.join(valid_modes)}\n\n"
                    f"Example:\n"
                    f"  swanlab_mode: cloud  # Sync to SwanLab cloud\n"
                    f"  swanlab_mode: local  # Local only, no cloud sync\n"
                )

            # 3. Check API key for cloud mode
            import os

            mode = cfg.get("swanlab_mode", "cloud")  # Default is cloud
            if mode == "cloud":
                api_key = cfg.get("swanlab_api_key") or os.environ.get(
                    "SWANLAB_API_KEY"
                )
                if not api_key:
                    LOG.warning(
                        "SwanLab cloud mode enabled but no API key found.\n"
                        "SwanLab may fail to initialize during training.\n\n"
                        "Solutions:\n"
                        "  1. Set SWANLAB_API_KEY environment variable:\n"
                        "     export SWANLAB_API_KEY=your-api-key\n"
                        "  2. Add 'swanlab_api_key: your-api-key' to config (less secure)\n"
                        "  3. Run 'swanlab login' before training\n"
                        "  4. Use 'swanlab_mode: local' for offline tracking\n"
                    )

        # === Conflict Detection: Multi-Logger Performance Warning ===

        # Detect all active logging tools
        active_loggers = []
        if cfg.get("use_wandb"):
            active_loggers.append("WandB")
        if cfg.get("use_mlflow"):
            active_loggers.append("MLflow")
        if cfg.get("comet_api_key") or cfg.get("comet_project_name"):
            active_loggers.append("Comet")
        if cfg.get("use_swanlab"):
            active_loggers.append("SwanLab")

        if len(active_loggers) > 1:
            LOG.warning(
                f"\n{'=' * 70}\n"
                f"Multiple logging tools enabled: {', '.join(active_loggers)}\n"
                f"{'=' * 70}\n"
                f"This may cause:\n"
                f"  - Performance overhead (~1-2% per logger, cumulative)\n"
                f"  - Increased memory usage\n"
                f"  - Longer training time per step\n"
                f"  - Potential config/callback conflicts\n\n"
                f"Recommendations:\n"
                f"  - Choose ONE primary logging tool for production training\n"
                f"  - Use multiple loggers only for:\n"
                f"    * Migration period (transitioning between tools)\n"
                f"    * Short comparison runs\n"
                f"    * Debugging specific tool issues\n"
                f"  - Monitor system resources (CPU, memory) during training\n"
                f"{'=' * 70}\n"
            )

            if len(active_loggers) >= 3:
                LOG.error(
                    f"\n{'!' * 70}\n"
                    f"WARNING: {len(active_loggers)} logging tools enabled simultaneously!\n"
                    f"{'!' * 70}\n"
                    f"This is likely unintentional and WILL significantly impact performance.\n"
                    f"Expected overhead: ~{len(active_loggers) * 1.5:.1f}% per training step.\n\n"
                    f"STRONGLY RECOMMEND:\n"
                    f"  - Disable all but ONE logging tool\n"
                    f"  - Use config inheritance to manage multiple configs\n"
                    f"{'!' * 70}\n"
                )

        # === Auto-Enable Logic ===

        # Enable SwanLab if project is specified
        if cfg.get("swanlab_project") and not cfg.get("use_swanlab"):
            cfg["use_swanlab"] = True
            LOG.info("Automatically enabled use_swanlab because swanlab_project is set")

    def pre_model_load(self, cfg: DictDefault):
        """Initialize SwanLab before model loading with runtime checks."""
        if not cfg.use_swanlab:
            return

        # === Runtime Check: Import Availability ===
        try:
            import swanlab
        except ImportError as err:
            raise ImportError(
                "SwanLab is not installed.\n\n"
                "Install with:\n"
                "  pip install swanlab\n\n"
                "Or add to requirements:\n"
                "  swanlab>=0.3.0\n\n"
                f"Original error: {err}"
            ) from err

        # Log SwanLab version
        try:
            swanlab_version = swanlab.__version__
            LOG.info(f"SwanLab version: {swanlab_version}")
        except AttributeError:
            LOG.warning("Could not determine SwanLab version")

        # === Runtime Check: Distributed Training Setup ===
        from axolotl.utils.distributed import get_world_size, is_main_process

        world_size = get_world_size()
        if world_size > 1:
            mode = getattr(cfg, "swanlab_mode", "cloud")
            LOG.info(
                f"\n{'=' * 70}\n"
                f"Distributed training detected (world_size={world_size})\n"
                f"SwanLab mode: {mode}\n"
                f"{'=' * 70}\n"
                f"Behavior:\n"
                f"  - Only rank 0 will initialize SwanLab\n"
                f"  - Other ranks will skip SwanLab to avoid conflicts\n"
            )

            if mode == "cloud":
                LOG.info(
                    f"  - Only rank 0 will upload to SwanLab cloud\n"
                    f"  - Other ranks run without SwanLab overhead\n"
                    f"{'=' * 70}\n"
                )

        # Only initialize SwanLab on the main process (rank 0)
        # to avoid creating multiple runs in distributed training
        if not is_main_process():
            LOG.debug("Skipping SwanLab initialization on non-main process")
            return

        # Initialize SwanLab run (passing all params directly to init)
        try:
            init_kwargs = self._get_swanlab_init_kwargs(cfg)
            swanlab.init(**init_kwargs)
            self.swanlab_initialized = True
            LOG.info(f"SwanLab initialized with project: {cfg.swanlab_project}")

            # Register Lark notification callback (if configured)
            self._register_lark_callback(cfg)

            # Log configuration (with error handling)
            try:
                config_dict = self._prepare_config_for_logging(cfg)
                swanlab.config.update(config_dict)
                LOG.debug("Successfully logged config to SwanLab")
            except Exception as config_err:  # pylint: disable=broad-except
                LOG.warning(
                    f"Failed to log config to SwanLab: {config_err}. Continuing anyway."
                )

        except Exception as err:  # pylint: disable=broad-except
            LOG.exception("Failed to initialize SwanLab: %s", err)
            self.swanlab_initialized = False

    def add_callbacks_pre_trainer(self, cfg: DictDefault, model):
        """Add SwanLab callbacks before trainer creation."""
        callbacks: list[TrainerCallback] = []

        if not cfg.use_swanlab:
            return callbacks

        if not self.swanlab_initialized:
            LOG.warning("SwanLab not initialized, skipping callback registration")
            return callbacks

        try:
            from axolotl.utils.callbacks.swanlab import (
                CustomSwanLabCallback,
                SaveAxolotlConfigtoSwanLabCallback,
            )

            # Add our custom lightweight SwanLabCallback
            # (avoids omegaconf/antlr4 version conflicts)
            swanlab_callback = CustomSwanLabCallback()
            callbacks.append(swanlab_callback)
            LOG.info("Added CustomSwanLabCallback for metrics logging")

            # Add Axolotl config logging callback
            if cfg.axolotl_config_path:
                config_callback = SaveAxolotlConfigtoSwanLabCallback(
                    cfg.axolotl_config_path
                )
                callbacks.append(config_callback)
                LOG.info("Added SaveAxolotlConfigtoSwanLabCallback")

        except ImportError as err:
            LOG.exception("Failed to import SwanLab callbacks: %s", err)

        return callbacks

    def post_trainer_create(self, cfg: DictDefault, trainer):
        """Post-trainer creation hook."""
        if cfg.use_swanlab and self.swanlab_initialized:
            try:
                import swanlab

                # Log additional trainer information (with safe conversion)
                trainer_config = {
                    "total_steps": int(trainer.state.max_steps)
                    if trainer.state.max_steps
                    else None,
                    "num_train_epochs": float(trainer.args.num_train_epochs)
                    if trainer.args.num_train_epochs
                    else None,
                    "train_batch_size": int(trainer.args.train_batch_size)
                    if hasattr(trainer.args, "train_batch_size")
                    else None,
                    "gradient_accumulation_steps": int(
                        trainer.args.gradient_accumulation_steps
                    )
                    if trainer.args.gradient_accumulation_steps
                    else None,
                }
                # Remove None values
                trainer_config = {
                    k: v for k, v in trainer_config.items() if v is not None
                }

                if trainer_config:
                    swanlab.config.update(trainer_config)
                    LOG.info("Logged trainer configuration to SwanLab")
            except Exception as err:  # pylint: disable=broad-except
                LOG.debug(f"Failed to log trainer config to SwanLab: {err}")

            # Register RLHF completion logging callback if enabled
            self._register_completion_callback(cfg, trainer)

    def _get_swanlab_init_kwargs(self, cfg: DictDefault) -> dict:
        """Prepare kwargs for swanlab.init().

        Passes all configuration parameters directly to swanlab.init()
        instead of using environment variables as an intermediate layer.

        Returns:
            dict: Keyword arguments for swanlab.init()
        """
        init_kwargs = {}

        # Project name (required)
        if cfg.swanlab_project:
            init_kwargs["project"] = cfg.swanlab_project

        # Experiment name
        if cfg.swanlab_experiment_name:
            init_kwargs["experiment_name"] = cfg.swanlab_experiment_name

        # Description
        if cfg.swanlab_description:
            init_kwargs["description"] = cfg.swanlab_description

        # Workspace (organization)
        if cfg.swanlab_workspace:
            init_kwargs["workspace"] = cfg.swanlab_workspace

        # Mode: cloud, local, offline, disabled
        if cfg.swanlab_mode:
            init_kwargs["mode"] = cfg.swanlab_mode

        # API key (pass directly instead of via env var)
        if cfg.swanlab_api_key:
            init_kwargs["api_key"] = cfg.swanlab_api_key

        # Private deployment hosts (pass directly instead of via env var)
        if cfg.swanlab_web_host:
            init_kwargs["web_host"] = cfg.swanlab_web_host

        if cfg.swanlab_api_host:
            init_kwargs["api_host"] = cfg.swanlab_api_host

        # Log model checkpoints (coming soon in SwanLab)
        if cfg.swanlab_log_model:
            init_kwargs["log_model"] = cfg.swanlab_log_model

        # Custom branding - adds Axolotl identifier to SwanLab UI
        # This helps identify runs from Axolotl vs other frameworks
        init_kwargs["config"] = {"UPPERFRAME": "🦎 Axolotl"}

        return init_kwargs

    def _prepare_config_for_logging(self, cfg: DictDefault) -> dict:
        """Prepare configuration dict for logging to SwanLab."""

        def safe_convert(value):
            """Convert value to JSON-serializable type."""
            if value is None:
                return None
            if isinstance(value, (int, float, bool)):
                return value
            if isinstance(value, str):
                return value
            # Convert everything else to string
            return str(value)

        try:
            # Extract important training parameters with safe conversion
            config_dict = {
                "base_model": safe_convert(getattr(cfg, "base_model", "")),
                "model_type": safe_convert(getattr(cfg, "model_type", "")),
                "sequence_len": safe_convert(getattr(cfg, "sequence_len", None)),
                "micro_batch_size": safe_convert(
                    getattr(cfg, "micro_batch_size", None)
                ),
                "gradient_accumulation_steps": safe_convert(
                    getattr(cfg, "gradient_accumulation_steps", None)
                ),
                "num_epochs": safe_convert(getattr(cfg, "num_epochs", None)),
                "max_steps": safe_convert(getattr(cfg, "max_steps", None)),
                "learning_rate": safe_convert(getattr(cfg, "learning_rate", None)),
                "lr_scheduler": safe_convert(getattr(cfg, "lr_scheduler", "")),
                "optimizer": safe_convert(getattr(cfg, "optimizer", "")),
                "warmup_ratio": safe_convert(getattr(cfg, "warmup_ratio", None)),
                "weight_decay": safe_convert(getattr(cfg, "weight_decay", None)),
                "seed": safe_convert(getattr(cfg, "seed", None)),
                "bf16": safe_convert(getattr(cfg, "bf16", None)),
                "tf32": safe_convert(getattr(cfg, "tf32", None)),
                "flash_attention": safe_convert(getattr(cfg, "flash_attention", None)),
                "sample_packing": safe_convert(getattr(cfg, "sample_packing", None)),
            }

            # Add FSDP/parallel config - only boolean flags
            if hasattr(cfg, "fsdp_config") and cfg.fsdp_config:
                config_dict["fsdp_enabled"] = True
                config_dict["fsdp_version"] = safe_convert(
                    getattr(cfg, "fsdp_version", None)
                )

            if hasattr(cfg, "deepspeed") and cfg.deepspeed:
                config_dict["deepspeed_enabled"] = True

            # Add context parallel info
            if hasattr(cfg, "context_parallel_size"):
                config_dict["context_parallel_size"] = safe_convert(
                    getattr(cfg, "context_parallel_size", None)
                )
            if hasattr(cfg, "tensor_parallel_size"):
                config_dict["tensor_parallel_size"] = safe_convert(
                    getattr(cfg, "tensor_parallel_size", None)
                )
            if hasattr(cfg, "dp_shard_size"):
                config_dict["dp_shard_size"] = safe_convert(
                    getattr(cfg, "dp_shard_size", None)
                )

            # Remove None values and empty strings
            config_dict = {
                k: v
                for k, v in config_dict.items()
                if v is not None and v != "" and v != "None"
            }

            return config_dict
        except Exception as err:  # pylint: disable=broad-except
            LOG.warning(f"Failed to prepare config for logging: {err}")
            # Return minimal config
            try:
                lr = getattr(cfg, "learning_rate", None)
                lr_value = float(lr) if lr is not None else None
            except (TypeError, ValueError):
                lr_value = None
            return {
                "base_model": str(getattr(cfg, "base_model", "unknown")),
                "learning_rate": lr_value,
            }

    def _register_lark_callback(self, cfg: DictDefault):
        """Register Lark (Feishu) notification callback if configured.

        Lark notifications enable sending training updates to team chat channels,
        useful for production monitoring and team collaboration.

        Args:
            cfg: Configuration object with Lark webhook settings
        """
        # Check if Lark webhook URL is configured
        lark_webhook_url = getattr(cfg, "swanlab_lark_webhook_url", None)
        if not lark_webhook_url:
            return  # Lark not configured, skip

        try:
            import swanlab
            from swanlab.plugin.notification import LarkCallback

            # Get optional secret for HMAC signature authentication
            lark_secret = getattr(cfg, "swanlab_lark_secret", None)

            # Create Lark callback with webhook URL and optional secret
            lark_callback = LarkCallback(
                webhook_url=lark_webhook_url,
                secret=lark_secret,
            )

            # Register callback with SwanLab
            swanlab.register_callbacks([lark_callback])

            if lark_secret:
                LOG.info(
                    "Registered Lark notification callback with HMAC authentication"
                )
            else:
                LOG.info("Registered Lark notification callback (no HMAC secret)")
                LOG.warning(
                    "Lark webhook has no secret configured. "
                    "For production use, set 'swanlab_lark_secret' to enable HMAC signature verification."
                )

        except ImportError as err:
            LOG.warning(
                f"Failed to import SwanLab Lark plugin: {err}\n\n"
                "Lark notifications require SwanLab >= 0.3.0 with plugin support.\n"
                "Install with: pip install 'swanlab>=0.3.0'\n\n"
                "Continuing without Lark notifications..."
            )
        except Exception as err:  # pylint: disable=broad-except
            LOG.exception(
                "Failed to register Lark callback: %s\n\n"
                "Check your Lark webhook URL and secret configuration.\n"
                "Continuing without Lark notifications...",
                err,
            )

    def _register_completion_callback(self, cfg: DictDefault, trainer):
        """Register RLHF completion logging callback if enabled and applicable.

        This callback logs model completions (prompts, chosen/rejected responses,
        rewards) to SwanLab during RLHF training for qualitative analysis.

        Args:
            cfg: Configuration object with completion logging settings
            trainer: The trainer instance to add callback to
        """
        # Check if completion logging is enabled
        log_completions = getattr(cfg, "swanlab_log_completions", True)
        if not log_completions:
            LOG.debug("SwanLab completion logging disabled by config")
            return

        # Check if trainer is an RLHF trainer
        trainer_name = trainer.__class__.__name__
        rlhf_trainers = ["DPO", "KTO", "ORPO", "GRPO", "CPO"]
        is_rlhf_trainer = any(name in trainer_name for name in rlhf_trainers)

        if not is_rlhf_trainer:
            LOG.debug(
                f"Trainer {trainer_name} is not an RLHF trainer, "
                "skipping completion logging callback"
            )
            return

        try:
            from axolotl.integrations.swanlab.callbacks import (
                SwanLabRLHFCompletionCallback,
            )

            # Get configuration parameters
            log_interval = getattr(cfg, "swanlab_completion_log_interval", 100)
            max_buffer = getattr(cfg, "swanlab_completion_max_buffer", 128)

            # Create and register callback
            completion_callback = SwanLabRLHFCompletionCallback(
                log_interval=log_interval,
                max_completions=max_buffer,
                table_name="rlhf_completions",
            )

            trainer.add_callback(completion_callback)

            LOG.info(
                f"Registered SwanLab RLHF completion logging callback for {trainer_name} "
                f"(log_interval={log_interval}, max_buffer={max_buffer})"
            )

        except ImportError as err:
            LOG.warning(
                f"Failed to import SwanLab completion callback: {err}\n\n"
                "This is a bug - the callback should be available.\n"
                "Please report this issue.\n\n"
                "Continuing without completion logging..."
            )
        except Exception as err:  # pylint: disable=broad-except
            LOG.exception(
                "Failed to register SwanLab completion callback: %s\n\n"
                "Continuing without completion logging...",
                err,
            )


================================================
FILE: src/axolotl/integrations/swanlab/profiling.py
================================================
"""SwanLab profiling utilities for Axolotl trainers.

This module provides decorators and context managers for profiling
trainer methods and logging execution times to SwanLab.
"""

import time
from contextlib import contextmanager
from functools import wraps
from typing import Any, Callable

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


@contextmanager
def swanlab_profiling_context(trainer: Any, func_name: str):
    """Context manager for profiling trainer methods.

    Measures execution time and logs to SwanLab if enabled.

    Example usage:
        >>> with swanlab_profiling_context(self, "training_step"):
        ...     result = do_expensive_computation()

    Args:
        trainer: Trainer instance (must have cfg attribute with use_swanlab flag)
        func_name: Name of the function being profiled

    Yields:
        None
    """
    start_time = time.perf_counter()
    try:
        yield
    finally:
        duration = time.perf_counter() - start_time

        # Check if SwanLab is enabled and initialized
        use_swanlab = getattr(getattr(trainer, "cfg", None), "use_swanlab", False)
        if use_swanlab:
            try:
                import swanlab

                if swanlab.get_run() is not None:
                    # Log profiling metric
                    trainer_class = trainer.__class__.__name__
                    metric_name = f"profiling/Time taken: {trainer_class}.{func_name}"

                    swanlab.log({metric_name: duration})

            except ImportError:
                # SwanLab not installed, silently skip
                pass
            except Exception as err:  # pylint: disable=broad-except
                # Log error but don't fail training
                LOG.debug(f"Failed to log profiling metric for {func_name}: {err}")


def swanlab_profile(func: Callable) -> Callable:
    """Decorator to profile and log function execution time to SwanLab.

    Automatically measures execution time of trainer methods and logs
    to SwanLab as profiling metrics.

    Example usage:
        >>> class MyTrainer:
        ...     @swanlab_profile
        ...     def training_step(self, model, inputs):
        ...         return super().training_step(model, inputs)

    Args:
        func: Function to profile (must be a method of a trainer instance)

    Returns:
        Wrapped function with profiling
    """

    @wraps(func)
    def wrapper(self, *args, **kwargs):
        with swanlab_profiling_context(self, func.__name__):
            return func(self, *args, **kwargs)

    return wrapper


class ProfilingConfig:
    """Configuration for SwanLab profiling.

    This class provides a centralized way to control profiling behavior.

    Attributes:
        enabled: Whether profiling is enabled globally
        min_duration_ms: Minimum duration (in ms) to log (filters out very fast ops)
        log_interval: Log every N function calls (to reduce overhead)
    """

    def __init__(
        self,
        enabled: bool = True,
        min_duration_ms: float = 0.1,
        log_interval: int = 1,
    ):
        """Initialize profiling configuration.

        Args:
            enabled: Enable profiling. Default: True
            min_duration_ms: Minimum duration to log (ms). Default: 0.1
            log_interval: Log every N calls. Default: 1 (log all)
        """
        self.enabled = enabled
        self.min_duration_ms = min_duration_ms
        self.log_interval = log_interval
        self._call_counts: dict[str, int] = {}

    def should_log(self, func_name: str, duration_seconds: float) -> bool:
        """Check if a profiling measurement should be logged.

        Args:
            func_name: Name of the profiled function
            duration_seconds: Execution duration in seconds

        Returns:
            True if should log, False otherwise
        """
        if not self.enabled:
            return False

        # Check minimum duration threshold
        duration_ms = duration_seconds * 1000
        if duration_ms < self.min_duration_ms:
            return False

        # Check log interval
        self._call_counts.setdefault(func_name, 0)
        self._call_counts[func_name] += 1

        # Always log on first call OR at intervals
        count = self._call_counts[func_name]
        if count == 1 or count % self.log_interval == 0:
            return True

        return False


# Global profiling config (can be modified by users)
DEFAULT_PROFILING_CONFIG = ProfilingConfig()


@contextmanager
def swanlab_profiling_context_advanced(
    trainer: Any,
    func_name: str,
    config: ProfilingConfig | None = None,
):
    """Advanced profiling context with configurable behavior.

    Similar to swanlab_profiling_context but with additional configuration
    options for filtering and throttling profiling logs.

    Example usage:
        >>> config = ProfilingConfig(min_duration_ms=1.0, log_interval=10)
        >>> with swanlab_profiling_context_advanced(self, "forward", config):
        ...     output = model(inputs)

    Args:
        trainer: Trainer instance
        func_name: Function name
        config: Profiling configuration. If None, uses DEFAULT_PROFILING_CONFIG

    Yields:
        None
    """
    if config is None:
        config = DEFAULT_PROFILING_CONFIG

    start_time = time.perf_counter()
    try:
        yield
    finally:
        duration = time.perf_counter() - start_time

        # Check if should log based on config
        if config.should_log(func_name, duration):
            # Check if SwanLab is enabled
            use_swanlab = getattr(getattr(trainer, "cfg", None), "use_swanlab", False)
            if use_swanlab:
                try:
                    import swanlab

                    if swanlab.get_run() is not None:
                        trainer_class = trainer.__class__.__name__
                        metric_name = (
                            f"profiling/Time taken: {trainer_class}.{func_name}"
                        )

                        swanlab.log({metric_name: duration})

                except ImportError:
                    pass
                except Exception as err:  # pylint: disable=broad-except
                    LOG.debug(f"Failed to log profiling metric for {func_name}: {err}")


================================================
FILE: src/axolotl/kernels/__init__.py
================================================


================================================
FILE: src/axolotl/kernels/geglu.py
================================================
"""Module for definition of GEGLU Triton kernels.

See "GLU Variants Improve Transformer" (https://arxiv.org/abs/2002.05202).

Credit to `unsloth` (https://unsloth.ai/) for inspiration for this implementation.
"""

import torch
import triton
import triton.language as tl


@triton.jit
def _geglu_fwd_kernel(
    gate_ptr,
    up_ptr,
    out_ptr,
    n_elements,
    BLOCK_SIZE: tl.constexpr,
):
    """GEGLU forward kernel.

    Args:
        gate_ptr: Pointer to gate tensor [*, hidden_dim].
        up_ptr: Pointer to up-projection tensor [*, hidden_dim].
        out_ptr: Pointer to output tensor [*, hidden_dim].
        n_elements: Total number of elements in the input tensors.
        BLOCK_SIZE: Size of thread blocks for parallel computation.
    """
    block_idx = tl.program_id(0)
    offsets = block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements

    gate = tl.load(gate_ptr + offsets, mask=mask, other=0).to(tl.float32)
    up = tl.load(up_ptr + offsets, mask=mask, other=0)

    # Compute activation in fp32 then convert back
    gelu_gate = 0.5 * gate * (tl.math.erf(tl.math.rsqrt(2.0) * gate) + 1.0)
    gelu_gate = gelu_gate.to(up.dtype)
    result = gelu_gate * up

    tl.store(out_ptr + offsets, result, mask=mask)


def geglu_forward(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
    """GEGLU forward pass.

    Args:
        gate: Input gate tensor of shape [batch, seq_len, hidden_dim].
        up: Up-projection tensor of shape [batch, seq_len, hidden_dim].

    Returns:
        torch.Tensor: Output tensor of shape [batch, seq_len, hidden_dim].
    """
    batch, seq_len, hidden_dim = gate.shape
    n_elements = gate.numel()
    out = torch.empty((batch, seq_len, hidden_dim), dtype=gate.dtype, device="cuda")

    grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)  # noqa: E731
    _geglu_fwd_kernel[grid](
        gate_ptr=gate,
        up_ptr=up,
        out_ptr=out,
        n_elements=n_elements,
        BLOCK_SIZE=1024,
    )
    return out


@triton.jit
def _geglu_bwd_kernel(
    grad_out_ptr,
    gate_ptr,
    up_ptr,
    n_elements,
    BLOCK_SIZE: tl.constexpr,
):
    """GEGLU backward kernel. Stores gradient results in-place.

    Args:
        grad_out_ptr: Pointer to gradient output tensor [*, hidden_dim].
        gate_ptr: Pointer to gate tensor [*, hidden_dim].
        up_ptr: Pointer to up-projection tensor [*, hidden_dim].
        n_elements: Total number of elements in the input tensors.
        BLOCK_SIZE: Size of thread blocks for parallel computation.

    Note:
        After kernel execution, tensors are modified in-place:
        - `grad_out_ptr` contains GEGLU activation output (`h`)
        - `gate_ptr` contains gradient w.r.t gate (`grad_gate`)
        - `up_ptr` contains gradient w.r.t up (`grad_up`)
    """
    block_idx = tl.program_id(0)
    offsets = block_idx * BLOCK_SIZE + tl.arange(0, BLOCK_SIZE)
    mask = offsets < n_elements

    grad_out = tl.load(grad_out_ptr + offsets, mask=mask, other=0)
    gate = tl.load(gate_ptr + offsets, mask=mask, other=0).to(tl.float32)
    up = tl.load(up_ptr + offsets, mask=mask, other=0)

    # Forward pass
    gelu_partial = 0.5 * (tl.math.erf(tl.math.rsqrt(2.0) * gate) + 1.0)
    gelu_gate = gelu_partial * gate
    gelu_gate = gelu_gate.to(grad_out.dtype)

    # Forward output
    h = gelu_gate * up

    # Compute gradients
    grad_up = grad_out * gelu_gate

    # Compute gate gradient using GELU derivative
    temp = grad_out * up
    t = 0.3989422804014327  # 1/sqrt(2*pi)
    dgelu_dgate = gelu_partial + t * gate * tl.exp(-0.5 * gate * gate)
    grad_gate = temp.to(tl.float32) * dgelu_dgate
    grad_gate = grad_gate.to(grad_out.dtype)

    # Store results
    tl.store(grad_out_ptr + offsets, h, mask=mask)
    tl.store(gate_ptr + offsets, grad_gate, mask=mask)
    tl.store(up_ptr + offsets, grad_up, mask=mask)


def geglu_backward(
    grad_output: torch.Tensor, gate: torch.Tensor, up: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """GEGLU backward pass using in-place operations.

    Args:
        grad_output: Gradient of loss with respect to output, shape `[batch, seq_len, hidden_dim]`.
        gate: Gate tensor from forward pass, shape `[batch, seq_len, hidden_dim]`.
        up: Up-projection tensor from forward pass, shape `[batch, seq_len, hidden_dim]`.

    Returns:
        Tuple containing:
            - GEGLU activation output (`h`)
            - Gradient with respect to gate (`grad_gate`)
            - Gradient with respect to up (`grad_up`)

    Note:
        This function modifies its input tensors in-place to store results.
    """
    n_elements = grad_output.numel()

    grid = lambda meta: (triton.cdiv(n_elements, meta["BLOCK_SIZE"]),)  # noqa: E731
    _geglu_bwd_kernel[grid](
        grad_out_ptr=grad_output,
        gate_ptr=gate,
        up_ptr=up,
        n_elements=n_elements,
        BLOCK_SIZE=1024,
    )

    return grad_output, gate, up


================================================
FILE: src/axolotl/kernels/lora.py
================================================
"""
Module for definition of Low-Rank Adaptation (LoRA) Triton kernels.

See "LoRA: Low-Rank Adaptation of Large Language Models"
(https://arxiv.org/abs/2106.09685).

Credit to `unsloth` (https://unsloth.ai/) for inspiration for this implementation.
"""

from typing import Callable

import torch
from bitsandbytes.functional import QuantState
from torch import nn
from torch.distributed.tensor import DTensor

from .geglu import geglu_backward, geglu_forward
from .quantize import dequantize
from .swiglu import swiglu_backward, swiglu_forward
from .utils import torch_amp_custom_bwd, torch_amp_custom_fwd


def get_lora_parameters(
    proj: nn.Module,
) -> tuple[
    torch.Tensor,
    torch.Tensor | None,
    QuantState | torch.Tensor | None,
    torch.Tensor | None,
    torch.Tensor | None,
    float | None,
]:
    """
    Gets LoRA parameters from a projection module.

    Args:
        proj: The projection module to extract parameters from.

    Returns:
        A tuple containing the base weights, quantization state, LoRA A and B weights,
        scaling factor, and base layer bias. Quant state, weights, and bias may be
        `None` if not available.
    """
    # For DPO or disabled adapters
    base_layer = proj.base_layer if hasattr(proj, "base_layer") else proj
    W = base_layer.weight
    b = base_layer.bias

    if not hasattr(proj, "disable_adapters") or proj.disable_adapters or proj.merged:
        quant_state = getattr(W, "quant_state", None)
        if quant_state is None and W.dtype == torch.float8_e4m3fn:
            quant_state = getattr(base_layer, "weight_scale_inv", None)
        return W, b, quant_state, None, None, None

    quant_state = getattr(W, "quant_state", None)
    if quant_state is None and W.dtype == torch.float8_e4m3fn:
        quant_state = getattr(base_layer, "weight_scale_inv", None)

    active_adapter = (
        proj.active_adapters[0]
        if hasattr(proj, "active_adapters")
        else proj.active_adapter
    )

    linear_A = proj.lora_A[active_adapter]
    linear_B = proj.lora_B[active_adapter]

    # This manual unsharding is needed for FSDP2 + LoRA kernels compatibility.
    # We fuse linear layers + LoRA adapters calculations into a single
    # torch.autograd.Function, bypassing the registered unshard / reshard behavior.
    # Note that we don't apply resharding later in this module (it gets messy quickly),
    # but LoRA parameters are generally small enough that this is not an issue.
    if isinstance(linear_A.weight, DTensor):
        linear_A.unshard()
        linear_B.unshard()

    A = linear_A.weight
    B = linear_B.weight
    s = proj.scaling[active_adapter]

    return W, b, quant_state, A, B, s


def matmul_lora(
    X: torch.Tensor,
    W: torch.Tensor,
    b: torch.Tensor | None,
    W_quant: QuantState | torch.Tensor | None,
    A: torch.Tensor | None,
    B: torch.Tensor | None,
    s: float | None,
    out: torch.Tensor | None = None,
) -> torch.Tensor:
    """
    Efficient fused matmul + LoRA computation.

    Args:
        X: Input tensor [*, in_features]
        W: Base weight matrix [out_features, in_features]
        W_quant: Quantization state for W
        A: LoRA A matrix [rank, in_features]
        B: LoRA B matrix [out_features, rank]
        s: LoRA scaling factor
        out: Optional output tensor for inplace operations

    Returns:
        Result of X @ W + X @ A @ B
    """
    dtype = X.dtype
    W = dequantize(W.t(), W_quant)

    reshape = False
    if X.dim() == 3:
        batch, seq_len, _ = X.shape
        X = X.view(-1, X.shape[-1])
        reshape = True

    out = torch.matmul(X, W, out=out)
    if W_quant is not None:
        del W

    if A is not None:
        A, B = A.t().to(dtype), B.t().to(dtype)  # type: ignore[union-attr]
        out += s * X @ A @ B

    if b is not None:
        out += b

    return out.view(batch, seq_len, -1) if reshape else out


class LoRA_MLP(torch.autograd.Function):
    """Optimized LoRA MLP implementation."""

    @staticmethod
    @torch_amp_custom_fwd
    def forward(
        ctx,
        X: torch.Tensor,
        gate_weight: torch.Tensor,
        gate_bias: torch.Tensor | None,
        gate_quant: QuantState | None,
        gate_A: torch.Tensor | None,
        gate_B: torch.Tensor | None,
        gate_scale: float,
        up_weight: torch.Tensor,
        up_bias: torch.Tensor | None,
        up_quant: QuantState | None,
        up_A: torch.Tensor | None,
        up_B: torch.Tensor | None,
        up_scale: float,
        down_weight: torch.Tensor,
        down_bias: torch.Tensor | None,
        down_quant: QuantState | None,
        down_A: torch.Tensor | None,
        down_B: torch.Tensor | None,
        down_scale: float,
        activation_fn: Callable,
        activation_fn_backward: Callable,
        inplace: bool | None = True,
    ) -> torch.Tensor:
        """
        Forward pass for LoRA MLP.

        Args:
            ctx: Autograd context
            X: Input features
            gate_weight: Gate projection weight
            gate_bias: Gate projection bias
            gate_quant: Gate quantization state
            gate_A: Gate LoRA A matrix
            gate_B: Gate LoRA B matrix
            gate_scale: Gate LoRA scale
            up_weight: Up projection weight
            up_quant: Up projection quantization state
            up_A: Up projection LoRA A matrix
            up_B: Up projection LoRA B matrix
            up_scale: Up projection LoRA scale
            down_weight: Down projection weight
            down_bias: Down projection bias
            down_quant: Down projection quantization state
            down_A: Down projection LoRA A matrix
            down_B: Down projection LoRA B matrix
            down_scale: Down projection LoRA scale
            activation_fn: Forward activation function
            activation_fn_backward: Backward activation function
            inplace: Whether to perform operations in-place

        Returns:
            Output transformed by multi-layer perceptron and activation function
        """
        # Compute projections
        gate = matmul_lora(
            X, gate_weight, gate_bias, gate_quant, gate_A, gate_B, gate_scale
        )
        up = matmul_lora(X, up_weight, up_bias, up_quant, up_A, up_B, up_scale)

        # Activation
        hidden = activation_fn(gate, up)

        # Down projection
        output = matmul_lora(
            hidden, down_weight, down_bias, down_quant, down_A, down_B, down_scale
        )

        # Save for backward
        ctx.save_for_backward(X, gate, up, gate_A, gate_B, up_A, up_B, down_A, down_B)
        ctx.scales = (gate_scale, up_scale, down_scale)
        ctx.quants = (gate_quant, up_quant, down_quant)
        ctx.weights = (gate_weight, up_weight, down_weight)
        ctx.activation_fn = activation_fn
        ctx.activation_fn_backward = activation_fn_backward
        ctx.inplace = inplace

        return output

    @staticmethod
    @torch_amp_custom_bwd
    def backward(
        ctx: torch.autograd.function.FunctionCtx,
        grad_output: torch.Tensor,
    ) -> tuple[
        torch.Tensor | None,
        None,
        None,
        None,
        torch.Tensor | None,
        torch.Tensor | None,
        None,
        None,
        None,
        None,
        torch.Tensor | None,
        torch.Tensor | None,
        None,
        None,
        None,
        None,
        torch.Tensor | None,
        torch.Tensor | None,
        None,
        None,
        None,
        None,
        None,
    ]:
        """
        Performs backward pass computation for LoRA MLP.

        Args:
            ctx: Context object storing tensors saved during forward pass
            grad_output: Gradient of loss with respect to layer output

        Returns:
            Tuple containing gradients for all inputs from forward pass:
            - Input gradient tensor (or `None`)
            - `None` for weights/biases/quantization states
            - LoRA A/B matrix gradients (or `None`)
            - `None` for scaling factors
            - `None` for activation functions and flags
        """
        (
            X,
            gate,
            up,
            gate_A,
            gate_B,
            up_A,
            up_B,
            down_A,
            down_B,
        ) = ctx.saved_tensors
        gate_scale, up_scale, down_scale = ctx.scales
        gate_quant, up_quant, down_quant = ctx.quants
        gate_weight, up_weight, down_weight = ctx.weights

        # Transpose all LoRA matrices
        gate_A, gate_B = (
            gate_A.t() if gate_A is not None else None,
            gate_B.t() if gate_B is not None else None,
        )
        up_A, up_B = (
            up_A.t() if up_A is not None else None,
            up_B.t() if up_B is not None else None,
        )
        down_A, down_B = (
            down_A.t() if down_A is not None else None,
            down_B.t() if down_B is not None else None,
        )

        # Reshape inputs
        batch, seq_len, hd = X.shape
        grad_output = grad_output.view(-1, grad_output.shape[-1])
        X = X.view(-1, X.shape[-1])
        gate = gate.view(-1, gate.shape[-1])
        up = up.view(-1, up.shape[-1])
        dtype = X.dtype

        # Down projection
        grad_down = matmul_lora(
            grad_output,
            down_weight.t(),
            None,
            down_quant,
            down_B,
            down_A,
            down_scale,
        )

        # Activation backward
        h, grad_gate, grad_up = ctx.activation_fn_backward(grad_down, gate, up)

        # Initialize and compute LoRA gradients
        d_down_A = d_down_B = d_up_A = d_up_B = d_gate_A = d_gate_B = None

        if down_A is not None and down_B is not None:
            d_down_A = h.t() @ (grad_output @ down_B.t())
            d_down_B = (down_A.t() @ h.t()) @ grad_output
            d_down_A *= down_scale
            d_down_B *= down_scale

        if up_A is not None and up_B is not None:
            d_up_A = X.t() @ (grad_up @ up_B.t())
            d_up_B = (up_A.t() @ X.t()) @ grad_up
            d_up_A *= up_scale
            d_up_B *= up_scale

        if gate_A is not None and gate_B is not None:
            d_gate_A = X.t() @ (grad_gate @ gate_B.t())
            d_gate_B = (gate_A.t() @ X.t()) @ grad_gate
            d_gate_A *= gate_scale
            d_gate_B *= gate_scale

        # Compute input gradients
        dX = torch.zeros_like(X) if ctx.needs_input_grad[0] else None

        if dX is not None:
            # Up projection gradients
            up_weight = dequantize(up_weight.t(), up_quant)
            if ctx.inplace:
                dX = torch.matmul(grad_up, up_weight.t(), out=X)
            else:
                dX = torch.matmul(grad_up, up_weight.t())
            del up_weight

            # Note the .to(dtype) only where mixing LoRA with base weights
            if up_A is not None and up_B is not None:
                dX += grad_up @ up_B.to(dtype).t() @ (up_scale * up_A.to(dtype).t())

            # Gate projection gradients
            gate_weight = dequantize(gate_weight, gate_quant)
            dX += grad_gate @ gate_weight
            del gate_weight

            if gate_A is not None and gate_B is not None:
                dX += (
                    grad_gate
                    @ gate_B.to(dtype).t()
                    @ (gate_scale * gate_A.to(dtype).t())
                )

            # Reshape back
            dX = dX.view(batch, seq_len, hd)

        # Return gradients in correct order matching forward inputs
        return (
            dX,
            None,
            None,
            None,
            d_gate_A.t() if d_gate_A is not None else None,
            d_gate_B.t() if d_gate_B is not None else None,
            None,
            None,
            None,
            None,
            d_up_A.t() if d_up_A is not None else None,
            d_up_B.t() if d_up_B is not None else None,
            None,
            None,
            None,
            None,
            d_down_A.t() if d_down_A is not None else None,
            d_down_B.t() if d_down_B is not None else None,
            None,
            None,
            None,
            None,
            None,
        )


def apply_lora_mlp_swiglu(self, X: torch.Tensor, inplace: bool = True) -> torch.Tensor:
    """
    Applies LoRA to MLP layer with SwiGLU activation.

    Args:
        X: Input tensor for the MLP layer
        inplace: Whether to perform operations in-place to save memory

    Returns:
        Output tensor after applying LoRA-adapted MLP with SwiGLU activation
    """
    gateW, gateb, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj)
    upW, upb, upW_quant, upA, upB, upS = get_lora_parameters(self.up_proj)
    downW, downb, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj)

    out = LoRA_MLP.apply(
        X,
        gateW,
        gateb,
        gateW_quant,
        gateA,
        gateB,
        gateS,
        upW,
        upb,
        upW_quant,
        upA,
        upB,
        upS,
        downW,
        downb,
        downW_quant,
        downA,
        downB,
        downS,
        swiglu_forward,
        swiglu_backward,
        inplace,
    )

    return out


def apply_lora_mlp_geglu(self, X: torch.Tensor, inplace: bool = True) -> torch.Tensor:
    """
    Applies LoRA to MLP layer with GEGLU activation.

    Args:
        X: Input tensor for the MLP layer
        inplace: Whether to perform operations in-place to save memory

    Returns:
        Output tensor after applying LoRA-adapted MLP with GEGLU activation
    """
    gateW, gateb, gateW_quant, gateA, gateB, gateS = get_lora_parameters(self.gate_proj)
    upW, upb, upW_quant, upA, upB, upS = get_lora_parameters(self.up_proj)
    downW, downb, downW_quant, downA, downB, downS = get_lora_parameters(self.down_proj)
    out = LoRA_MLP.apply(
        X,
        gateW,
        gateb,
        gateW_quant,
        gateA,
        gateB,
        gateS,
        upW,
        upb,
        upW_quant,
        upA,
        upB,
        upS,
        downW,
        downb,
        downW_quant,
        downA,
        downB,
        downS,
        geglu_forward,
        geglu_backward,
        inplace,
    )

    return out


class LoRA_QKV(torch.autograd.Function):
    """
    Optimized LoRA QKV implementation with quantization support.

    Implements efficient computation of query, key, value projections with LoRA,
    supporting quantization and memory optimization.
    """

    @staticmethod
    @torch_amp_custom_fwd
    def forward(
        ctx: torch.autograd.function.FunctionCtx,
        X: torch.Tensor,
        q_weight: torch.Tensor,
        q_bias: torch.Tensor | None,
        q_quant: QuantState | None,
        q_A: torch.Tensor | None,
        q_B: torch.Tensor | None,
        q_scale: float,
        k_weight: torch.Tensor,
        k_bias: torch.Tensor | None,
        k_quant: QuantState | None,
        k_A: torch.Tensor | None,
        k_B: torch.Tensor | None,
        k_scale: float,
        v_weight: torch.Tensor,
        v_bias: torch.Tensor | None,
        v_quant: QuantState | None,
        v_A: torch.Tensor | None,
        v_B: torch.Tensor | None,
        v_scale: float,
        inplace: bool = True,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """
        Forward pass computing Q, K, V projections with LoRA.

        Args:
            ctx: Autograd context
            X: Input tensor
            q_weight: Query projection weight
            q_bias: Query projection bias
            q_quant: Query quantization state
            q_A: Query LoRA A matrix
            q_B: Query LoRA B matrix
            q_scale: Query LoRA scale
            k_weight: Key projection weight
            k_bias: Key projection bias
            k_quant: Key quantization state
            k_A: Key LoRA A matrix
            k_B: Key LoRA B matrix
            k_scale: Key LoRA scale
            v_weight: Value projection weight
            v_bias: Value projection bias
            v_quant: Value quantization state
            v_A: Value LoRA A matrix
            v_B: Value LoRA B matrix
            v_scale: Value LoRA scale
            inplace: Whether to perform operations in-place

        Returns:
            Tuple of (Query, Key, Value) projection tensors
        """
        Q = matmul_lora(X, q_weight, q_bias, q_quant, q_A, q_B, q_scale)
        K = matmul_lora(X, k_weight, k_bias, k_quant, k_A, k_B, k_scale)
        V = matmul_lora(X, v_weight, v_bias, v_quant, v_A, v_B, v_scale)

        ctx.save_for_backward(X, q_A, q_B, k_A, k_B, v_A, v_B)
        ctx.scales = (q_scale, k_scale, v_scale)
        ctx.quants = (q_quant, k_quant, v_quant)
        ctx.weights = (q_weight, k_weight, v_weight)
        ctx.biases = (q_bias, k_bias, v_bias)
        ctx.inplace = inplace

        return Q, K, V

    @staticmethod
    @torch_amp_custom_bwd
    def backward(
        ctx: torch.autograd.function.FunctionCtx,
        q_grad: torch.Tensor,
        k_grad: torch.Tensor,
        v_grad: torch.Tensor,
    ) -> tuple[
        torch.Tensor,
        None,
        None,
        None,
        torch.Tensor | None,
        torch.Tensor | None,
        None,
        None,
        None,
        None,
        torch.Tensor | None,
        torch.Tensor | None,
        None,
        None,
        None,
        None,
        torch.Tensor | None,
        torch.Tensor | None,
        None,
        None,
    ]:
        """
        Backward pass computing gradients for LoRA QKV.

        Args:
            ctx: Autograd context
            q_grad: Gradient for query projection
            k_grad: Gradient for key projection
            v_grad: Gradient for value projection

        Returns:
            Tuple containing gradients for all forward inputs
        """
        X, A_q, B_q, A_k, B_k, A_v, B_v = ctx.saved_tensors
        q_weight, k_weight, v_weight = ctx.weights
        q_quant, k_quant, v_quant = ctx.quants
        q_scale, k_scale, v_scale = ctx.scales
        dtype = X.dtype

        # Reshape gradients
        batch, seq_len = X.shape[:2]
        q_grad = q_grad.view(-1, q_grad.shape[-1])
        k_grad = k_grad.reshape(-1, k_grad.shape[-1])
        v_grad = v_grad.view(-1, v_grad.shape[-1])
        X = X.view(-1, X.shape[-1])

        # Pre-transpose X once
        X_t = X.t()

        # Initialize LoRA gradients as None
        d_A_q = d_B_q = d_A_k = d_B_k = d_A_v = d_B_v = None

        # Compute q path LoRA gradients if adapters exist
        if A_q is not None and B_q is not None:
            A_q_scaled = (q_scale * A_q).to(dtype)
            B_q_scaled = B_q.to(dtype)
            d_A_q = torch.mm(X_t, torch.mm(q_grad, B_q_scaled))
            d_B_q = torch.mm(torch.mm(A_q_scaled, X_t), q_grad)

        # Compute k path LoRA gradients if adapters exist
        if A_k is not None and B_k is not None:
            A_k_scaled = (k_scale * A_k).to(dtype)
            B_k_scaled = B_k.to(dtype)
            d_A_k = torch.mm(X_t, torch.mm(k_grad, B_k_scaled))
            d_B_k = torch.mm(torch.mm(A_k_scaled, X_t), k_grad)

        # Compute v path LoRA gradients if adapters exist
        if A_v is not None and B_v is not None:
            A_v_scaled = (v_scale * A_v).to(dtype)
            B_v_scaled = B_v.to(dtype)
            d_A_v = torch.mm(X_t, torch.mm(v_grad, B_v_scaled))
            d_B_v = torch.mm(torch.mm(A_v_scaled, X_t), v_grad)

        # Compute input gradient, reusing X memory if possible
        out_buffer = X if ctx.inplace else None

        # Q path
        q_weight_t = dequantize(q_weight, q_quant)
        grad_X = torch.mm(q_grad, q_weight_t, out=out_buffer)
        del q_weight
        del q_weight_t
        if A_q is not None and B_q is not None:
            # Stay decomposed: dQ @ B^T gives [T, R], then [T, R] @ (s*A) gives [T, in]
            # This is 65x fewer FLOPs than materializing B@A into [out, in]
            grad_X.addmm_(torch.mm(q_grad, B_q_scaled), A_q_scaled)

        # K path
        k_weight_t = dequantize(k_weight, k_quant)
        grad_X.addmm_(k_grad, k_weight_t)
        del k_weight
        del k_weight_t
        if A_k is not None and B_k is not None:
            grad_X.addmm_(torch.mm(k_grad, B_k_scaled), A_k_scaled)

        # V path
        v_weight_t = dequantize(v_weight, v_quant)
        grad_X.addmm_(v_grad, v_weight_t)
        del v_weight
        del v_weight_t
        if A_v is not None and B_v is not None:
            grad_X.addmm_(torch.mm(v_grad, B_v_scaled), A_v_scaled)

        # Transpose gradients if needed
        if d_A_q is not None:
            d_A_q = d_A_q.t()
            d_B_q = d_B_q.t()  # type: ignore[union-attr]
        if d_A_k is not None:
            d_A_k = d_A_k.t()
            d_B_k = d_B_k.t()  # type: ignore[union-attr]
        if d_A_v is not None:
            d_A_v = d_A_v.t()
            d_B_v = d_B_v.t()  # type: ignore[union-attr]

        return (
            grad_X.view(batch, seq_len, -1),
            None,
            None,
            None,
            d_A_q,
            d_B_q,
            None,
            None,
            None,
            None,
            d_A_k,
            d_B_k,
            None,
            None,
            None,
            None,
            d_A_v,
            d_B_v,
            None,
            None,
        )


def apply_lora_qkv(
    self, X: torch.Tensor, inplace: bool = True
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Applies LoRA to compute Query, Key, Value projections.

    Args:
        X: Input tensor
        inplace: Whether to perform operations in-place

    Returns:
        Tuple of (Query, Key, Value) projection tensors
    """
    QW, Qb, QW_quant, QA, QB, QS = get_lora_parameters(self.q_proj)
    KW, Kb, KW_quant, KA, KB, KS = get_lora_parameters(self.k_proj)
    VW, Vb, VW_quant, VA, VB, VS = get_lora_parameters(self.v_proj)
    Q, K, V = LoRA_QKV.apply(
        X,
        QW,
        Qb,
        QW_quant,
        QA,
        QB,
        QS,
        KW,
        Kb,
        KW_quant,
        KA,
        KB,
        KS,
        VW,
        Vb,
        VW_quant,
        VA,
        VB,
        VS,
        inplace,
    )

    return Q, K, V


class LoRA_O(torch.autograd.Function):
    """Optimized LoRA implementation for output projection."""

    @staticmethod
    @torch_amp_custom_fwd
    def forward(
        ctx: torch.autograd.function.FunctionCtx,
        X: torch.Tensor,
        W: torch.Tensor,
        b: torch.Tensor,
        W_quant: QuantState | None,
        A: torch.Tensor,
        B: torch.Tensor,
        s: float,
    ) -> torch.Tensor:
        """
        Forward pass for output projection with LoRA.

        Args:
            ctx: Autograd context
            X: Input tensor
            W: Output projection weight
            b: Output projection bias
            W_quant: Weight quantization state
            A: LoRA A matrix
            B: LoRA B matrix
            s: LoRA scaling factor

        Returns:
            Output projection result
        """
        XW = matmul_lora(X, W, b, W_quant, A, B, s)
        ctx.custom_saved_tensors = (
            W,
            W_quant,
            s,
        )
        ctx.save_for_backward(A, B, X)

        return XW

    @staticmethod
    @torch_amp_custom_bwd
    def backward(
        ctx: torch.autograd.function.FunctionCtx,
        dY: torch.Tensor,
    ) -> tuple[
        torch.Tensor,
        None,
        None,
        None,
        torch.Tensor,
        torch.Tensor,
        None,
    ]:
        """
        Backward pass computing gradients for LoRA output projection.

        Args:
            ctx: Autograd context
            dY: Gradient of loss with respect to output

        Returns:
            Tuple containing gradients for all forward inputs
        """
        W, W_quant, s = ctx.custom_saved_tensors
        A, B, X = ctx.saved_tensors

        batch, seq_len, hd = X.shape
        dY = dY.reshape(-1, dY.shape[-1])
        X = X.reshape(-1, X.shape[-1])
        dtype = X.dtype

        # Weight projection
        dY_X = X.t() @ dY
        d_A = s * dY_X @ B
        d_B = s * A @ dY_X

        # Get derivative for dX
        W = dequantize(W.t(), W_quant)
        dX = dY @ W.t()
        del W

        A, B = A.to(dtype), B.to(dtype)
        # Stay decomposed: dY @ B gives [T, R], then [T, R] @ A gives [T, in]
        dX.addmm_(torch.mm(dY, B), A, alpha=s)

        # W, b, W_quant, A, B, s
        return dX.view(batch, seq_len, hd), None, None, None, d_A.t(), d_B.t(), None


def apply_lora_o(self, X: torch.Tensor) -> torch.Tensor:
    """
    Applies LoRA to output projection layer.

    Args:
        X: Input tensor

    Returns:
        Transformed output tensor
    """
    OW, Ob, OW_quant, OA, OB, OS = get_lora_parameters(self.o_proj)
    output = LoRA_O.apply(X, OW, Ob, OW_quant, OA, OB, OS)

    return output


================================================
FILE: src/axolotl/kernels/quantize.py
================================================
"""Dequantization utilities for `bitsandbytes` and FP8 integration."""

import ctypes

import bitsandbytes as bnb
import torch
from bitsandbytes.functional import QuantState, get_ptr
from packaging.version import Version

cdequantize_blockwise_fp32 = bnb.functional.lib.cdequantize_blockwise_fp32
cdequantize_blockwise_fp16_nf4 = bnb.functional.lib.cdequantize_blockwise_fp16_nf4
cdequantize_blockwise_bf16_nf4 = bnb.functional.lib.cdequantize_blockwise_bf16_nf4

CUDA_STREAM: torch.cuda.Stream | None = None
HAS_CUDA_STREAM: bool = Version(bnb.__version__) > Version("0.43.3")


def dequantize_fp8(
    W: torch.Tensor,
    scale_inv: torch.Tensor,
    dtype: torch.dtype = torch.bfloat16,
) -> torch.Tensor:
    """Dequantize FP8 block-quantized weights: W_dequant = W_fp8 * scale_inv.

    Args:
        W: FP8 weight tensor [out_features, in_features] in float8_e4m3fn.
        scale_inv: Per-block inverse scale [ceil(out/block), ceil(in/block)]
            or per-tensor scalar.
        dtype: Output dtype (default bf16).

    Returns:
        Dequantized tensor in the specified dtype.
    """
    W_float = W.to(dtype)
    if scale_inv.numel() == 1:
        return W_float * scale_inv.to(dtype)
    if scale_inv.dim() == 2 and W.dim() == 2:
        sr, sc = scale_inv.shape
        br = W.shape[0] // sr
        bc = W.shape[1] // sc
        # If dimensions are exactly divisible, use fast reshape path
        if sr * br == W.shape[0] and sc * bc == W.shape[1]:
            return (
                W_float.reshape(sr, br, sc, bc) * scale_inv[:, None, :, None].to(dtype)
            ).reshape(W.shape)
        # Tail-block handling: compute actual block size (ceil division),
        # tile scale_inv to cover full shape, then crop to W's dimensions
        br_ceil = -(-W.shape[0] // sr)  # ceil(rows / scale_rows) = block_size
        bc_ceil = -(-W.shape[1] // sc)
        scale_expanded = (
            scale_inv.to(dtype)
            .repeat_interleave(br_ceil, dim=0)
            .repeat_interleave(bc_ceil, dim=1)
        )[: W.shape[0], : W.shape[1]]
        return W_float * scale_expanded
    return W_float * scale_inv.to(dtype)


def dequantize(
    W: torch.Tensor,
    quant_state: QuantState | list | torch.Tensor | None = None,
    out: torch.Tensor | None = None,
) -> torch.Tensor:
    """
    Fast NF4 dequantization using `bitsandbytes` CUDA kernels.

    Performs efficient dequantization of weights from NF4 format using `bitsandbytes`'
    optimized CUDA implementations. Supports both legacy list and new `QuantState`
    formats.

    Args:
        W: Quantized weight tensor to dequantize
        quant_state: Quantization state containing metadata needed for
            dequantization. Can be either a `QuantState` object or legacy list format.
            If None, returns `W` unchanged.
        out: Optional output tensor for storing dequantized results. Must match
            expected shape and dtype if provided.

    Returns:
        Dequantized tensor in the specified dtype (fp16 or bf16). Will be transposed if
        input `W` was transposed.

    Raises:
        AssertionError: If provided output tensor doesn't match expected shape / dtype.

    Note:
        Uses CUDA streams for better performance when available in newer `bitsandbytes`
        versions (>0.43.3).
    """
    if quant_state is None:
        return W

    # FP8 path: quant_state is actually scale_inv tensor
    if W.dtype == torch.float8_e4m3fn:
        scale_inv = quant_state
        # Caller may pass W.t() (non-contiguous) — dequantize in original
        # layout then transpose back so the result shape matches the input.
        if not W.is_contiguous() and W.dim() == 2:
            return dequantize_fp8(W.t(), scale_inv).t()
        return dequantize_fp8(W, scale_inv)

    # Get the target device from input tensor W
    target_device = W.device

    # Extract quantization state
    if not isinstance(quant_state, list):
        # New style quant_state class
        absmax = quant_state.absmax.to(target_device)
        shape = quant_state.shape
        dtype = quant_state.dtype
        blocksize = quant_state.blocksize
        offset = quant_state.offset.to(target_device)
        state2 = quant_state.state2
        absmax2 = state2.absmax.to(target_device)
        code2 = state2.code.to(target_device)
        blocksize2 = state2.blocksize
    else:
        # Legacy list format
        absmax, shape, dtype, blocksize, compressed_stats, _, _ = quant_state
        absmax = absmax.to(target_device)
        offset, state2 = compressed_stats
        offset = offset.to(target_device)
        absmax2, code2, blocksize2, _, _, _, _ = state2
        absmax2 = absmax2.to(target_device)
        code2 = code2.to(target_device)

    # Setup output tensor on the same device as input
    if out is None:
        out = torch.empty(shape, dtype=dtype, device=target_device)
    else:
        assert out.shape == shape and out.dtype == dtype
        out = out.to(target_device)

    # Dequantize statistics on the target device
    n_elements_absmax: int = absmax.numel()
    out_absmax: torch.Tensor = torch.empty(
        n_elements_absmax, dtype=torch.float32, device=target_device
    )
    ptr_out_absmax: int = get_ptr(out_absmax)

    # Use CUDA stream if available
    if HAS_CUDA_STREAM:
        global CUDA_STREAM
        if CUDA_STREAM is None:
            CUDA_STREAM = torch.cuda.current_stream(target_device)

        cdequantize_blockwise_fp32(
            get_ptr(code2),
            get_ptr(absmax),
            get_ptr(absmax2),
            ptr_out_absmax,
            ctypes.c_int(blocksize2),
            ctypes.c_int(n_elements_absmax),
            CUDA_STREAM,
        )
    else:
        cdequantize_blockwise_fp32(
            get_ptr(code2),
            get_ptr(absmax),
            get_ptr(absmax2),
            ptr_out_absmax,
            ctypes.c_int(blocksize2),
            ctypes.c_int(n_elements_absmax),
        )

    out_absmax += offset

    # Choose appropriate dequantization function
    fx = (
        cdequantize_blockwise_fp16_nf4
        if dtype == torch.float16
        else cdequantize_blockwise_bf16_nf4
    )

    # Dequantize weights
    if HAS_CUDA_STREAM:
        fx(
            get_ptr(None),
            get_ptr(W),
            ptr_out_absmax,
            get_ptr(out),
            ctypes.c_int(blocksize),
            ctypes.c_int(out.numel()),
            CUDA_STREAM,
        )
    else:
        fx(
            get_ptr(None),
            get_ptr(W),
            ptr_out_absmax,
            get_ptr(out),
            ctypes.c_int(blocksize),
            ctypes.c_int(out.numel()),
        )

    # Handle transposed data
    is_transposed: bool = W.shape[0] == 1
    return out.t() if is_transposed else out


================================================
FILE: src/axolotl/kernels/swiglu.py
================================================
"""
Module for definition of SwiGLU Triton kernels.

See "GLU Variants Improve Transformer" (https://arxiv.org/abs/2002.05202).

Credit to `unsloth` (https://unsloth.ai/) for inspiration for this implementation.
"""

import torch
import triton
import triton.language as tl


@triton.jit
def _swiglu_fwd_kernel(
    gate_ptr,
    up_ptr,
    out_ptr,
    n_elements,
    block_size: tl.constexpr,
):
    """
    SwiGLU forward kernel. The kernel computes activation in fp32 precision for better
    numerical stability, then converts back to original dtype for the final result.

    Args:
        gate_ptr: Pointer to gate tensor `[*, hidden_dim]`.
        up_ptr: Pointer to up-projection tensor `[*, hidden_dim]`.
        out_ptr: Pointer to output tensor `[*, hidden_dim]`.
        n_elements: Total number of elements in the input tensors.
        block_size: Size of thread blocks for parallel computation.
    """
    block_idx = tl.program_id(0)
    offsets = block_idx * block_size + tl.arange(0, block_size)
    mask = offsets < n_elements

    # Load gate in fp32, keep up in original dtype
    gate = tl.load(gate_ptr + offsets, mask=mask, other=0).to(tl.float32)
    up = tl.load(up_ptr + offsets, mask=mask, other=0)

    # Compute activation in fp32 then convert back
    f = gate * tl.sigmoid(gate)
    f = f.to(up.dtype)
    result = f * up

    tl.store(out_ptr + offsets, result, mask=mask)


@triton.jit
def _swiglu_bwd_kernel(
    grad_out_ptr,
    gate_ptr,
    up_ptr,
    n_elements,
    block_size: tl.constexpr,
):
    """
    SwiGLU backward kernel. Stores gradient results in-place.

    Args:
        grad_out_ptr: Pointer to gradient output tensor `[*, hidden_dim]`.
        gate_ptr: Pointer to gate tensor `[*, hidden_dim]`.
        up_ptr: Pointer to up-projection tensor `[*, hidden_dim]`.
        n_elements: Total number of elements in the input tensors.
        block_size: Size of thread blocks for parallel computation.

    Note:
        After kernel execution, tensors are modified in-place:
        - `grad_out_ptr` contains forward output (`h`)
        - `gate_ptr` contains gradient w.r.t gate (`grad_gate`)
        - `up_ptr` contains gradient w.r.t up (`grad_up`)
    """
    block_idx = tl.program_id(0)
    offsets = block_idx * block_size + tl.arange(0, block_size)
    mask = offsets < n_elements

    # Load values - only convert gate to fp32
    grad_out = tl.load(grad_out_ptr + offsets, mask=mask, other=0)
    gate = tl.load(gate_ptr + offsets, mask=mask, other=0).to(tl.float32)
    up = tl.load(up_ptr + offsets, mask=mask, other=0)

    # Compute SiLU and forward output
    sigmoid_gate = tl.sigmoid(gate)
    silu_gate = sigmoid_gate * gate
    silu_gate = silu_gate.to(grad_out.dtype)
    h = silu_gate * up

    # Compute gradients
    grad_up = grad_out * silu_gate  # gradient for up is grad_out * SiLU(gate)

    # Compute gate gradient
    temp = grad_out * up
    grad_gate = temp.to(tl.float32) * sigmoid_gate * (1.0 + gate * (1.0 - sigmoid_gate))
    grad_gate = grad_gate.to(grad_out.dtype)

    # Store results with correct gradient ordering
    tl.store(grad_out_ptr + offsets, h, mask=mask)
    tl.store(gate_ptr + offsets, grad_gate, mask=mask)  # grad wrt gate
    tl.store(up_ptr + offsets, grad_up, mask=mask)  # grad wrt up


def swiglu_forward(gate: torch.Tensor, up: torch.Tensor) -> torch.Tensor:
    """
    SwiGLU forward pass. Computes SwiGLU activation: `x * sigmoid(x) * up`, where
    `x` is the gate tensor.

    Args:
        gate: Input gate tensor of shape `[batch, seq_len, hidden_dim]`.
        up: Up-projection tensor of shape `[batch, seq_len, hidden_dim]`.

    Returns:
        Output tensor of shape `[batch, seq_len, hidden_dim]`.
    """
    batch, seq_len, hidden_dim = gate.shape
    n_elements = gate.numel()
    out = torch.empty((batch, seq_len, hidden_dim), dtype=gate.dtype, device="cuda")

    grid = lambda meta: (triton.cdiv(n_elements, meta["block_size"]),)  # noqa: E731
    _swiglu_fwd_kernel[grid](
        gate_ptr=gate,
        up_ptr=up,
        out_ptr=out,
        n_elements=n_elements,
        block_size=1024,
    )

    return out


def swiglu_backward(
    grad_output: torch.Tensor, gate: torch.Tensor, up: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    SwiGLU backward pass using in-place operations.

    Args:
        grad_output: Gradient of loss with respect to output, shape `[batch, seq_len, hidden_dim]`.
        gate: Gate tensor from forward pass, shape `[batch, seq_len, hidden_dim]`.
        up: Up-projection tensor from forward pass, shape `[batch, seq_len, hidden_dim]`.

    Returns:
        Tuple containing:
            - Forward pass output (`h`)
            - Gradient with respect to gate (`df`)
            - Gradient with respect to up-projection (`de`)
    """
    n_elements = grad_output.numel()

    grid = lambda meta: (triton.cdiv(n_elements, meta["block_size"]),)  # noqa: E731
    _swiglu_bwd_kernel[grid](
        grad_out_ptr=grad_output,
        gate_ptr=gate,
        up_ptr=up,
        n_elements=n_elements,
        block_size=1024,
    )

    # After kernel execution, tensors contain:
    # grad_output: h (forward output)
    # gate: grad_gate (grad wrt gate)
    # up: grad_up (grad wrt up)
    return grad_output, gate, up


================================================
FILE: src/axolotl/kernels/utils.py
================================================
"""Utilities for `axolotl.kernels` submodules."""

import torch
from packaging.version import Version

if Version(torch.__version__) < Version("2.4.0"):
    torch_amp_custom_fwd = torch.cuda.amp.custom_fwd
    torch_amp_custom_bwd = torch.cuda.amp.custom_bwd
else:
    torch_amp_custom_fwd = torch.amp.custom_fwd(device_type="cuda")
    torch_amp_custom_bwd = torch.amp.custom_bwd(device_type="cuda")


================================================
FILE: src/axolotl/loaders/__init__.py
================================================
"""Init for axolotl.loaders module"""

# flake8: noqa

from .adapter import load_adapter, load_lora
from .constants import MULTIMODAL_AUTO_MODEL_MAPPING
from .model import ModelLoader
from .processor import load_processor
from .tokenizer import load_tokenizer


================================================
FILE: src/axolotl/loaders/adapter.py
================================================
"""Adapter loading functionality, including LoRA / QLoRA and associated utils"""

import os
import types
from typing import Any

import bitsandbytes as bnb
import torch
from bitsandbytes.nn import Params4bit
from peft import (
    AdaptionPromptConfig,
    LoftQConfig,
    LoraConfig,
    PeftConfig,
    PeftMixedModel,
    PeftModel,
    TaskType,
    get_peft_model,
)
from transformers import PreTrainedModel

from axolotl.loaders.utils import get_linear_embedding_layers
from axolotl.telemetry.errors import send_errors
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def setup_quantized_meta_for_peft(model: torch.nn.Module):
    """Replaces `quant_state.to` with a dummy function to prevent PEFT from moving `quant_state` to meta device"""

    def temp_to_method(self, *args, **kwargs):
        return self

    for param in model.parameters():
        if isinstance(param, Params4bit) and param.quant_state is not None:
            param.quant_state._orig_to = param.quant_state.to
            param.quant_state.to = types.MethodType(temp_to_method, param.quant_state)


def setup_quantized_peft_meta_for_training(model: torch.nn.Module):
    """Replaces dummy `quant_state.to` method with the original function to allow training to continue"""
    for param in model.parameters():
        if isinstance(param, Params4bit) and hasattr(param.quant_state, "_orig_to"):
            param.quant_state.to = param.quant_state._orig_to
            param.quant_state._orig_to = None


def find_all_linear_names(model):
    cls = (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt, torch.nn.Linear)
    lora_module_names = set()
    for name, module in model.named_modules():
        if (
            isinstance(module, cls)
            or "Linear" in module.__class__.__name__
            and module.__class__.__name__ not in ("LlamaLinearScalingRotaryEmbedding",)
        ):
            names = name.split(".")
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])

    embedding_modules = get_linear_embedding_layers(model.config.model_type)
    output_embedding = embedding_modules[1]
    if output_embedding in lora_module_names:  # needed for 16-bit
        lora_module_names.remove(output_embedding)

    return list(lora_module_names)


def load_lora(
    model: PreTrainedModel,
    cfg: DictDefault,
    inference: bool = False,
    config_only: bool = False,
) -> tuple[PreTrainedModel | PeftModel | PeftMixedModel | None, PeftConfig | None]:
    lora_target_modules = cfg.lora_target_modules or []
    lora_target_parameters = cfg.lora_target_parameters or []

    if cfg.lora_target_linear:
        linear_names = find_all_linear_names(model)
        LOG.info(f"found linear modules: {repr(sorted(linear_names))}")
        lora_target_modules_as_list = (
            lora_target_modules
            if isinstance(lora_target_modules, list)
            else [lora_target_modules]
        )
        lora_target_modules = list(set(lora_target_modules_as_list + linear_names))

    lora_config_kwargs = {}
    loftq_bits = cfg.peft and cfg.peft.loftq_config and cfg.peft.loftq_config.loftq_bits
    if loftq_bits:
        lora_config_kwargs["loftq_config"] = LoftQConfig(loftq_bits=loftq_bits)
        lora_config_kwargs["init_lora_weights"] = "loftq"
    if cfg.peft_init_lora_weights:
        lora_config_kwargs["init_lora_weights"] = cfg.peft_init_lora_weights
    if cfg.peft_use_dora:
        lora_config_kwargs["use_dora"] = cfg.peft_use_dora
        LOG.info("Initializing LoRA weights using dora. This might take longer.")
    if cfg.peft_use_rslora:
        lora_config_kwargs["use_rslora"] = cfg.peft_use_rslora
    if cfg.peft_layer_replication:
        lora_config_kwargs["layer_replication"] = cfg.peft_layer_replication
    if cfg.peft_trainable_token_indices:
        lora_config_kwargs["trainable_token_indices"] = cfg.peft_trainable_token_indices
    if cfg.peft_ensure_weight_tying is not None:
        lora_config_kwargs["ensure_weight_tying"] = cfg.peft_ensure_weight_tying

    # Determine the correct PEFT task type
    model_cls = type(model).__name__
    if "SequenceClassification" in model_cls:
        task_type = TaskType.SEQ_CLS
    elif "TokenClassification" in model_cls:
        task_type = TaskType.TOKEN_CLS
    else:
        task_type = TaskType.CAUSAL_LM

    lora_config = LoraConfig(
        r=cfg.lora_r,
        lora_alpha=cfg.lora_alpha,
        target_modules=lora_target_modules,
        target_parameters=lora_target_parameters,
        layers_to_transform=cfg.peft_layers_to_transform,
        layers_pattern=cfg.peft_layers_pattern,
        lora_dropout=cfg.lora_dropout,
        fan_in_fan_out=cfg.lora_fan_in_fan_out,
        modules_to_save=cfg.lora_modules_to_save if cfg.lora_modules_to_save else None,
        bias="none",
        task_type=task_type,
        **lora_config_kwargs,
    )

    if config_only:
        return None, lora_config

    rank = int(os.environ.get("LOCAL_RANK", 0))

    if (
        cfg.fsdp_config
        and cfg.adapter
        and cfg.fsdp_config.cpu_ram_efficient_loading
        and rank != 0
    ):
        setup_quantized_meta_for_peft(model)

    model_kwargs: Any = {}
    if cfg.peft_autocast_adapter_dtype is not None:
        model_kwargs["autocast_adapter_dtype"] = cfg.peft_autocast_adapter_dtype

    if cfg.lora_model_dir:
        LOG.debug("Loading pretrained PEFT - LoRA")
        if cfg.lora_on_cpu:
            model_kwargs["max_memory"] = {"cpu": "256GiB"}
            model_kwargs["device_map"] = {"": "cpu"}
        model = PeftModel.from_pretrained(
            model,
            cfg.lora_model_dir,
            is_trainable=(not inference),
            **model_kwargs,
        )
    else:
        model = get_peft_model(model, lora_config, **model_kwargs)

    # FP8 models: LoRA A/B inherit FP8 dtype from base weights, but training
    # requires a compute dtype (bf16/fp16). Cast trainable LoRA params.
    if cfg.torch_dtype:
        _fp8_cast_dtype = cfg.torch_dtype
    elif torch.cuda.is_available() and torch.cuda.is_bf16_supported():
        _fp8_cast_dtype = torch.bfloat16
    else:
        _fp8_cast_dtype = torch.float16
    for _name, param in model.named_parameters():
        if param.requires_grad and param.dtype == torch.float8_e4m3fn:
            param.data = param.data.to(_fp8_cast_dtype)

    if rank == 0:
        try:
            model.print_trainable_parameters()
        except AttributeError as exc:
            LOG.warning(
                "Exception caught during model.print_trainable_parameters(): %s", exc
            )
    elif (
        cfg.fsdp_config
        and cfg.adapter
        and cfg.fsdp_config.cpu_ram_efficient_loading
        and rank != 0
    ):
        setup_quantized_peft_meta_for_training(model)

    return model, lora_config


@send_errors
def load_adapter(
    model: PreTrainedModel,
    cfg: DictDefault,
    adapter: str | None,
    inference: bool = False,
) -> tuple[PreTrainedModel | PeftModel | PeftMixedModel, PeftConfig | None]:
    if adapter is None:
        return model, None
    if hasattr(model, "enable_input_require_grads"):
        model.enable_input_require_grads()
    if adapter in ["lora", "qlora"]:
        peft_model, lora_config = load_lora(model, cfg, inference=inference)
        return peft_model, lora_config
    if adapter == "llama-adapter":
        peft_model, lora_config = load_llama_adapter(model, cfg)
        return peft_model, lora_config

    raise NotImplementedError(f"{adapter} PEFT adapter not available")


def load_llama_adapter(
    model: PreTrainedModel, cfg: DictDefault
) -> tuple[PeftModel | PeftMixedModel, PeftConfig]:
    peft_config = AdaptionPromptConfig(
        adapter_layers=cfg.peft_adapter.layers,  # layers (L)
        adapter_len=cfg.peft_adapter.len,  # prompt length (K)
        task_type="CAUSAL_LM",
    )

    if cfg.lora_model_dir:
        LOG.debug("Loading pretrained PEFT - llama_adapter")
        peft_model = PeftModel.from_pretrained(
            model,
            cfg.lora_model_dir,
            torch_dtype=torch.float16,
        )
    else:
        peft_model = get_peft_model(model, peft_config)

    peft_model.print_trainable_parameters()

    return peft_model, peft_config


================================================
FILE: src/axolotl/loaders/adapters/__init__.py
================================================


================================================
FILE: src/axolotl/loaders/constants.py
================================================
"""Shared constants for axolotl.loaders module"""

from transformers import AutoModelForImageTextToText
from transformers.models.auto.modeling_auto import (
    MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES,
)

MULTIMODAL_AUTO_MODEL_MAPPING = dict(MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMES)

MULTIMODAL_AUTO_MODEL_MAPPING["lfm2-vl"] = AutoModelForImageTextToText

try:
    from transformers import VoxtralForConditionalGeneration

    # transformers >4.53.2
    MULTIMODAL_AUTO_MODEL_MAPPING["voxtral"] = VoxtralForConditionalGeneration
except ImportError:
    pass


================================================
FILE: src/axolotl/loaders/model.py
================================================
"""
Model loader class implementation for loading, configuring, and patching various models.
"""

import gc
import math
import os
from functools import cached_property
from importlib.util import find_spec
from typing import Any

import peft
import torch
import transformers
import transformers.modeling_utils
from accelerate import init_empty_weights
from accelerate.parallelism_config import ParallelismConfig
from peft import (
    PeftConfig,
    PeftMixedModel,
    PeftModel,
    PeftModelForCausalLM,
    prepare_model_for_kbit_training,
)
from torch.distributed import DeviceMesh
from transformers import (
    AutoModelForCausalLM,
    AutoModelForImageTextToText,
    AwqConfig,
    BitsAndBytesConfig,
    GPTQConfig,
    PreTrainedModel,
    PreTrainedTokenizerBase,
)
from transformers.integrations.deepspeed import (
    HfTrainerDeepSpeedConfig,
    is_deepspeed_zero3_enabled,
)

from axolotl.common.architectures import MOE_ARCH_BLOCK
from axolotl.integrations.base import PluginManager
from axolotl.loaders.adapter import load_adapter, load_lora
from axolotl.loaders.constants import MULTIMODAL_AUTO_MODEL_MAPPING
from axolotl.loaders.patch_manager import PatchManager
from axolotl.loaders.utils import (
    get_linear_embedding_layers,
    get_module_class_from_name,
    load_model_config,
)
from axolotl.models.mamba import fix_mamba_attn_for_loss
from axolotl.telemetry.errors import send_errors
from axolotl.utils.bench import log_gpu_memory_usage
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import (
    build_parallelism_config,
    get_device_count,
    get_device_type,
)
from axolotl.utils.logging import get_logger
from axolotl.utils.model_shard_quant import load_sharded_model_quant
from axolotl.utils.schemas.enums import RLType

LOG = get_logger(__name__)
PLUGIN_MANAGER = PluginManager.get_instance()


class ModelLoader:
    """Manages model configuration, initialization and application of patches during
    model loading.

    This class orchestrates the entire process of loading a model from configuration to
    final preparation. It handles device mapping, quantization, attention mechanisms,
    adapter integration, and various optimizations.

    The loading process includes:
        - Loading and validating model configuration
        - Applying monkey patches for optimizations / fixes
        - Setting up device mapping (including multi-GPU configurations)
        - Configuring quantization
        - Setting attention mechanisms (Flash Attention, SDPA, etc.)
        - Loading and initializing the model
        - Applying adapters (LoRA, QLoRA, etc.)

    Attributes:
        model: The loaded model instance (available after load() is called).
        model_kwargs: Dictionary of keyword arguments passed to model initialization.
        base_model: Name or path of the base model to load.
        model_type: Type of model to load (e.g., `AutoModelForCausalLM`).
        model_config: Configuration object for the model.
        auto_model_loader: class used for loading the model (default:
            `AutoModelForCausalLM`).
    """

    use_parallel_config: bool | None = False
    parallelism_config: ParallelismConfig | None = None
    device_mesh: DeviceMesh | None = None

    def __init__(
        self,
        cfg: DictDefault,
        tokenizer: PreTrainedTokenizerBase,
        *,
        inference: bool = False,
        reference_model: bool = False,
        **kwargs,
    ):
        """Initializes the ModelLoader.

        Args:
            cfg: Configuration dictionary with model and training settings.
            tokenizer: Tokenizer instance associated with the model.
            processor: Optional processor for multimodal models. Defaults to None.
            inference: Whether the model is being loaded for inference mode. Defaults
                to False.
            reference_model: Whether this is a reference model (used in setups like DPO
                training). Defaults to False.
            **kwargs: Additional keyword arguments (ignored).
        """
        self.cfg = cfg
        self.tokenizer = tokenizer
        self.inference: bool = inference
        self.reference_model: bool = reference_model

        # Init model kwargs
        self.model_kwargs: dict[str, Any] = {}
        if cfg.overrides_of_model_kwargs:
            for key, val in cfg.overrides_of_model_kwargs.items():
                self.model_kwargs[key] = val

        # Init model
        self.model: PreTrainedModel | PeftModel | PeftMixedModel
        self.base_model = cfg.base_model
        self.model_type = cfg.type_of_model

        # Init model config
        self.model_config = load_model_config(cfg)
        self.auto_model_loader = AutoModelForCausalLM

        # Initialize the patch manager
        self.patch_manager = PatchManager(
            cfg=cfg,
            model_config=self.model_config,
            inference=inference,
        )

    @cached_property
    def has_flash_attn(self) -> bool:
        """Check if flash attention is installed."""
        return find_spec("flash_attn") is not None

    @property
    def is_fsdp_enabled(self):
        """Property that determines if FSDP is enabled."""
        return self.cfg.fsdp_config is not None or self.cfg.fsdp is not None

    @property
    def is_qlora_and_fsdp_enabled(self):
        """Property that determines if FSDP with QLoRA is enabled."""
        return self.is_fsdp_enabled and self.cfg.adapter == "qlora"

    @send_errors
    def load(self) -> tuple[PreTrainedModel | PeftModelForCausalLM, PeftConfig | None]:
        """Load and prepare the model with all configurations and patches.

        Returns:
            A tuple with the loaded model and its LoRA configuration (if applicable).
        """
        # Initial setup and patches
        self.patch_manager.apply_pre_model_load_patches()
        self._apply_pre_model_load_setup()

        # Build the model
        PLUGIN_MANAGER.pre_model_load(self.cfg)
        self.patch_manager.apply_post_plugin_pre_model_load_patches()

        skip_move_to_device = self._build_model()
        self.patch_manager.apply_post_model_build_patches(self.model)

        PLUGIN_MANAGER.post_model_build(self.cfg, self.model)

        # Post-build model configuration
        self._apply_post_model_load_setup()

        # Load adapters (LoRA, etc.)
        PLUGIN_MANAGER.pre_lora_load(self.cfg, self.model)
        lora_config = self._load_adapters()
        PLUGIN_MANAGER.post_lora_load(self.cfg, self.model)

        # Apply remaining patches and finalize
        self._apply_post_lora_load_setup(skip_move_to_device)
        self.patch_manager.apply_post_model_load_patches(self.model)
        PLUGIN_MANAGER.post_model_load(self.cfg, self.model)

        return self.model, lora_config

    def _apply_pre_model_load_setup(self):
        """Apply patches and setup configurations before model loading."""
        if self.use_parallel_config is not None:
            self.use_parallel_config = (
                self.cfg.fsdp_config
                or (self.cfg.tensor_parallel_size and self.cfg.tensor_parallel_size > 1)
                or (
                    self.cfg.context_parallel_size
                    and self.cfg.context_parallel_size > 1
                )
            )
            if self.cfg.fsdp_config and self.cfg.fsdp_version != 2:
                self.use_parallel_config = False

        if self.use_parallel_config:
            self._set_parallel_config()
        self._set_auto_model_loader()
        self._set_device_map_config()
        if self.cfg.revision_of_model:
            self.model_kwargs["revision"] = self.cfg.revision_of_model
        if self.cfg.use_kernels:
            self.model_kwargs["use_kernels"] = self.cfg.use_kernels
            if "allow_all_kernels" not in self.model_kwargs:
                self.model_kwargs["allow_all_kernels"] = self.cfg.use_kernels
        self._set_quantization_config()
        self._set_attention_config()
        self._check_model_requirements()

    def _apply_post_model_load_setup(self):
        """Configure the model after it has been loaded."""
        # Handle PeftModel if needed
        if (
            isinstance(self.model, (peft.PeftModel, peft.PeftModelForCausalLM))
            and not self.is_qlora_and_fsdp_enabled
        ):
            self.model = self.model.merge_and_unload()

        self._configure_experts_implementation()
        self._apply_activation_checkpointing()
        self._resize_token_embeddings()
        self._adjust_model_config()
        self._configure_embedding_dtypes()
        self._configure_qat()
        log_gpu_memory_usage(LOG, "Memory usage after model load", 0)

    def _configure_experts_implementation(self):
        if self.cfg.experts_implementation is not None:
            self.model.set_experts_implementation(self.cfg.experts_implementation)

    def _apply_activation_checkpointing(self):
        if self.cfg.activation_offloading is True:
            from axolotl.core.trainers.mixins.activation_checkpointing import (
                ac_wrap_hf_model,
            )

            # ^^ importing this at the module level breaks plugins
            ac_wrap_hf_model(self.model)

    def _resize_token_embeddings(self):
        """Resize token embeddings if needed."""
        embeddings_len = (
            math.ceil(len(self.tokenizer) / 32) * 32
            if self.cfg.resize_token_embeddings_to_32x
            else len(self.tokenizer)
        )
        if hasattr(self.model, "get_input_embeddings") and (
            self.model.get_input_embeddings().num_embeddings < embeddings_len
            or (
                self.model.get_input_embeddings().num_embeddings > embeddings_len
                and self.cfg.shrink_embeddings
            )
        ):
            resize_kwargs = {}
            if self.cfg.mean_resizing_embeddings is not None and (
                self.model_config.model_type != "llava"
            ):
                resize_kwargs["mean_resizing"] = self.cfg.mean_resizing_embeddings
            self.model.resize_token_embeddings(embeddings_len, **resize_kwargs)
        else:
            self.model.tie_weights()

    def _adjust_model_config(self):
        if (
            hasattr(self.model, "config")
            and hasattr(self.model.config, "max_position_embeddings")
            and self.model.config.max_position_embeddings
            and self.cfg.sequence_len > self.model.config.max_position_embeddings
        ):
            LOG.warning(
                "increasing model.config.max_position_embeddings from "
                f"{self.model.config.max_position_embeddings} to {self.cfg.sequence_len}"
            )
            self.model.config.max_position_embeddings = self.cfg.sequence_len

        if (
            hasattr(self.model, "config")
            and hasattr(self.model.config, "bos_token_id")
            and self.model.config.bos_token_id
            and self.model.config.bos_token_id != self.tokenizer.bos_token_id
        ):
            self.model.config.bos_token_id = self.tokenizer.bos_token_id

        if (
            hasattr(self.model, "config")
            and hasattr(self.model.config, "eos_token_id")
            and self.model.config.eos_token_id
            and self.model.config.eos_token_id != self.tokenizer.eos_token_id
        ):
            self.model.config.eos_token_id = self.tokenizer.eos_token_id

    def _configure_embedding_dtypes(self):
        """Configure embedding module dtypes."""
        # Get embedding modules
        embedding_modules = get_linear_embedding_layers(self.cfg.model_config_type)

        # Initial dtype conversion
        if not self.is_fsdp_enabled:
            # We don't run this during FSDP because this will leave mixed and bfloat16
            # dtypes in the model which FSDP doesn't like
            if self.cfg.load_in_4bit and self.cfg.embeddings_skip_upcast:
                embedding_modules = []
            self._convert_embedding_modules_dtype(
                embedding_modules,
                dist_dtype=torch.float32,
                before_kbit_train_or_finetune=True,
            )

        # Handle DeepSpeed Zero3
        if (
            is_deepspeed_zero3_enabled()
            or os.getenv("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3"
        ):
            self._set_z3_leaf_modules()

        # Apply gradient checkpointing if needed
        needs_fa2_dtype = self.cfg.adapter or self.is_fsdp_enabled
        if self.cfg.adapter in ["lora", "qlora"]:
            needs_fa2_dtype = True
            if self.cfg.gradient_checkpointing:
                self.model.gradient_checkpointing_enable(
                    gradient_checkpointing_kwargs=self.cfg.gradient_checkpointing_kwargs
                )

        self._prepare_model_for_quantization()

        # Convert dtypes if needed
        should_convert = (
            # LlamaRMSNorm layers are in fp32 after kbit_training or full finetune, so
            # we need to convert them back to fp16/bf16 for flash-attn compatibility.
            (
                (
                    needs_fa2_dtype
                    or self.cfg.flash_attention
                    or self.cfg.flex_attention
                    or self.cfg.sage_attention
                )
                and not self.is_qlora_and_fsdp_enabled
            )
            or (
                # CCE requires embedding layers to be in fp16/bf16 for backward pass
                self.cfg.cut_cross_entropy
            )
        )

        if should_convert:
            LOG.info("Converting modules to %s", self.cfg.torch_dtype)
            self._convert_embedding_modules_dtype(
                embedding_modules=embedding_modules,
                dist_dtype=self.cfg.torch_dtype,
                before_kbit_train_or_finetune=False,
            )

    def _configure_qat(self):
        """Configure QAT."""
        if self.cfg.qat:
            from axolotl.utils.quantization import prepare_model_for_qat

            prepare_model_for_qat(
                self.model,
                self.cfg.qat.weight_dtype,
                self.cfg.qat.group_size,
                self.cfg.qat.activation_dtype,
                self.cfg.qat.quantize_embedding,
            )

    def _load_adapters(self) -> PeftConfig | None:
        """Load LoRA or other adapters."""
        # Load LoRA or adapter
        lora_config = None
        if not self.reference_model or self.cfg.lora_model_dir:
            # If we're not loading the reference model, then we're loading the model
            # for training. Then, the DPO trainer doesn't want the PEFT model loaded
            # over it, it just wants the LoRA / PEFT config.
            if (
                self.cfg.adapter
                and self.cfg.rl in [RLType.DPO, RLType.IPO, RLType.KTO]
                and not self.cfg.merge_lora
            ):
                _, lora_config = load_lora(
                    self.model, self.cfg, inference=False, config_only=True
                )
            else:
                self.model, lora_config = load_adapter(
                    self.model, self.cfg, self.cfg.adapter
                )

        return lora_config

    def _apply_post_lora_load_setup(self, skip_move_to_device: bool):
        """Apply final optimizations and patches."""
        # Place model on accelerator
        if (
            self.cfg.ddp
            and not self.cfg.load_in_8bit
            and not (self.cfg.rl and self.cfg.load_in_4bit)
            and not skip_move_to_device
        ):
            self.model.to(f"{str(get_device_type())}:{self.cfg.local_rank}")

        if get_device_count() > 1 and int(os.getenv("WORLD_SIZE", "1")) == 1:
            self.model.is_parallelizable = True
            self.model.model_parallel = True

        if not any(
            param.requires_grad
            for _, param in self.model.named_parameters(recurse=True)
        ):
            LOG.warning("There are no parameters that require gradient updates")

        if self.cfg.flash_optimum:
            from optimum.bettertransformer import BetterTransformer

            self.model = BetterTransformer.transform(self.model)

        if self.cfg.adapter is not None:
            log_gpu_memory_usage(LOG, "after adapters", self.model.device)

        for _ in range(3):
            gc.collect()
            torch.cuda.empty_cache()

    def _set_parallel_config(self):
        """Set parallelism configuration (DP, FSDP, TP, CP) in PartialState/Accelerator"""
        parallelism_config, device_mesh = build_parallelism_config(self.cfg)
        if parallelism_config:
            self.parallelism_config = parallelism_config
            self.device_mesh = device_mesh

    def _set_auto_model_loader(self):
        """Set `self.auto_model_loader`. Defaults to `transformers.AutoModelForCausalLM`
        (set at `__init__`). When using a multimodal model, `self.auto_model_loader`
        should be set according to the type of the model.
        """
        if self.cfg.is_multimodal:
            self.auto_model_loader = MULTIMODAL_AUTO_MODEL_MAPPING.get(
                self.model_config.model_type, AutoModelForImageTextToText
            )
            if isinstance(self.auto_model_loader, str):
                self.auto_model_loader = AutoModelForImageTextToText

    def _set_device_map_config(self):
        """Setup `device_map` according to config"""
        device_map = self.cfg.device_map
        max_memory = self.cfg.max_memory

        if self.cfg.gpu_memory_limit:
            gpu_memory_limit = (
                str(self.cfg.gpu_memory_limit) + "GiB"
                if isinstance(self.cfg.gpu_memory_limit, int)
                else self.cfg.gpu_memory_limit
            )

            max_memory = {}
            num_device = get_device_count()
            for i in range(num_device):
                max_memory[i] = gpu_memory_limit
            max_memory["cpu"] = "256GiB"  # something sufficiently large to fit anything

        if max_memory is not None:
            # Based on https://github.com/togethercomputer/OpenChatKit/blob/main/inference/bot.py
            from accelerate import infer_auto_device_map

            with init_empty_weights():
                model_canvas = self.auto_model_loader.from_config(
                    self.model_config,
                    trust_remote_code=self.cfg.trust_remote_code or False,
                )
            model_canvas.tie_weights()
            device_map = infer_auto_device_map(
                model_canvas,
                max_memory=max_memory,
                dtype=self.cfg.torch_dtype,
            )
            # We can discard max_memory now as we have a device map set up
            max_memory = None

        self.model_kwargs["torch_dtype"] = self.cfg.torch_dtype
        self.model_kwargs["dtype"] = self.cfg.torch_dtype

        is_ds_zero3 = is_deepspeed_zero3_enabled()

        # FSDP requires control over device placement, so don't set device_map when FSDP is enabled
        if self.is_fsdp_enabled:
            # For QLoRA + FSDP, we still need to set device_map to "auto" for proper initialization
            if self.is_qlora_and_fsdp_enabled:
                self.model_kwargs["device_map"] = {
                    "": int(os.environ.get("LOCAL_RANK", 0))
                }
            # For other FSDP cases, don't set device_map at all
        elif not is_ds_zero3:
            self.model_kwargs["device_map"] = device_map

            # quantize_moe_experts quantizes expert weights on-the-fly during loading,
            # so the actual VRAM usage is much less than bf16 estimates.
            # When device_map is "auto", accelerate's infer_auto_device_map computes
            # the device map at bf16 size (before quantization), causing it to offload
            # layers to CPU, which BnB then rejects. Force single-GPU placement to
            # prevent this. Only applies to the non-FSDP, non-ZeRO3 path (DDP/single).
            if getattr(self.cfg, "quantize_moe_experts", False) and device_map in (
                "auto",
                None,
            ):
                self.model_kwargs["device_map"] = {
                    "": int(os.environ.get("LOCAL_RANK", 0))
                }

            cur_device = get_device_type()
            if "mps" in str(cur_device):
                self.model_kwargs["device_map"] = "mps:0"
            elif "npu" in str(cur_device):
                self.model_kwargs["device_map"] = "npu:0"

        # TODO: can we put the reference model on it's own gpu? I think we have to move
        # logits around to calculate loss
        # if cfg.rl:
        #     if torch.cuda.device_count() > 1:
        #         if reference_model:
        #             model_kwargs["device_map"] = "cuda:" + str(
        #                 torch.cuda.current_device() + 1
        #             )
        #         else:
        #             model_kwargs["device_map"] = "cuda:" + str(torch.cuda.current_device())

    def _set_quantization_config(self):
        """Set up quantization config (bitsandbytes, awq, gptq, etc.)"""

        if self.cfg.model_quantization_config == "Mxfp4Config":
            from transformers import Mxfp4Config

            mxfp4_kwargs = {}
            if self.cfg.model_quantization_config_kwargs:
                mxfp4_kwargs = self.cfg.model_quantization_config_kwargs
            self.model_kwargs["quantization_config"] = Mxfp4Config(**mxfp4_kwargs)

        if self.cfg.gptq:
            if not hasattr(self.model_config, "quantization_config"):
                LOG.warning(
                    "model config does not contain quantization_config information"
                )
            else:
                if self.cfg.gptq_disable_exllama is not None:
                    self.model_config.quantization_config["disable_exllama"] = (
                        self.cfg.gptq_disable_exllama
                    )
                self.model_kwargs["quantization_config"] = GPTQConfig(
                    **self.model_config.quantization_config
                )
        if (
            self.cfg.adapter in ["qlora", "lora"]
            and hasattr(self.model_config, "quantization_config")
            and self.model_config.quantization_config["quant_method"]
            in ["gptq", "awq", "bitsandbytes"]
        ):
            if self.model_config.quantization_config["quant_method"] == "gptq":
                self.model_kwargs["quantization_config"] = GPTQConfig(
                    **self.model_config.quantization_config
                )
            elif self.model_config.quantization_config["quant_method"] == "awq":
                self.model_kwargs["quantization_config"] = AwqConfig(
                    **self.model_config.quantization_config
                )
            elif (
                self.model_config.quantization_config["quant_method"] == "bitsandbytes"
            ):
                self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
                    **self.model_config.quantization_config
                )
        elif self.cfg.adapter == "qlora" and self.cfg.load_in_4bit:
            bnb_config = {
                "load_in_4bit": True,
                "llm_int8_threshold": 6.0,
                "llm_int8_has_fp16_weight": False,
                "bnb_4bit_compute_dtype": self.cfg.torch_dtype,
                "bnb_4bit_use_double_quant": True,
                "bnb_4bit_quant_type": "nf4",
                "bnb_4bit_quant_storage": torch.bfloat16,
            }
            if self.cfg.model_config_type in ["jamba", "qwen2_moe"] and not (
                self.cfg.deepspeed or self.is_fsdp_enabled
            ):
                # for some reason, this causes the loss to be off by an order of magnitude
                # but deepspeed needs this still in bfloat16
                bnb_config["bnb_4bit_quant_storage"] = torch.float32
            if self.cfg.model_config_type == "falcon_h1":
                # output projection cannot be quantized for Falcon-H1 models
                bnb_config["llm_int8_skip_modules"] = ["out_proj"]

            if self.cfg.bnb_config_kwargs:
                bnb_config.update(self.cfg.bnb_config_kwargs)

            self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
                **bnb_config,
            )
        elif self.cfg.adapter == "lora" and self.cfg.load_in_8bit:
            bnb_config = {
                "load_in_8bit": True,
            }
            # Exclude mamba blocks from int8 quantization for jamba
            if self.cfg.model_config_type == "jamba":
                bnb_config["llm_int8_skip_modules"] = ["mamba"]
            if self.cfg.model_config_type == "falcon_h1":
                # output projection cannot be quantized for Falcon-H1 models
                bnb_config["llm_int8_skip_modules"] = ["out_proj"]
            self.model_kwargs["quantization_config"] = BitsAndBytesConfig(
                **bnb_config,
            )

    def _set_attention_config(self):
        """Sample packing uses custom FA2 patch"""
        if self.cfg.attn_implementation:
            self.model_kwargs["attn_implementation"] = self.cfg.attn_implementation
        elif self.cfg.flex_attention:
            self.model_kwargs["attn_implementation"] = "flex_attention"
            self.model_config._attn_implementation = "flex_attention"

        elif self.cfg.flash_attention:
            if not self.cfg.sample_packing and self.cfg.s2_attention:
                pass
            self.model_kwargs["attn_implementation"] = "flash_attention_2"
            self.model_config._attn_implementation = "flash_attention_2"
        elif self.cfg.sdp_attention:
            self.model_kwargs["attn_implementation"] = "sdpa"
            self.model_config._attn_implementation = "sdpa"
        elif self.cfg.sage_attention:
            # sets FA2 attention to re-use same internal handling like masking
            self.model_kwargs["attn_implementation"] = "flash_attention_2"
            self.model_config._attn_implementation = "flash_attention_2"
        elif self.cfg.eager_attention:
            self.model_kwargs["attn_implementation"] = "eager"
            self.model_config._attn_implementation = "eager"

        if self.cfg.low_cpu_mem_usage:
            self.model_kwargs["low_cpu_mem_usage"] = True

    def _check_model_requirements(self):
        if self.cfg.model_config_type in ["lfm2-vl", "lfm2"]:
            from transformers.utils.import_utils import is_causal_conv1d_available

            if is_causal_conv1d_available():
                raise ImportError(
                    "The 'causal-conv1d' package is installed but causes compatibility issues with LFM2 models. "
                    "Please uninstall it by running: `pip uninstall -y causal-conv1d`"
                )

    def _configure_zero3_memory_efficient_loading(
        self,
    ) -> HfTrainerDeepSpeedConfig | None:
        """
        Set the deepspeed config to load the model into RAM first before moving to VRAM.

        IMPORTANT
        ==========

        We need to return `hf_ds_cfg` as it needs to exist before model loading for zero3.
        HfTrainerDeepSpeedConfig is a class that is used to configure the DeepSpeed training.
        It is not passed anywhere in the model loading function, just need to exist.
        """
        hf_ds_cfg = None

        if os.getenv("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3":
            hf_ds_cfg = HfTrainerDeepSpeedConfig(self.cfg.deepspeed)
            hf_ds_cfg.fill_match(
                "train_micro_batch_size_per_gpu", self.cfg.micro_batch_size
            )
            hf_ds_cfg.fill_match(
                "gradient_accumulation_steps", self.cfg.gradient_accumulation_steps
            )
            hf_ds_cfg.fill_match(
                "train_batch_size",
                int(os.getenv("WORLD_SIZE", "1"))
                * self.cfg.micro_batch_size
                * self.cfg.gradient_accumulation_steps,
            )
            if "device_map" in self.model_kwargs:
                del self.model_kwargs["device_map"]

            transformers.modeling_utils.is_deepspeed_zero3_enabled = lambda: True
            transformers.integrations.deepspeed.is_deepspeed_zero3_enabled = lambda: (
                True
            )

        return hf_ds_cfg

    def _load_model_from_config(self, model_loader_class=None) -> PreTrainedModel:
        """
        Load model with random initialization using from_config.

        Uses the selected loader when provided; otherwise falls back to the auto loader.
        """
        loader = model_loader_class or self.auto_model_loader
        if loader in [AutoModelForCausalLM, AutoModelForImageTextToText]:
            model = loader.from_config(
                config=self.model_config,
                trust_remote_code=self.cfg.trust_remote_code or False,
            )
        else:
            model = loader(config=self.model_config)

        return model

    def _load_model_from_pretrained(self, model_loader_class=None) -> PreTrainedModel:
        """Load model from pretrained weights."""
        loader = model_loader_class or self.auto_model_loader
        kwargs = {
            "config": self.model_config,
            "trust_remote_code": self.cfg.trust_remote_code or False,
            **self.model_kwargs,
        }
        return loader.from_pretrained(self.base_model, **kwargs)

    def _build_model(self) -> bool:
        """Load model, with load strategy depending on config."""
        skip_move_to_device = False

        if self.cfg.tensor_parallel_size > 1:
            self.model_kwargs["tp_size"] = self.cfg.tensor_parallel_size
            self.model_kwargs["tp_plan"] = "auto"
            self.model_kwargs["device_mesh"] = self.device_mesh
            if "device_map" in self.model_kwargs:
                del self.model_kwargs["device_map"]  # not compatible with `tp_plan`

        if self.is_fsdp_enabled:
            if self.cfg.fsdp_config.cpu_ram_efficient_loading:
                skip_move_to_device = True
                # Don't delete device_map for QLoRA + FSDP - it was set correctly in
                # _set_device_map
                if (
                    "device_map" in self.model_kwargs
                    and not self.is_qlora_and_fsdp_enabled
                ):
                    del self.model_kwargs["device_map"]
            elif self.is_qlora_and_fsdp_enabled:
                skip_move_to_device = True

            if (
                self.cfg.tensor_parallel_size <= 1
                and self.cfg.fsdp_config.cpu_ram_efficient_loading
                and self.cfg.fsdp_version == 2
            ):
                # setting device_map for TP is not supported
                local_rank = int(os.getenv("LOCAL_RANK", "0"))
                if local_rank == 0:
                    self.model_kwargs["device_map"] = "cpu"
                else:
                    self.model_kwargs["device_map"] = "meta"

        if (
            self.is_qlora_and_fsdp_enabled
            and self.cfg.fsdp_config.cpu_ram_efficient_loading
            and (
                self.cfg.model_config_type == "dbrx"
                or self.cfg.qlora_sharded_model_loading
            )
        ):
            if self.cfg.reinit_weights:
                LOG.warning(
                    "reinit_weights is not supported with sharded quantized loading. "
                    "Loading from pretrained weights instead."
                )
            quant_storage = self.cfg.torch_dtype
            quantization_config = getattr(
                self.model_config, "quantization_config", None
            )
            quantization_config = (
                quantization_config or self.model_kwargs["quantization_config"]
            )
            self.model = load_sharded_model_quant(
                self.base_model,
                self.model_config,
                self.cfg,
                quant_storage=quant_storage,
                quantization_config=quantization_config,
            )
            skip_move_to_device = True
        elif self.model_type == "MambaLMHeadModel":
            if self.cfg.reinit_weights:
                LOG.warning(
                    "reinit_weights is not supported with MambaLMHeadModel. "
                    "Loading from pretrained weights instead."
                )
            # FIXME this is janky at best and hacked together to make it work
            MambaLMHeadModel = fix_mamba_attn_for_loss()

            self.model_kwargs["dtype"] = self.model_kwargs["torch_dtype"]
            self.model_kwargs["device"] = torch.cuda.current_device()
            self.model_kwargs.pop("torch_dtype", None)
            self.model_kwargs.pop("device_map", None)

            self.model = MambaLMHeadModel.from_pretrained(
                self.base_model,
                **self.model_kwargs,
            )
        else:
            # Please don't remove underscore binding without reading the fn docstring
            _ = self._configure_zero3_memory_efficient_loading()

            if (
                self.model_type
                and self.model_type != "AutoModelForCausalLM"
                and not self.cfg.trust_remote_code
                and not self.cfg.gptq
            ):
                # Use model type from transformers
                model_loader_class = getattr(transformers, self.model_type)
            else:
                # Use auto model loader (handles gptq and default cases)
                model_loader_class = self.auto_model_loader

            self.model_kwargs["dtype"] = self.model_kwargs["torch_dtype"]
            if self.cfg.reinit_weights:
                self.model = self._load_model_from_config(model_loader_class)
            else:
                self.model = self._load_model_from_pretrained(model_loader_class)

        if is_deepspeed_zero3_enabled():
            skip_move_to_device = True

        if self.cfg.tensor_parallel_size > 1:
            # workaround for upstream 4.54.0 not setting _tp_size or _device_mesh
            # TODO(wing): remove once 4.54.1 is released
            if self.model._tp_size != self.cfg.tensor_parallel_size:
                self.model._tp_size = self.cfg.tensor_parallel_size
                self.model._device_mesh = self.model_kwargs["device_mesh"]

        if self.cfg.experimental_skip_move_to_device is not None:
            skip_move_to_device = self.cfg.experimental_skip_move_to_device

        return skip_move_to_device

    def _set_z3_leaf_modules(self):
        from deepspeed.utils import set_z3_leaf_modules

        moe_type = self.cfg.model_config_type_text or self.cfg.model_config_type
        if moe_type in MOE_ARCH_BLOCK:
            moe_blocks = MOE_ARCH_BLOCK[moe_type]
            moe_blocks = [moe_blocks] if isinstance(moe_blocks, str) else moe_blocks
            set_z3_leaf_modules(
                self.model,
                [
                    get_module_class_from_name(self.model, module_name)
                    for module_name in moe_blocks
                ],
            )

    def _prepare_model_for_quantization(self):
        """Prepare loaded model for quantization."""
        skip_prepare_model_for_kbit_training = False
        if self.cfg.model_config_type == "qwen" and self.cfg.adapter == "lora":
            # Qwen doesn't play nicely with LoRA if this is enabled
            skip_prepare_model_for_kbit_training = True

        loftq_bits = (
            self.cfg.peft
            and self.cfg.peft.loftq_config
            and self.cfg.peft.loftq_config.loftq_bits
        )
        if self.cfg.adapter == "lora" and loftq_bits:
            skip_prepare_model_for_kbit_training = True

        if (
            self.is_qlora_and_fsdp_enabled
            or (self.is_fsdp_enabled and self.cfg.fsdp_config.cpu_ram_efficient_loading)
            or is_deepspeed_zero3_enabled()
        ):
            # Make sure everything is in the same dtype
            skip_prepare_model_for_kbit_training = True

        if getattr(self.model, "_moe_experts_quantized", False):
            # Parametrized expert tensors dequantize on access — would OOM.
            skip_prepare_model_for_kbit_training = True

        if (
            not skip_prepare_model_for_kbit_training
            and self.cfg.adapter in ["lora", "qlora"]
            and (self.cfg.load_in_8bit or self.cfg.load_in_4bit)
        ):
            LOG.info("converting PEFT model w/ prepare_model_for_kbit_training")
            self.model = prepare_model_for_kbit_training(
                self.model, use_gradient_checkpointing=self.cfg.gradient_checkpointing
            )

    def _convert_embedding_modules_dtype(
        self,
        embedding_modules: list[str],
        dist_dtype: torch.dtype,
        before_kbit_train_or_finetune: bool,
    ):
        dest = {"dtype": dist_dtype}
        if self.cfg.lora_on_cpu:
            dest["device"] = "cpu"
        for name, module in self.model.named_modules():
            if "norm" in name:
                module.to(dist_dtype)
            if before_kbit_train_or_finetune:
                if name.endswith(".gate"):
                    module.to(dist_dtype)
                if self.model_config.model_type == "btlm":
                    # don't upcast lm_head for btlm
                    continue
            if any(m in name for m in embedding_modules) and hasattr(module, "weight"):
                module.to(**dest)


================================================
FILE: src/axolotl/loaders/patch_manager.py
================================================
"""Patch manager class implementation to complement `axolotl.loaders.ModelLoader`.

Applies pre- and post-model load patches for various fixes and optimizations.
"""

import importlib.util
import os
from functools import cached_property

import addict
import transformers
from transformers import PretrainedConfig, PreTrainedModel
from transformers.modeling_flash_attention_utils import is_flash_attn_available

from axolotl.integrations.base import PluginManager
from axolotl.monkeypatch.multipack import (
    SUPPORTED_MULTIPACK_MODEL_TYPES,
    patch_for_multipack,
)
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)
PLUGIN_MANAGER = PluginManager.get_instance()


class PatchManager:
    """Manages the application of patches during the model loading process."""

    @staticmethod
    def apply_pre_config_load_patches(cfg: DictDefault):
        """
        Apply patches that must be set up before config loading.
        This is for patches that intercept remote code loading from HuggingFace,
        which needs to be in place before AutoConfig.from_pretrained() is called.

        Args:
            cfg: Configuration dictionary with model and training settings.
        """
        if (
            hasattr(cfg, "base_model_config")
            and cfg.base_model_config
            and "kimi-linear" in cfg.base_model_config.lower()
        ):
            from axolotl.monkeypatch.models.kimi_linear.patch_kimi_linear import (
                patch_kimi_config,
            )

            patch_kimi_config()

    @staticmethod
    def apply_pre_tokenizer_load_patches(cfg: DictDefault):
        """
        Apply patches that must be set up before tokenizer loading.
        This is for patches that intercept remote code loading from HuggingFace,
        which needs to be in place before AutoTokenizer.from_pretrained() is called.

        Args:
            cfg: Configuration dictionary with model and training settings.
        """
        if (
            hasattr(cfg, "tokenizer_config")
            and cfg.tokenizer_config
            and "kimi-linear" in cfg.tokenizer_config.lower()
        ):
            from axolotl.monkeypatch.models.kimi_linear.patch_kimi_linear import (
                patch_kimi_tokenizer,
            )

            patch_kimi_tokenizer()

    def __init__(
        self,
        cfg: DictDefault,
        model_config: PretrainedConfig | addict.Dict,
        inference: bool = False,
    ):
        """Initialize the `PatchManager`.

        Args:
            cfg: Configuration dictionary with model and training settings.
            model_config: Configuration object for the model.
            inference: Whether the model is being loaded for inference mode.
        """
        self.cfg = cfg
        self.model_config = model_config
        self.inference = inference

    @cached_property
    def has_flash_attn(self) -> bool:
        """Check if flash attention is installed."""
        return importlib.util.find_spec("flash_attn") is not None

    def apply_pre_model_load_patches(self):
        """Apply pre-model load patches based on config."""
        self._deactivate_hf_async_load()
        self._apply_transformers_patches()
        # self._apply_flex_attention_patches()
        self._apply_flash_attention_patches()
        self._apply_chunked_cross_entropy_patch()
        self._apply_sageattn_patches()
        self._apply_flash_attn_4_patches()
        self._apply_fsdp_patches()
        self._apply_adapter_patches()
        self._apply_model_specific_patches()
        self._apply_fp8_patches()
        self._apply_flash_attention_peft_patches()
        self._apply_gradient_checkpointing_patches()
        self._patch_attention()
        self._apply_multipack_patches()
        self._patch_loss_llama()
        self._patch_llama_derived_model()
        self._apply_mistral_cross_entropy_patch()
        self._apply_self_attention_lora_patch()
        self._apply_fsdp2_bnb_patches()
        self._apply_patch_deepspeed_zero3()
        self._apply_voxtral_patches()
        self._apply_apertus_patches()
        self._apply_trl_vllm_patches()
        self._apply_trl_trainer_utils_patches()

    def apply_post_plugin_pre_model_load_patches(self):
        """Apply post plugin-pre_model_load load patches based on config."""
        self._apply_tiled_mlp(self.cfg.model_config_type)
        self._apply_moe_expert_quantization_patch()

    def _apply_transformers_patches(self):
        from axolotl.monkeypatch.transformers.trainer_loss_calc import (
            patch_evaluation_loop,
            patch_maybe_log_save_evaluate,
        )

        patch_evaluation_loop()
        patch_maybe_log_save_evaluate()

        if self.cfg.context_parallel_size > 1:
            from axolotl.monkeypatch.transformers.trainer_context_parallel import (
                patch_prepare_context_parallel_inputs,
            )

            patch_prepare_context_parallel_inputs()

    def apply_post_model_build_patches(self, model: PreTrainedModel):
        """Apply patches right after model build, before post-load setup."""
        self._finalize_moe_expert_quantization(model)

    def apply_post_model_load_patches(self, model: PreTrainedModel):
        """Apply patches that require the model instance."""
        self._apply_llama_flash_attn_patches(model)
        self._apply_unsloth_patches(model)
        self._apply_lora_kernel_patch(model)
        self._apply_scaling_softmax_patch(model)

    def _apply_flash_attention_patches(self):
        """Apply patches related to Flash Attention."""
        if self.cfg.xformers_attention and self.cfg.sample_packing:
            from axolotl.monkeypatch.attention import patch_xformers_attn_over_fa2

            patch_xformers_attn_over_fa2()
            self.cfg.flash_attention = True

    def _apply_chunked_cross_entropy_patch(self):
        if self.cfg.chunked_cross_entropy:
            from axolotl.monkeypatch.loss.chunked import patch_chunked_ce_loss_fn

            if self.cfg.chunked_cross_entropy_num_chunks:
                patch_chunked_ce_loss_fn(self.cfg.chunked_cross_entropy_num_chunks)
            else:
                patch_chunked_ce_loss_fn()

    def _apply_fsdp_patches(self):
        """Apply patches for FSDP configurations."""
        if self.cfg.fsdp_config:
            from axolotl.monkeypatch.accelerate.fsdp2 import (
                patch_initialize_missing_keys_for_fsdp,
            )

            patch_initialize_missing_keys_for_fsdp()

        if self.cfg.context_parallel_size > 1 or (
            self.cfg.fsdp_config and str(self.cfg.fsdp_version) == "2"
        ):
            from axolotl.monkeypatch.accelerate.parallelism_config import (
                patch_parallelism_config,
            )

            patch_parallelism_config()
        if self.cfg.fsdp_config and str(self.cfg.fsdp_version) == "2":
            from axolotl.monkeypatch.accelerate.fsdp2 import (
                patch_accelerate_fsdp2,
                patch_tied_keys_for_meta_device,
            )

            patch_accelerate_fsdp2()
            if self.cfg.fsdp_config.cpu_ram_efficient_loading:
                patch_tied_keys_for_meta_device()
            if self.cfg.rl:
                from axolotl.monkeypatch.trainer.trl import patch_trl_prepare_fsdp2

                patch_trl_prepare_fsdp2()

        # if self.cfg.fsdp_config:
        #     # see transformers#39152
        #     from axolotl.monkeypatch.trainer_fsdp_optim import (
        #         patch_training_loop_for_fsdp,
        #     )
        #
        #     patch_training_loop_for_fsdp()

    def _apply_adapter_patches(self):
        """Apply patches for adapter configurations."""
        if self.cfg.adapter and self.cfg.embeddings_skip_upcast:
            from axolotl.monkeypatch.peft.utils import patch_peft_prep_code

            patch_peft_prep_code()

    def _apply_flex_attention_patches(self):
        """Apply patches for flexible attention."""
        if self.cfg.flex_attention:
            from axolotl.monkeypatch.attention.flex_attn import (
                patch_flex_wrapper,
            )

            flex_attn_compile_kwargs = self.cfg.flex_attn_compile_kwargs or {}
            patch_flex_wrapper(**flex_attn_compile_kwargs)

    def _apply_sageattn_patches(self):
        """Apply patches for SageAttention."""
        if self.cfg.sage_attention:
            from axolotl.monkeypatch.attention.sage_attn import patch_sageattn

            patch_sageattn()

    def _apply_flash_attn_4_patches(self):
        """Auto-apply FA4 when flash_attention is enabled and FA4 is available on SM90+."""
        if not self.cfg.flash_attention:
            return

        from axolotl.monkeypatch.attention.flash_attn_4 import patch_flash_attn_4

        patch_flash_attn_4(self.model_config)

    def _apply_model_specific_patches(self):
        """Apply patches specific to model architectures."""
        if (
            self.cfg.model_config_type == "llama4"
            and self.cfg.llama4_linearized_experts
        ):
            from axolotl.monkeypatch.models.llama4.modeling import (
                patch_llama4_linearized_modeling,
            )

            patch_llama4_linearized_modeling()

        if self.cfg.model_config_type == "qwen3_next" and self.cfg.sample_packing:
            from axolotl.monkeypatch.models.qwen3_next.modeling import (
                patch_qwen3_next_modeling_packing,
            )

            patch_qwen3_next_modeling_packing()

        if self.cfg.model_config_type == "qwen3_5" and self.cfg.sample_packing:
            from axolotl.monkeypatch.models.qwen3_5.modeling import (
                patch_qwen3_5_modeling_packing,
            )

            patch_qwen3_5_modeling_packing()

        if self.cfg.model_config_type == "qwen3_5_moe" and self.cfg.sample_packing:
            from axolotl.monkeypatch.models.qwen3_5.modeling import (
                patch_qwen3_5_moe_modeling_packing,
            )

            patch_qwen3_5_moe_modeling_packing()

        if (
            self.cfg.model_config_type in ["qwen3_5", "qwen3_5_moe"]
            and self.cfg.is_multimodal
            and self.cfg.flash_attention
        ):
            from axolotl.monkeypatch.models.qwen3_5.modeling import (
                patch_qwen3_5_vlm_flash_attention,
            )

            patch_qwen3_5_vlm_flash_attention()

        if self.cfg.model_config_type == "kimi_linear":
            from axolotl.monkeypatch.models.kimi_linear.patch_kimi_linear import (
                patch_kimi_model,
            )

            patch_kimi_model()

    def _apply_fp8_patches(self):
        """Apply patches for FP8 support."""
        if self.cfg.fp8:
            from axolotl.monkeypatch.trainer_accelerator_args import (
                patch_create_accelerate_code_for_fp8,
            )

            patch_create_accelerate_code_for_fp8(
                self.cfg.fp8_enable_fsdp_float8_all_gather
            )

    def _apply_flash_attention_peft_patches(self):
        """Apply patches for Flash Attention with PEFT."""
        if self.cfg.adapter:
            from axolotl.monkeypatch.transformers_fa_utils import (
                patch_fa_peft_integration,
            )

            patch_fa_peft_integration()

    def _apply_gradient_checkpointing_patches(self):
        """Apply patches for gradient checkpointing."""
        if (
            self.cfg.gradient_checkpointing
            and self.cfg.activation_offloading == "legacy"
        ):
            from axolotl.monkeypatch.gradient_checkpointing import (
                hf_grad_checkpoint_offload_wrapper,
            )

            transformers.modeling_utils.checkpoint = hf_grad_checkpoint_offload_wrapper
        elif (
            self.cfg.gradient_checkpointing
            and self.cfg.activation_offloading == "offload_disk"
        ):
            from axolotl.monkeypatch.gradient_checkpointing import (
                hf_grad_checkpoint_disk_offload_wrapper,
            )

            transformers.modeling_utils.checkpoint = (
                hf_grad_checkpoint_disk_offload_wrapper
            )

    def _apply_mistral_cross_entropy_patch(self):
        """Apply Mistral cross entropy patch if configured."""
        if (
            self.cfg.model_config_type == "mistral"
            and self.cfg.flash_attn_cross_entropy_loss
        ):
            from axolotl.monkeypatch.mistral_attn_hijack_flash import (
                patch_mistral_cross_entropy,
            )

            patch_mistral_cross_entropy()

    def _apply_self_attention_lora_patch(self):
        """Apply self-attention LoRA patches if configured."""
        if self.cfg.lora_qkv_kernel or self.cfg.lora_o_kernel:
            # Only patch if conditions are met
            can_patch = (
                self.cfg.lora_dropout == 0
                if hasattr(self.cfg, "lora_dropout")
                else True
            )  # default to True if lora_dropout is not set

            if not can_patch:
                LOG.warning("Cannot patch self-attention - requires no dropout")
                return

            from axolotl.monkeypatch.lora_kernels import patch_self_attn_lora

            patch_self_attn_lora(self.cfg)

    def _apply_multipack_patches(self):
        """Apply multipack patches if necessary."""
        if (
            self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES
            and (self.cfg.flash_attention or self.cfg.flex_attention)
            and self.cfg.sample_packing
        ):
            # Get automap config if it exists
            auto_map_config = None
            if isinstance(self.model_config, dict) and "auto_map" in self.model_config:
                auto_map_config = self.model_config["auto_map"]
            elif hasattr(self.model_config, "auto_map"):
                auto_map_config = self.model_config.auto_map

            # Determine if the model has remote code
            if auto_map_config is not None:
                has_remote_code = "AutoModelForCausalLM" in auto_map_config
            else:
                has_remote_code = False

            if has_remote_code and self.cfg.trust_remote_code is not None:
                # If explicitly set in YAML, prefer that
                has_remote_code = self.cfg.trust_remote_code

            patch_for_multipack(
                self.cfg.model_config_type,
                model_name=self.cfg.base_model,
                has_remote_code=has_remote_code,
            )

        if self.cfg.sample_packing:
            from axolotl.monkeypatch.data.batch_dataset_fetcher import (
                apply_multipack_dataloader_patch,
            )

            LOG.info("Applying multipack dataloader patch for sample packing...")
            apply_multipack_dataloader_patch()

    def _apply_fsdp2_bnb_patches(self):
        """Apply FSDP2 BNB patches."""
        if (
            self.cfg.fsdp_config
            and str(self.cfg.fsdp_version) == "2"
            and (self.cfg.load_in_4bit or self.cfg.load_in_8bit)
        ):
            from axolotl.monkeypatch.fsdp2_qlora import (
                apply_init_dtype_attrs_patch,
                apply_init_sharded_param_patch,
                apply_init_unsharded_param_patch,
                apply_linear8bitlt_save_patch,
            )

            apply_init_sharded_param_patch()
            apply_init_unsharded_param_patch()
            apply_init_dtype_attrs_patch()
            if self.cfg.load_in_8bit:
                apply_linear8bitlt_save_patch()

    def _deactivate_hf_async_load(self):
        """Load weights synchronously so they can be converted and not OOM."""
        if self.cfg.load_in_4bit or self.cfg.load_in_8bit:
            os.environ["HF_DEACTIVATE_ASYNC_LOAD"] = "1"

    def _apply_moe_expert_quantization_patch(self):
        """Patch transformers weight loading and PEFT for MoE expert quantization."""
        has_target_params = bool(getattr(self.cfg, "lora_target_parameters", None))

        if not self.cfg.quantize_moe_experts and not has_target_params:
            return

        from axolotl.monkeypatch.moe_quant import (
            patch_peft_target_parameters_matching,
        )

        if self.cfg.quantize_moe_experts:
            from axolotl.monkeypatch.moe_quant import patch_moe_quantization_on_load

            patch_moe_quantization_on_load(self.cfg)

        patch_peft_target_parameters_matching()

    def _finalize_moe_expert_quantization(self, model: PreTrainedModel):
        """Log quantization results and set model flag for downstream use."""
        import torch

        model._moe_experts_quantized = False
        if self.cfg.quantize_moe_experts:
            from axolotl.monkeypatch.moe_quant import get_moe_quantized_count

            count = get_moe_quantized_count()
            if count > 0:
                import gc

                model._moe_experts_quantized = True
                LOG.info(
                    "Quantized %d MoE expert parameter(s) to %s during model loading",
                    count,
                    "4-bit" if self.cfg.load_in_4bit else "8-bit",
                )
                gc.collect()
                torch.cuda.empty_cache()

    def _apply_tiled_mlp(self, model_type: str):
        if self.cfg.tiled_mlp:
            from axolotl.monkeypatch.tiled_mlp import (
                patch_tiled_mlp,
            )

            patch_tiled_mlp(
                model_type,
                use_original_mlp=self.cfg.tiled_mlp_use_original_mlp,
                cfg_num_shards=self.cfg.tiled_mlp_num_shards,
            )

    def _apply_voxtral_patches(self):
        """Apply patches for Voxtral model."""
        if self.cfg.model_config_type == "voxtral":
            from axolotl.monkeypatch.models.voxtral.modeling import (
                patch_voxtral_conditional_generation_forward,
            )

            patch_voxtral_conditional_generation_forward()

    def _patch_attention(self):
        """Apply attention-specific patches based on model type."""
        if not (self.cfg.flash_attention and hasattr(self.model_config, "model_type")):
            return

        if self.model_config.model_type == "btlm":
            from axolotl.monkeypatch.btlm_attn_hijack_flash import (
                replace_btlm_attn_with_flash_attn,
            )

            replace_btlm_attn_with_flash_attn(self.cfg.base_model)

        if self.model_config.model_type == "stablelm_epoch" and self.cfg.sample_packing:
            from axolotl.monkeypatch.stablelm_attn_hijack_flash import (
                replace_stablelm_attn_with_flash_attn,
            )

            replace_stablelm_attn_with_flash_attn(self.cfg.base_model)

        if self.model_config.model_type in ("mistral3", "llava"):
            from axolotl.monkeypatch.models.pixtral.modeling_flash_attention_utils import (
                apply_patch_is_packed_sequence,
            )

            apply_patch_is_packed_sequence()

    def _patch_loss_llama(self):
        """Patch loss functions and other optimizations for LLaMA models."""
        if not self.cfg.is_llama_derived_model:
            return

        if self.cfg.flash_attn_cross_entropy and self.has_flash_attn:
            from axolotl.monkeypatch.llama_attn_hijack_flash import (
                patch_fa_llama_cross_entropy,
            )

            patch_fa_llama_cross_entropy()
        elif self.cfg.unsloth_cross_entropy_loss:
            from axolotl.monkeypatch.unsloth_ import integrate_cross_entropy_loss_patch

            integrate_cross_entropy_loss_patch(model_type="llama")

        if self.cfg.flash_attn_rms_norm and self.has_flash_attn:
            from axolotl.monkeypatch.llama_attn_hijack_flash import patch_llama_rms_norm

            patch_llama_rms_norm()
        elif self.cfg.unsloth_rms_norm:
            from axolotl.monkeypatch.unsloth_ import patch_unsloth_layernorm

            patch_unsloth_layernorm()

        if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o:
            from axolotl.monkeypatch.unsloth_ import patch_self_attn_lora

            patch_self_attn_lora()

    def _patch_llama_flash_attention(self):
        """Apply Flash Attention patches for LLaMA models."""
        from axolotl.monkeypatch.llama_attn_hijack_flash import (
            replace_llama_attn_with_flash_attn,
        )

        if self.cfg.s2_attention:
            LOG.info("patching w/ flash-enabled, shifted-sparse attention")
            replace_llama_attn_with_flash_attn(
                cross_entropy=self.cfg.flash_attn_cross_entropy,
                rms_norm=self.cfg.flash_attn_rms_norm,
                use_shifted_sparse_attn=True,
            )
        elif self.cfg.flash_attn_cross_entropy or self.cfg.flash_attn_rms_norm:
            replace_llama_attn_with_flash_attn(
                cross_entropy=self.cfg.flash_attn_cross_entropy,
                rms_norm=self.cfg.flash_attn_rms_norm,
            )

    def _patch_llama_xformers_attention(self):
        """Apply xformers attention patches for LLaMA models."""
        from axolotl.monkeypatch.llama_attn_hijack_xformers import (
            hijack_llama_attention,
        )

        LOG.info("Patching with xformers attention...")
        hijack_llama_attention()

    def _patch_llama_derived_model(self):
        """Modify all llama derived models in one block."""
        if self.cfg.is_llama_derived_model and not (
            self.cfg.model_config_type in SUPPORTED_MULTIPACK_MODEL_TYPES
            and (self.cfg.flash_attention or self.cfg.flex_attention)
            and self.cfg.sample_packing
        ):
            if self.cfg.flash_attention:
                self._patch_llama_flash_attention()
            elif self.cfg.xformers_attention:
                self._patch_llama_xformers_attention()
            elif self.cfg.s2_attention:
                raise NotImplementedError(
                    "Shifted-sparse attention not currently implemented without flash attention."
                )

    def _apply_llama_flash_attn_patches(self, model):
        """Apply LLaMA-specific flash attention patches."""
        if (
            self.model_config.model_type in ["llama", "llama4"]
            and not self.cfg.trust_remote_code
            and not self.cfg.gptq
            and self.cfg.flash_attention
            and is_flash_attn_available()
            and not self.inference
        ):
            # TODO(MengqingCao): split these patches separately
            from axolotl.monkeypatch.llama_attn_hijack_flash import (
                is_xformers_swiglu_available,
                replace_llama_mlp_with_swiglu,
            )

            if self.cfg.flash_attn_fuse_mlp and is_xformers_swiglu_available():
                LOG.info("Patching with SwiGLU...")
                replace_llama_mlp_with_swiglu(model)

    def _apply_unsloth_patches(self, model):
        """Apply unsloth optimization patches."""
        if self.cfg.unsloth_lora_mlp:
            from axolotl.monkeypatch.unsloth_ import integrate_lora_mlp_patch

            integrate_lora_mlp_patch(peft_model=model)

        if self.cfg.unsloth_lora_qkv or self.cfg.unsloth_lora_o:
            from axolotl.monkeypatch.unsloth_ import integrate_lora_patch

            integrate_lora_patch(peft_model=model, cfg=self.cfg)

        if self.cfg.unsloth_rope:
            from axolotl.monkeypatch.unsloth_ import integrate_rope_embeddings

            integrate_rope_embeddings()

    def _apply_lora_kernel_patch(self, model):
        """Apply LoRA kernel patches."""
        if (
            self.cfg.lora_mlp_kernel
            or self.cfg.lora_qkv_kernel
            or self.cfg.lora_o_kernel
        ):
            from axolotl.monkeypatch.lora_kernels import apply_lora_kernel_patches

            apply_lora_kernel_patches(model=model, cfg=self.cfg)

    def _apply_patch_deepspeed_zero3(self):
        try:
            from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled

            from axolotl.monkeypatch.deepspeed_utils import apply_deepspeed_patches

            if self.cfg.activation_offloading is True and (
                is_deepspeed_zero3_enabled()
                or os.getenv("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3"
            ):
                apply_deepspeed_patches()
        except ImportError as e:
            LOG.warning(f"DeepSpeed patches not applied: {e}")

    def _apply_apertus_patches(self):
        """Apply patches for Apertus model."""
        if self.cfg.model_config_type == "apertus":
            from axolotl.monkeypatch.models.apertus.activation import (
                patch_apertus_xielu_activation,
            )

            patch_apertus_xielu_activation()

    def _apply_trl_vllm_patches(self):
        """Apply TRL vLLM patches for batched weight sync, NaN logprobs fix, and scalar handling."""
        if (
            self.cfg.rl
            and getattr(self.cfg, "trl", None)
            and getattr(self.cfg.trl, "use_vllm", False)
        ):
            from axolotl.monkeypatch.trainer.trl_vllm import patch_trl_vllm

            patch_trl_vllm()

    def _apply_trl_trainer_utils_patches(self):
        """Replace trl.trainer.utils.{selective_log_softmax, entropy_from_logits} with Triton kernels."""
        if not self.cfg.rl:
            return

        try:
            from axolotl.monkeypatch.trainer.utils import (
                entropy_from_logits,
                selective_log_softmax,
            )
        except (ImportError, ModuleNotFoundError):
            LOG.warning("Triton not available — skipping trl.trainer.utils patches")
            return

        import trl.trainer.utils

        # Guard against repeated calls: only stash the original if trl still
        # points at its own implementation (not our wrapper).
        if trl.trainer.utils.selective_log_softmax is not selective_log_softmax:
            from axolotl.monkeypatch.trainer import utils as _axolotl_trainer_utils

            _axolotl_trainer_utils.selective_log_softmax_original = (
                trl.trainer.utils.selective_log_softmax
            )
            trl.trainer.utils.selective_log_softmax = selective_log_softmax

        if trl.trainer.utils.entropy_from_logits is not entropy_from_logits:
            trl.trainer.utils.entropy_from_logits = entropy_from_logits

        LOG.info(
            "Patched trl.trainer.utils with Triton selective_log_softmax and entropy_from_logits"
        )

    def _apply_scaling_softmax_patch(self, model: PreTrainedModel):
        """Apply Scaling Softmax (SSMax) patch.  Ref: https://arxiv.org/abs/2501.19399"""
        if self.cfg.scaling_softmax:
            from axolotl.monkeypatch.scaled_softmax_attn import (
                patch_scaled_softmax_attention,
            )

            patch_scaled_softmax_attention(
                scaling_factor_init=self.cfg.scaling_softmax_factor or 0.43,
                bias=self.cfg.scaling_softmax_bias or 0.0,
                model=model,
            )


================================================
FILE: src/axolotl/loaders/processor.py
================================================
"""Processor loading functionality for multi-modal models"""

import transformers
from transformers import (
    AutoProcessor,
    PreTrainedTokenizerBase,
)

from axolotl.telemetry.errors import send_errors
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


@send_errors
def load_processor(cfg: DictDefault, tokenizer: PreTrainedTokenizerBase):
    processor_cls = AutoProcessor
    if cfg.processor_type:
        processor_cls = getattr(transformers, cfg.processor_type)

    # Build common kwargs for processor loading
    processor_kwargs = {}
    if cfg.revision_of_model:
        processor_kwargs["revision"] = cfg.revision_of_model

    if cfg.tokenizer_use_mistral_common:

        def _patch_mistralcommontokenizer():
            """
            Transformers v5 stops reading the sub-processor.

            We need to patch this, so both processors use this.
            """
            import transformers.tokenization_mistral_common as tokenization_mistral_common

            from axolotl.utils.mistral import HFMistralTokenizer

            tokenization_mistral_common.MistralCommonBackend = HFMistralTokenizer

        _patch_mistralcommontokenizer()

        from transformers import VoxtralProcessor

        if processor_cls == VoxtralProcessor:
            return VoxtralProcessor.from_pretrained(
                cfg.processor_config,
                **processor_kwargs,
            )

        from axolotl.utils.mistral import Mistral3Processor

        return Mistral3Processor(
            tokenizer=tokenizer,
        )

    processor_kwargs["trust_remote_code"] = cfg.trust_remote_code or False

    processor = processor_cls.from_pretrained(
        cfg.processor_config,
        **processor_kwargs,
    )
    processor.tokenizer = tokenizer

    # Attempt to load image size from processor if available
    if (
        cfg.image_size is None
        and hasattr(processor, "size")
        and any(dim in processor.size for dim in ["width", "height"])
    ):
        im_width = None
        im_height = None
        if "width" in processor.size:
            im_width = processor.size["width"]
        if "height" in processor.size:
            im_height = processor.size["height"]

        # If both width and height are set, use a tuple
        if im_width is not None and im_height is not None:
            cfg.image_size = (im_width, im_height)
        # If only width is set, use as integer
        elif im_width is not None:
            cfg.image_size = im_width
        # If only height is set, use as integer
        elif im_height is not None:
            cfg.image_size = im_height

        LOG.debug(f"Loaded image size: {cfg.image_size} from processor")

    return processor


================================================
FILE: src/axolotl/loaders/tokenizer.py
================================================
"""Tokenizer loading functionality and associated utils"""

import json
import os

import transformers
from transformers import (
    AddedToken,
    AutoTokenizer,
    PreTrainedTokenizer,
)

from axolotl.integrations.base import PluginManager
from axolotl.loaders.utils import get_linear_embedding_layers, load_model_config
from axolotl.prompt_tokenizers import LLAMA_DEFAULT_EOS_TOKEN
from axolotl.telemetry.errors import send_errors
from axolotl.utils.chat_templates import get_chat_template_from_config
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import (
    barrier,
    is_local_main_process,
    is_main_process,
)
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)
PLUGIN_MANAGER = PluginManager.get_instance()


def modify_tokenizer_files(
    tokenizer_path: str,
    token_mappings: dict[int, str],
    output_dir: str,
    revision: str = "main",
) -> str:
    """
    Modify tokenizer files to replace added_tokens strings, save to output directory,
    and return the path to the modified tokenizer.

    This only works with reserved tokens that were added to the tokenizer, not tokens
    already part of the vocab.

    Args:
        tokenizer_path: Path or name of the original tokenizer
        token_mappings: Dict mapping {token_id (int): new_token_string}
        output_dir: Directory to save the modified tokenizer
        revision: Model revision/branch/tag/commit to load from (HF Hub)

    Returns:
        Path to the modified tokenizer directory

    Ref: https://github.com/huggingface/transformers/issues/27974#issuecomment-1854188941
    """
    # Create the tokenizer directory in output_dir if it doesn't exist
    tokenizer_dir = os.path.join(output_dir, "tokenizer")
    os.makedirs(tokenizer_dir, exist_ok=True)

    if is_local_main_process():
        # Load the tokenizer
        temp_tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_path, use_fast=True, revision=revision
        )

        # Save the tokenizer to the output directory
        temp_tokenizer.save_pretrained(tokenizer_dir)

        # Get the token IDs and map them to their new values
        token_id_mappings = {
            int(token_id): new_value for token_id, new_value in token_mappings.items()
        }

        # 1. Update tokenizer_config.json - added_tokens_decoder
        config_path = os.path.join(tokenizer_dir, "tokenizer_config.json")
        if os.path.exists(config_path):
            with open(config_path, "r", encoding="utf-8") as f:
                config_data = json.load(f)

            # Update added_tokens_decoder
            if "added_tokens_decoder" in config_data:
                for token_id, new_value in token_id_mappings.items():
                    token_id_str = str(token_id)
                    if token_id_str in config_data["added_tokens_decoder"]:
                        config_data["added_tokens_decoder"][token_id_str]["content"] = (
                            new_value
                        )
                    else:
                        raise ValueError(
                            f"Token ID {token_id_str} not found in added_tokens_decoder"
                        )

            # Write the updated config back
            with open(config_path, "w", encoding="utf-8") as f:
                json.dump(config_data, f, indent=2)

        # 2. Update tokenizer.json - added_tokens
        tokenizer_path = os.path.join(tokenizer_dir, "tokenizer.json")
        if os.path.exists(tokenizer_path):
            with open(tokenizer_path, "r", encoding="utf-8") as f:
                tokenizer_data = json.load(f)

            # Update added_tokens
            if "added_tokens" in tokenizer_data:
                for token_id, new_value in token_id_mappings.items():
                    for i, token_entry in enumerate(tokenizer_data["added_tokens"]):
                        if token_entry["id"] == token_id:
                            tokenizer_data["added_tokens"][i]["content"] = new_value
                            break
                    else:
                        # Reaching this section means the token_id was not found in tokenizer.json added_tokens
                        raise ValueError(
                            f"Token ID {token_id} not found in added_tokens"
                        )
            if "model" in tokenizer_data and "vocab" in tokenizer_data["model"]:
                for token_id, new_value in token_id_mappings.items():
                    for entry_val, entry_id in tokenizer_data["model"]["vocab"].items():
                        if entry_id == token_id:
                            del tokenizer_data["model"]["vocab"][entry_val]
                            tokenizer_data["model"]["vocab"][new_value] = token_id
                            break

            # Write the updated tokenizer data back
            with open(tokenizer_path, "w", encoding="utf-8") as f:
                json.dump(tokenizer_data, f, indent=2)

    barrier()
    return tokenizer_dir


@send_errors
def load_tokenizer(cfg: DictDefault) -> PreTrainedTokenizer:
    """Load and configure the tokenizer based on the provided config."""

    # Apply patches that need to be in place before tokenizer loading
    from axolotl.loaders.patch_manager import PatchManager

    PatchManager.apply_pre_tokenizer_load_patches(cfg)

    def _load_mistral_common_tokenizer(cfg: DictDefault):
        """Load mistral-common tokenizer"""
        from axolotl.utils.mistral import HFMistralTokenizer

        # Load the HF-compatible wrapper around MistralTokenizer
        kwargs = {}
        if cfg.revision_of_model:
            kwargs["revision"] = cfg.revision_of_model
        tokenizer = HFMistralTokenizer.from_pretrained(cfg.tokenizer_config, **kwargs)

        return tokenizer

    if cfg.tokenizer_use_mistral_common:
        return _load_mistral_common_tokenizer(cfg)

    model_config = load_model_config(cfg)
    tokenizer_kwargs = {}
    use_fast = True  # this is the default

    if cfg.tokenizer_use_fast is not None:
        use_fast = cfg.tokenizer_use_fast
    if cfg.tokenizer_legacy is not None:
        # True is the default w/ https://github.com/huggingface/transformers/pull/25224
        tokenizer_kwargs["legacy"] = cfg.tokenizer_legacy
    if cfg.revision_of_model:
        tokenizer_kwargs["revision"] = cfg.revision_of_model

    tokenizer_cls = AutoTokenizer
    if cfg.tokenizer_type:
        tokenizer_cls = getattr(transformers, cfg.tokenizer_type)

    # Set base tokenizer path
    tokenizer_path = cfg.tokenizer_config

    # Apply token string overrides if specified
    if cfg.added_tokens_overrides:
        # Modify tokenizer files and get path to modified tokenizer
        modify_kwargs = {"output_dir": cfg.output_dir}
        if cfg.revision_of_model:
            modify_kwargs["revision"] = cfg.revision_of_model
        tokenizer_path = modify_tokenizer_files(
            tokenizer_path, cfg.added_tokens_overrides, **modify_kwargs
        )

    tokenizer = tokenizer_cls.from_pretrained(
        tokenizer_path,
        trust_remote_code=cfg.trust_remote_code or False,
        use_fast=use_fast,
        **tokenizer_kwargs,
    )

    if (
        tokenizer.__class__.__name__
        in [
            "LlamaTokenizer",
            "LlamaTokenizerFast",
            "CodeLlamaTokenizer",
            "CodeLlamaTokenizerFast",
        ]
        and hasattr(tokenizer, "pad_token")
        and not tokenizer.pad_token
    ):
        # set a pad_token, but use eos_token so we don't add a new token
        tokenizer.pad_token = LLAMA_DEFAULT_EOS_TOKEN

    if tokenizer.__class__.__name__ == "GPTNeoXTokenizerFast":
        tokenizer.add_special_tokens({"pad_token": "[PAD]"})  # nosec B105
        os.environ["TOKENIZERS_PARALLELISM"] = "false"

    # Mistral's official FA implementation requires left padding
    if cfg.is_mistral_derived_model and cfg.flash_attention and not cfg.sample_packing:
        tokenizer.padding_side = "left"

    # Qwen base only has single token, so we need to set the special tokens
    # the following check is for Qwen1 base models
    if cfg.is_qwen_derived_model and hasattr(tokenizer, "eod_id"):
        token_ids = ["bos_token_id", "eos_token_id", "pad_token_id", "unk_token_id"]
        for attr_name in token_ids:
            if getattr(tokenizer, attr_name) is None:
                setattr(tokenizer, attr_name, tokenizer.eod_id)

        token_names = ["bos_token", "eos_token", "pad_token", "unk_token"]
        for attr_name in token_names:
            if getattr(tokenizer, attr_name) is None:
                setattr(tokenizer, attr_name, "<|endoftext|>")

    additional_special_tokens = None
    if cfg.special_tokens:
        special_tokens = cfg.special_tokens.to_dict()
        additional_special_tokens = special_tokens.pop(
            "additional_special_tokens", None
        )
        lora_modules_to_save = get_linear_embedding_layers(model_config.model_type)
        for k, val in special_tokens.items():
            # check if new special token is not already in tokenizer and
            # is adapter training to make sure lora_modules_to_save is set

            if (
                (getattr(tokenizer, k) is None or getattr(tokenizer, k) != val)
                and (len(tokenizer.encode(val, add_special_tokens=False)) > 2)
                and cfg.adapter
                and (
                    not cfg.lora_modules_to_save
                    or not all(
                        x in cfg.lora_modules_to_save for x in lora_modules_to_save
                    )
                )
                and k != "pad_token"
            ):
                lora_modules_to_save_str = ", ".join(
                    [f"`{x}`" for x in lora_modules_to_save]
                )
                raise ValueError(
                    f"Please set lora_modules_to_save to [{lora_modules_to_save_str}] "
                    "when using an adapter and changing the special tokens."
                )

            tokenizer.add_special_tokens(
                {k: AddedToken(val, rstrip=False, lstrip=False, normalized=False)}
            )

        # If we add bos_token and eos_token, we need to update the post processor to
        # handle them correctly.
        # https://github.com/huggingface/transformers/pull/24132
        bos_or_eos_in_special_tokens = (
            "bos_token" in cfg.special_tokens and "eos_token" in cfg.special_tokens
        )
        if (
            tokenizer.__class__.__name__
            in (
                "LlamaTokenizerFast",
                "CodeLlamaTokenizerFast",
            )
            and bos_or_eos_in_special_tokens
        ):
            tokenizer.update_post_processor()

    if cfg.tokens:
        tokenizer.add_tokens(
            [
                AddedToken(token, rstrip=False, lstrip=False, normalized=False)
                for token in cfg.tokens
            ]
        )

    # Additional special tokens are a List, and need to be treated differently than regular special
    # tokens. We add them after we have called `add_tokens` in case these additional special tokens
    # are new tokens.
    #
    # Usage:
    #
    # ```py
    # special_tokens:
    #   additional_special_tokens: ["<|im_start|>", "<|im_end|>"]
    # ```
    if additional_special_tokens is not None:
        tokenizer.add_special_tokens(
            {"additional_special_tokens": additional_special_tokens}
        )

    if is_main_process():
        LOG.debug(f"EOS: {tokenizer.eos_token_id} / {tokenizer.eos_token}")
        LOG.debug(f"BOS: {tokenizer.bos_token_id} / {tokenizer.bos_token}")
        LOG.debug(f"PAD: {tokenizer.pad_token_id} / {tokenizer.pad_token}")
        LOG.debug(f"UNK: {tokenizer.unk_token_id} / {tokenizer.unk_token}")

    if cfg.chat_template:
        chat_template_string = get_chat_template_from_config(
            cfg=cfg,
            tokenizer=tokenizer,
        )
        if cfg.default_system_message and cfg.chat_template == "chatml":
            chat_template_string = chat_template_string.replace(
                "You are a helpful assistant.", cfg.default_system_message
            )

        tokenizer.chat_template = chat_template_string
    elif getattr(tokenizer, "chat_template", None) is None:
        LOG.info(
            "No Chat template selected. Consider adding a chat template for easier inference."
        )

    # make the tokenizer.pad call quieter 🤐
    if hasattr(tokenizer, "deprecation_warnings"):
        tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True

    return tokenizer


================================================
FILE: src/axolotl/loaders/utils.py
================================================
"""Utilities for axolotl.loaders module"""

import contextlib
from typing import Type

import addict
import torch
import transformers
from transformers import AutoConfig, PretrainedConfig, PreTrainedModel

from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def get_module_class_from_name(
    module: torch.nn.Module, name: str
) -> Type[torch.nn.Module] | None:
    """Gets a class from a module by its name. Copied from `accelerate.utils.dataclasses`
    (https://github.com/huggingface/accelerate/blob/main/src/accelerate/utils/dataclasses.py#L2805).

    Args:
        module: The module to get the class from.
        name: The name of the class.

    Returns:
        The class type of the matching module, or `None` if no match is found.
    """
    modules_children = list(module.children())
    if module.__class__.__name__ == name:
        return module.__class__

    if len(modules_children) == 0:
        return None

    for child_module in modules_children:
        module_class = get_module_class_from_name(child_module, name)
        if module_class is not None:
            return module_class

    return None


def check_model_config(cfg: DictDefault, model_config: PretrainedConfig):
    """Validates and adjusts model config based on `axolotl` config.

    This function performs several important checks and adjustments:
        - Disables model caching for better memory efficiency
        - Handles multimodal model-specific configurations
        - Validates quantization settings
        - Ensures proper LoRA configuration when using adapters with new tokens

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        model_config: The model's configuration object from `transformers`.

    Raises:
        ValueError: If a multimodal model lacks text configuration, if GPTQ settings
            are inconsistent, or if LoRA `modules_to_save` is improperly configured
            with new tokens.
    """
    if hasattr(model_config, "use_cache"):
        model_config.use_cache = False

    if cfg.is_multimodal:
        # For multimodal configs, use_cache is set in the text_config
        if hasattr(model_config, "get_text_config"):
            text_config = model_config.get_text_config()
            if hasattr(text_config, "use_cache"):
                text_config.use_cache = False
        else:
            raise ValueError(
                "No text config found for multimodal model. Please raise an Issue with model details."
            )

        # Check if image_size is not set and load image size from model config if available
        if (
            cfg.image_size is None
            and hasattr(model_config, "vision_config")
            and hasattr(model_config.vision_config, "image_size")
        ):
            image_size = model_config.vision_config.image_size
            if isinstance(image_size, list):
                cfg.image_size = tuple(image_size)
            else:
                cfg.image_size = image_size
            LOG.debug(f"Loaded image size: {cfg.image_size} from model config")

    quant_config_exists = (
        hasattr(model_config, "quantization_config")
        and model_config.quantization_config
    )

    # Detect compressed-tensors config
    is_compressed_tensors_config = (
        quant_config_exists
        and model_config.quantization_config.get("quant_method") == "compressed-tensors"
    )

    if is_compressed_tensors_config:
        if model_config.quantization_config.get("config_groups"):
            LOG.warning(
                "Found `config_groups` in a compressed-tensors config. "
                "QAT integration with llmcompressor is not tested."
            )
        # Skip further quant checks for compressed-tensors
        return

    quant_config_method_is_gptq = (
        quant_config_exists
        and "quant_method" in model_config.quantization_config
        and model_config.quantization_config["quant_method"] == "gptq"
    )

    if cfg.gptq and not quant_config_method_is_gptq:
        raise ValueError(
            "model_config.quantization_config is not set or quant_method is not set to gptq. "
            "Please make sure to point to a GPTQ model."
        )

    lora_modules_to_save = get_linear_embedding_layers(model_config.model_type)
    if (
        cfg.adapter
        and cfg.tokens
        and (
            not cfg.lora_modules_to_save
            or not all(x in cfg.lora_modules_to_save for x in lora_modules_to_save)
        )
    ):
        lora_modules_to_save_joined = ", ".join(
            map(lambda x: f"`{x}`", lora_modules_to_save)
        )
        raise ValueError(
            "`lora_modules_to_save` not properly set when adding new tokens. "
            f"Please include [{lora_modules_to_save_joined}] in `lora_modules_to_save`."
        )

    if (
        cfg.tensor_parallel_size
        and cfg.tensor_parallel_size > 1
        and hasattr(model_config, "tie_word_embeddings")
        and model_config.tie_word_embeddings
    ):
        raise ValueError(
            "Tensor parallelism is incompatible with models configured with `tie_word_embeddings` enabled. "
            "Please use a model without `tie_word_embeddings`, or disable tensor parallelism."
        )


def load_model_config(cfg: DictDefault) -> PretrainedConfig | addict.Dict:
    """Loads and configures a model configuration from HuggingFace or local sources.

    This function determines the appropriate model config source, loads it, applies any
    necessary overrides, and validates it for compatibility with the `axolotl` config.

    If `cfg.cls_model_config` is set, a custom config class from transformers will be
    used instead of `AutoConfig` (e.g., 'LlamaConfig', 'MistralConfig').

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.

    Returns:
        A configured model configuration object (`AutoConfig` instance), or a simple
            dictionary configuration for special cases like Mamba models.

    Raises:
        ValueError: If configuration loading fails for reasons other than special cases
            that are handled (e.g., Mamba models).
    """
    model_config_name = cfg.base_model_config or cfg.base_model
    if not model_config_name and cfg.tokenizer_config:
        model_config_name = cfg.tokenizer_config
    trust_remote_code = cfg.trust_remote_code is True
    config_kwargs = {}
    if cfg.revision_of_model:
        config_kwargs["revision"] = cfg.revision_of_model
    if cfg.num_labels:
        # num_labels is used to initialize classifier models
        config_kwargs["num_labels"] = cfg.num_labels

    config_cls = AutoConfig
    if cfg.cls_model_config:
        config_cls = getattr(transformers, cfg.cls_model_config)

    try:
        model_config = config_cls.from_pretrained(
            model_config_name,
            trust_remote_code=trust_remote_code,
            **config_kwargs,
        )
    except ValueError as error:
        if "mamba" in model_config_name:
            return addict.Dict(
                {
                    "model_type": "mamba",
                }
            )
        raise error

    if cfg.overrides_of_model_config:
        for key, val in cfg.overrides_of_model_config.items():
            setattr(model_config, key, val)

    check_model_config(cfg, model_config)

    return model_config


def ensure_dtype(model: PreTrainedModel, dtype: torch.dtype = torch.bfloat16):
    """Ensures all modules in the model are converted to the specified data type."""
    for name, module in model.named_modules():
        weight_mismatch = False
        with contextlib.suppress(AttributeError):
            weight_mismatch = module.weight.dtype != dtype

        bias_mismatch = False
        with contextlib.suppress(AttributeError):
            bias_mismatch = module.bias.dtype != dtype

        if weight_mismatch:
            LOG.debug(
                f"Converting module {name}.weight: {module.weight.dtype} -> {dtype}"
            )
        if bias_mismatch:
            LOG.debug(f"Converting module {name}.bias: {module.bias.dtype} -> {dtype}")
        if weight_mismatch or bias_mismatch:
            module.to(dtype)


def get_linear_embedding_layers(model_type: str) -> list[str]:
    """Returns layer names of linear embeddings needed for LoRA based on model type."""
    if model_type == "gpt_neox":
        return ["embed_in", "embed_out"]
    if model_type == "falcon":
        return ["word_embeddings", "lm_head"]
    return ["embed_tokens", "lm_head"]


================================================
FILE: src/axolotl/logging_config.py
================================================
"""Common logging module for axolotl."""

import logging
import os
from logging import Formatter, Logger, LogRecord
from logging.config import dictConfig
from typing import Any, Dict

from colorama import Fore, Style, init

DEFAULT_AXOLOTL_LOG_LEVEL = "INFO"
DEFAULT_LOG_LEVEL = "WARNING"


class AxolotlOrWarnErrorFilter(logging.Filter):
    """
    Allows ANY WARNING or higher (unless overridden by LOG_LEVEL). Allows axolotl.* at
    INFO or higher (unless overridden by AXOLOTL_LOG_LEVEL). Drops all other records
    (i.e. non-axolotl.INFO, DEBUG, etc. by default).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

        axolotl_log_level = os.getenv(
            "AXOLOTL_LOG_LEVEL", DEFAULT_AXOLOTL_LOG_LEVEL
        ).upper()
        other_log_level = os.getenv("LOG_LEVEL", DEFAULT_LOG_LEVEL).upper()

        try:
            # py311+ only
            level_mapping = logging.getLevelNamesMapping()
            self.axolotl_level = level_mapping[axolotl_log_level]
            self.other_level = level_mapping[other_log_level]
        except AttributeError:
            # For py310, use getLevelName directly
            self.axolotl_level = logging.getLevelName(axolotl_log_level)
            self.other_level = logging.getLevelName(other_log_level)

    def filter(self, record: LogRecord) -> bool:
        # General filter
        if record.levelno >= self.other_level:
            return True

        # Axolotl filter
        return (
            record.name.startswith("axolotl") and record.levelno >= self.axolotl_level
        )


class AxolotlLogger(Logger):
    """Logger that applies filtering to non-axolotl loggers."""

    def __init__(self, name: str, level: int = logging.NOTSET):
        super().__init__(name, level)
        if not name.startswith("axolotl"):
            self.addFilter(AxolotlOrWarnErrorFilter())


class ColorfulFormatter(Formatter):
    """
    Formatter to add coloring to log messages by log type
    """

    COLORS = {
        "WARNING": Fore.YELLOW,
        "ERROR": Fore.RED,
        "CRITICAL": Fore.RED + Style.BRIGHT,
    }

    def format(self, record):
        record.rank = int(os.getenv("LOCAL_RANK", "0"))
        record.rank_fmt = f" [RANK:{record.rank}]" if record.rank != 0 else ""
        log_message = super().format(record)
        return self.COLORS.get(record.levelname, "") + log_message + Fore.RESET


DEFAULT_LOGGING_CONFIG: Dict[str, Any] = {
    "version": 1,
    "disable_existing_loggers": False,
    "formatters": {
        "simple": {
            "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d] %(message)s",
        },
        "colorful": {
            "()": ColorfulFormatter,
            "format": "[%(asctime)s] [%(levelname)s] [%(name)s.%(funcName)s:%(lineno)d] [PID:%(process)d]%(rank_fmt)s %(message)s",
        },
        "concise": {
            "format": "[%(asctime)s] [%(levelname)s] [%(name)s] %(message)s",
        },
        "concise_color": {
            "()": ColorfulFormatter,
            "format": "[%(asctime)s] [%(levelname)s] [%(name)s]%(rank_fmt)s %(message)s",
        },
    },
    "filters": {
        "ax_or_warn": {
            "()": "axolotl.logging_config.AxolotlOrWarnErrorFilter",
        },
    },
    "handlers": {
        "console": {
            "class": "logging.StreamHandler",
            "formatter": "concise",
            "filters": ["ax_or_warn"],
            "stream": "ext://sys.stdout",
        },
        "color_console": {
            "class": "logging.StreamHandler",
            "formatter": "concise_color",
            "filters": ["ax_or_warn"],
            "stream": "ext://sys.stdout",
        },
        "ax_file_only": {
            "class": "logging.StreamHandler",
            "level": "DEBUG",
            "formatter": "simple",
            "stream": "ext://axolotl.utils.tee.file_only_stream",
        },
        "root_file_only": {
            "class": "logging.StreamHandler",
            "level": "DEBUG",
            "formatter": "simple",
            "stream": "ext://axolotl.utils.tee.file_only_stream",
        },
    },
    "root": {
        "handlers": ["console", "root_file_only"],
        "level": os.getenv("LOG_LEVEL", DEFAULT_LOG_LEVEL).upper(),
    },
    "loggers": {
        "axolotl": {
            "handlers": ["color_console", "ax_file_only"],
            "level": os.getenv("AXOLOTL_LOG_LEVEL", DEFAULT_AXOLOTL_LOG_LEVEL).upper(),
            "propagate": False,
        },
    },
}


def configure_logging():
    """Configure with default logging"""
    init()  # Initialize colorama

    dictConfig(DEFAULT_LOGGING_CONFIG)
    logging.setLoggerClass(AxolotlLogger)

    # Route Python warnings through logging so they reach file handlers
    logging.captureWarnings(True)

    # Set default `ACCELERATE_LOG_LEVEL` to `LOG_LEVEL` if available and not set
    if "ACCELERATE_LOG_LEVEL" not in os.environ:
        os.environ["ACCELERATE_LOG_LEVEL"] = os.getenv(
            "LOG_LEVEL", DEFAULT_LOG_LEVEL
        ).upper()


================================================
FILE: src/axolotl/models/__init__.py
================================================


================================================
FILE: src/axolotl/models/mamba/__init__.py
================================================
"""
Modeling module for Mamba models
"""

import importlib


def check_mamba_ssm_installed():
    mamba_ssm_spec = importlib.util.find_spec("mamba_ssm")
    if mamba_ssm_spec is None:
        raise ImportError(
            "MambaLMHeadModel requires mamba_ssm. Please install it with `pip install -e .[mamba-ssm]`"
        )


def fix_mamba_attn_for_loss():
    check_mamba_ssm_installed()

    from mamba_ssm.models import mixer_seq_simple

    from .modeling_mamba import MambaLMHeadModel as MambaLMHeadModelFixed

    mixer_seq_simple.MambaLMHeadModel = MambaLMHeadModelFixed
    return mixer_seq_simple.MambaLMHeadModel


================================================
FILE: src/axolotl/models/mamba/configuration_mamba.py
================================================
"""
HF Transformers MambaConfig
"""

from transformers import PretrainedConfig


class MambaConfig(PretrainedConfig):
    """
    modeling configuration for state space model/mamba
    """

    model_type = "mamba"

    def __init__(
        self,
        vocab_size=50280,
        d_model=2560,
        n_layer=64,
        rms_norm=True,
        residual_in_fp32=True,
        fused_add_norm=True,
        pad_vocab_size_multiple=8,
        pad_token_id=50277,
        bos_token_id=0,
        eos_token_id=0,
        tie_word_embeddings=False,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.n_layer = n_layer
        self.rms_norm = rms_norm
        self.residual_in_fp32 = residual_in_fp32
        self.fused_add_norm = fused_add_norm
        self.pad_vocab_size_multiple = pad_vocab_size_multiple
        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


================================================
FILE: src/axolotl/models/mamba/modeling_mamba.py
================================================
import os
from collections import namedtuple
from functools import partial
from typing import Optional, Union

import torch
from mamba_ssm.models.mixer_seq_simple import MixerModel, _init_weights
from mamba_ssm.utils.generation import GenerationMixin
from mamba_ssm.utils.hf import load_config_hf, load_state_dict_hf
from torch import nn
from torch.nn import CrossEntropyLoss

from axolotl.models.mamba.configuration_mamba import MambaConfig


class MambaLMHeadModel(nn.Module, GenerationMixin):
    def __init__(
        self,
        d_model: int,
        n_layer: int,
        vocab_size: int,
        initializer_cfg=None,
        pad_vocab_size_multiple: int = 1,
        device=None,
        dtype=None,
        **backbone_kwargs,
    ) -> None:
        factory_kwargs = {"device": device, "dtype": dtype}
        super().__init__()
        if vocab_size % pad_vocab_size_multiple != 0:
            vocab_size += pad_vocab_size_multiple - (
                vocab_size % pad_vocab_size_multiple
            )
        self.config = MambaConfig(
            vocab_size=vocab_size,
            d_model=d_model,
            n_layer=n_layer,
            pad_vocab_size_multiple=pad_vocab_size_multiple,
        )
        self.backbone = MixerModel(
            d_model=d_model,
            n_layer=n_layer,
            vocab_size=vocab_size,
            initializer_cfg=initializer_cfg,
            **backbone_kwargs,
            **factory_kwargs,
        )
        self.lm_head = nn.Linear(d_model, vocab_size, bias=False, **factory_kwargs)

        # Initialize weights and apply final processing
        self.apply(
            partial(
                _init_weights,
                n_layer=n_layer,
                **(initializer_cfg if initializer_cfg is not None else {}),
            )
        )
        self.tie_weights()

    def tie_weights(self):
        self.lm_head.weight = self.backbone.embedding.weight

    def allocate_inference_cache(self, batch_size, max_seqlen, dtype=None, **kwargs):
        return self.backbone.allocate_inference_cache(
            batch_size, max_seqlen, dtype=dtype, **kwargs
        )

    def forward(
        self,
        input_ids,
        position_ids=None,
        inference_params=None,
        num_last_tokens=0,
        labels=None,
        **kwargs,
    ):
        """
        "position_ids" is just to be compatible with Transformer generation. We don't use it.
        num_last_tokens: if > 0, only return the logits for the last n tokens
        """
        hidden_states = self.backbone(input_ids, inference_params=inference_params)
        if num_last_tokens > 0:
            hidden_states = hidden_states[:, -num_last_tokens:]
        lm_logits = self.lm_head(hidden_states)

        CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
        return CausalLMOutput(logits=lm_logits)

        loss = None
        if labels is not None:
            logits = lm_logits
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            shift_logits = shift_logits.view(-1, self.config.vocab_size)
            shift_labels = shift_labels.view(-1)
            # Enable model parallelism
            shift_labels = shift_labels.to(shift_logits.device)
            loss = loss_fct(shift_logits, shift_labels)
            CausalLMOutput = namedtuple("CausalLMOutput", ["logits", "loss"])
            print(loss)
            return CausalLMOutput(logits=lm_logits, loss=loss)

        else:
            CausalLMOutput = namedtuple("CausalLMOutput", ["logits"])
            return CausalLMOutput(logits=lm_logits)

    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        state_dict: Optional[dict] = None,
        **kwargs,
    ):
        if state_dict is None:
            state_dict = self.state_dict()
        torch.save(state_dict, os.path.join(save_directory, "pytorch_model.bin"))

    @classmethod
    def from_pretrained(cls, pretrained_model_name, device=None, dtype=None, **kwargs):
        config = load_config_hf(pretrained_model_name)
        model = cls(**config, device=device, dtype=dtype, **kwargs)
        model.load_state_dict(
            load_state_dict_hf(pretrained_model_name, device={"": device}, dtype=dtype)
        )
        return model


================================================
FILE: src/axolotl/monkeypatch/__init__.py
================================================


================================================
FILE: src/axolotl/monkeypatch/accelerate/__init__.py
================================================


================================================
FILE: src/axolotl/monkeypatch/accelerate/fsdp2.py
================================================
"""
monkeypatch for accelerate fsdp2 fix when modifying ordereddict during interation, and saving full state dicts
"""

import copy
import functools
import os
import sys

import torch
import torch.distributed as dist
from torch import nn

from axolotl.utils.bench import log_gpu_memory_usage
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def fsdp2_load_full_state_dict(
    _accelerator, model: torch.nn.Module, full_sd: dict, offload_to_cpu: bool = False
):
    """
    Loads the full state dict (could be only on rank 0) into the sharded model. This is done by broadcasting the
    parameters from rank 0 to all other ranks. This function modifies the model in-place.
    Args:
        accelerator (`Accelerator`): The accelerator instance
        model (`torch.nn.Module`):
            The model to load the state dict into, expected to be on meta device or a VRAM spike can occur
        full_sd (`dict`): The full state dict to load, can only be on rank 0
    """
    from torch.distributed.tensor import distribute_tensor

    LOG.info("Broadcasting full state dict to all ranks...")
    import time

    start_time = time.time()

    meta_sharded_sd = model.state_dict()
    sharded_sd = {}
    for param_name, sharded_meta_param in meta_sharded_sd.items():
        full_tensor = None
        if _accelerator.is_main_process:
            full_tensor = full_sd[param_name]
            full_tensor = full_tensor.to(sharded_meta_param.dtype)

        if hasattr(sharded_meta_param, "device_mesh"):
            device_mesh = sharded_meta_param.device_mesh
            if _accelerator.is_main_process:
                full_tensor = full_tensor.to(device_mesh.device_type)
            else:
                full_tensor = torch.empty(
                    sharded_meta_param.size(),
                    device=device_mesh.device_type,
                    dtype=sharded_meta_param.dtype,
                )
            sharded_param = distribute_tensor(
                full_tensor,
                device_mesh,
                sharded_meta_param.placements,
                src_data_rank=0,
            )
        else:
            # Non-sharded parameters
            if _accelerator.is_main_process:
                sharded_param = full_tensor.to(torch.device("cuda"))
            else:
                # broadcast manually
                sharded_param = torch.empty_like(
                    sharded_meta_param,
                    device=torch.device("cuda"),
                    dtype=sharded_meta_param.dtype,
                )
            dist.broadcast(sharded_param, src=0)

        if offload_to_cpu:
            sharded_param = sharded_param.cpu()

        sharded_sd[param_name] = nn.Parameter(sharded_param)

        del full_tensor
        full_sd[param_name] = None

    model.load_state_dict(sharded_sd, assign=True, strict=True)
    end_time = time.time()
    LOG.debug(
        f"Time taken to load full state dict: {(end_time - start_time):.2f} seconds"
    )
    log_gpu_memory_usage(LOG, "Memory usage after broadcasting full state dict", 0)
    return model


def get_state_dict(self, model, unwrap=True):
    """
    Returns the state dictionary of a model sent through [`Accelerator.prepare`] potentially without full
    precision.

    Args:
        model (`torch.nn.Module`):
            A PyTorch model sent through [`Accelerator.prepare`]
        unwrap (`bool`, *optional*, defaults to `True`):
            Whether to return the original underlying state_dict of `model` or to return the wrapped state_dict

    Returns:
        `dict`: The state dictionary of the model potentially without full precision.

    Example:

    ```python
    >>> import torch
    >>> from accelerate import Accelerator

    >>> accelerator = Accelerator()
    >>> net = torch.nn.Linear(2, 2)
    >>> net = accelerator.prepare(net)
    >>> state_dict = accelerator.get_state_dict(net)
    ```
    """
    from accelerate import DistributedType
    from accelerate.utils import compare_versions

    if self.distributed_type == DistributedType.DEEPSPEED:
        zero3_sharding = self.deepspeed_config["zero_optimization"]["stage"] == 3
        tp_sharding = (
            self.deepspeed_config.get("tensor_parallel", {}).get("autotp_size", 0) > 1
        )
        if zero3_sharding or tp_sharding:
            if model.zero_gather_16bit_weights_on_model_save():
                if tp_sharding and not compare_versions("deepspeed", ">=", "0.16.4"):
                    raise ImportError(
                        "Deepspeed TP requires deepspeed >= 0.16.4, Please update DeepSpeed via `pip install deepspeed -U`."
                    )
                state_dict = (
                    model._consolidated_16bit_state_dict()
                    if tp_sharding
                    else model._zero3_consolidated_16bit_state_dict()
                )
            else:
                raise ValueError(
                    "Cannot get 16bit model weights because `stage3_gather_16bit_weights_on_model_save` in DeepSpeed config is False. "
                    "To save the model weights in 16bit, set `stage3_gather_16bit_weights_on_model_save` to True in DeepSpeed config file or "
                    "set `zero3_save_16bit_model` to True when using `accelerate config`. "
                    "To save the full checkpoint, run `model.save_checkpoint(save_dir)` and use `zero_to_fp32.py` to recover weights."
                )
        else:
            from deepspeed.checkpoint.utils import clone_tensors_for_torch_save

            state_dict = clone_tensors_for_torch_save(
                self.unwrap_model(model).state_dict()
            )
    elif self.is_fsdp2:
        # https://github.com/pytorch/torchtune/blob/main/torchtune/training/_distributed.py#L465
        from torch.distributed.tensor import DTensor

        state_dict = {}
        sharded_state_dict = model.state_dict()
        for param_name, param in sharded_state_dict.items():
            if param.is_cpu:
                param = param.to(torch.device("cuda"))

            if isinstance(param, DTensor):
                param = param.full_tensor()

            if torch.distributed.get_rank() == 0:
                state_dict[param_name] = param.cpu()
            torch.distributed.barrier()
    elif self.distributed_type == DistributedType.FSDP:
        from torch.distributed.fsdp import (
            FullStateDictConfig,
            FullyShardedDataParallel as FSDP,
            StateDictType,
        )

        full_state_dict_config = FullStateDictConfig(
            offload_to_cpu=True, rank0_only=True
        )
        with FSDP.state_dict_type(
            model, StateDictType.FULL_STATE_DICT, full_state_dict_config
        ):
            state_dict = model.state_dict()
    else:
        if unwrap:
            model = self.unwrap_model(model)
        state_dict = model.state_dict()

    return state_dict


def patch_peft_param_wrapper_for_fsdp2():
    """Patch PEFT's _LoraParameterProxy.forward for FSDP2 DTensor compatibility.

    PEFT's ParamWrapper applies LoRA via torch.nn.utils.parametrize, which adds
    delta_weight to the base weight W inside _LoraParameterProxy.forward().
    Under FSDP2, W may be a DTensor (from FSDP unshard) while delta_weight is a
    regular Tensor (or vice versa), causing a RuntimeError on mixed types.

    This patch promotes the non-DTensor operand to match the DTensor's spec
    using DTensor.from_local(), which is free for Replicate placement (just
    metadata wrapping, no communication).
    """
    from peft.tuners.lora.layer import _LoraParameterProxy

    if getattr(_LoraParameterProxy, "_axolotl_fsdp2_patched", False):
        return

    _original_forward = _LoraParameterProxy.forward

    # NOTE: Replaces (not wraps) forward; assumes original is just `W + self.delta_weight`.
    def _patched_forward(self, W):
        from torch.distributed.tensor import DTensor

        delta = self.delta_weight
        w_is_dt = isinstance(W, DTensor)
        d_is_dt = isinstance(delta, DTensor)

        with torch.nn.utils.parametrize.cached():
            if w_is_dt == d_is_dt:
                return W + delta
            if w_is_dt:
                return W + DTensor.from_local(delta, W.device_mesh, W.placements)
            return DTensor.from_local(W, delta.device_mesh, delta.placements) + delta

    _LoraParameterProxy.forward = _patched_forward
    _LoraParameterProxy._axolotl_fsdp2_patched = True
    LOG.info("Patched PEFT _LoraParameterProxy.forward for FSDP2 DTensor compatibility")


def _process_lora_module_for_fsdp(module, fsdp2_kwargs):
    """Helper function to process LoRA modules for FSDP2."""
    from peft.tuners.lora.layer import ParamWrapper
    from torch.distributed.fsdp import fully_shard

    # Skip ParamWrapper — its lora_A/B must not be independently sharded.
    # The parent decoder layer's FSDP wrapper handles unsharding them.
    # TODO: review if we even need to shard them separately in first place.
    if isinstance(module, ParamWrapper):
        return False

    log_bias_dtype_mismatch = False

    # Linear4Bit will keep it's bias term in fp32. If the weight dtype is in bf16 we are not able to
    # wrap this. Therefore we must ensure the bias has the same dtype as the weight
    if hasattr(module.base_layer, "bias") and module.base_layer.bias is not None:
        if module.base_layer.weight.dtype != module.base_layer.bias.dtype:
            log_bias_dtype_mismatch = True
            module.base_layer.bias.data = module.base_layer.bias.data.to(
                module.base_layer.weight.dtype
            )

    for active_adapter in module.active_adapters:
        if module.lora_A:
            fully_shard(module.lora_A[active_adapter], **fsdp2_kwargs)
        if module.lora_B:
            fully_shard(module.lora_B[active_adapter], **fsdp2_kwargs)
        if module.lora_magnitude_vector:
            fully_shard(module.lora_magnitude_vector[active_adapter], **fsdp2_kwargs)

    # lora_embedding_A/B are ParameterDicts containing nn.Parameter (Tensors),
    # not nn.Module. fully_shard() only accepts nn.Module, so we cannot shard
    # individual embedding Parameters. Instead, shard the entire LoraLayer module. fully_shard() can be used hierarchically because it does not
    # override groups already assigned by fully_shard(), so modules
    # where fully_shard() was already called are not affected [see https://docs.pytorch.org/docs/stable/distributed.fsdp.fully_shard.html]
    if module.lora_embedding_A or module.lora_embedding_B:
        from torch.distributed.fsdp import FSDPModule

        if not isinstance(module, FSDPModule):
            fully_shard(module, **fsdp2_kwargs)

    return log_bias_dtype_mismatch


def fsdp2_prepare_model(accelerator, model: torch.nn.Module) -> torch.nn.Module:
    """Prepares the model for FSDP2 in-place. Also returns the model to avoid misuse of the original model.

    Args:
        accelerator (`Accelerator`): The accelerator instance
        model (`torch.nn.Module`): The model to prepare

    Returns:
        `torch.nn.Module`: Prepared model
    """
    from accelerate.utils import get_module_children_bottom_up, is_compiled_module
    from accelerate.utils.fsdp_utils import fsdp2_prepare_auto_wrap_policy
    from accelerate.utils.modeling import get_non_persistent_buffers
    from peft import PeftModel
    from peft.tuners.lora import LoraLayer
    from torch.distributed.fsdp import (
        CPUOffloadPolicy,
        FSDPModule,
        MixedPrecisionPolicy,
        fully_shard,
    )

    is_type_fsdp = isinstance(model, FSDPModule) or (
        is_compiled_module(model) and isinstance(model._orig_mod, FSDPModule)
    )
    if is_type_fsdp:
        return model

    fsdp2_plugin = accelerator.state.fsdp_plugin

    original_sd = model.state_dict()

    from torch.distributed.fsdp.wrap import (
        size_based_auto_wrap_policy,
        transformer_auto_wrap_policy,
    )

    # We need the `auto_wrap_policy` original type to create a custom poilicy function for sharding
    # This is because `fully_shard` doesn't support old auto wrap policies, rather we have to imitate the behaviour
    if fsdp2_plugin.auto_wrap_policy is transformer_auto_wrap_policy:
        pass  # auto_wrap_policy_type = "transformer"
    elif fsdp2_plugin.auto_wrap_policy is size_based_auto_wrap_policy:
        pass  # auto_wrap_policy_type = "size"

    # We set `auto_wrap_policy` to `functools.partial` to avoid creating it again
    # This is because of `apply_activation_checkpointing` which will can reuse this function
    fsdp2_plugin.set_auto_wrap_policy(model)

    if fsdp2_plugin.activation_checkpointing:
        from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
            CheckpointImpl,
            apply_activation_checkpointing,
            checkpoint_wrapper,
        )

        # Apply activation checkpointing before applying `fully_shard`
        apply_activation_checkpointing(
            model,
            checkpoint_wrapper_fn=functools.partial(
                checkpoint_wrapper,
                checkpoint_impl=CheckpointImpl.NO_REENTRANT,
            ),
            auto_wrap_policy=fsdp2_plugin.auto_wrap_policy,
        )

    mesh = getattr(accelerator.state, "device_mesh", None)

    # Disable memory pinning if requested
    offload_to_cpu = isinstance(fsdp2_plugin.cpu_offload, CPUOffloadPolicy)
    if offload_to_cpu and os.environ.get("FSDP_CPU_OFFLOAD_PIN_MEMORY", "") == "false":
        fsdp2_plugin.cpu_offload.pin_memory = False

    fsdp2_kwargs = {
        "reshard_after_forward": fsdp2_plugin.reshard_after_forward,
        "offload_policy": fsdp2_plugin.cpu_offload,
        # `fully_shard` doesn't accept `None` in case of `MixedPrecisionPolicy`
        "mp_policy": fsdp2_plugin.mixed_precision_policy or MixedPrecisionPolicy(),
        "mesh": (
            mesh[tuple(accelerator.state.parallelism_config.fsdp_dim_names)]
            if mesh is not None
            else None
        ),
    }
    model_has_params4bit = False
    for _, param in model.named_parameters():
        # this is a temporary fix whereby loading models with bnb params cannot be moved from
        # GPU to a meta device due with FSDP2 because torch operations don't return the original class type
        # bypassing the move to meta will still cause the VRAM spike, but at least it still will load
        if param.__class__.__name__ == "Params4bit":
            model_has_params4bit = True
            break

    if fsdp2_plugin.cpu_ram_efficient_loading and not model_has_params4bit:
        # Context: `fully_shard` moves the model to GPU if it was on CPU, however it can also be on `meta` and then it stays there even after `fully_shard`
        # For this reason, we need to move the model to `meta` device, as then sharding happens on `meta` device
        # If we kept the model on CPU (`cpu_ram_efficient_loading` has model be on CPU on all ranks, though non-main ranks only have `torch.emtpy`), `fully_shard` would move it to GPU
        # Afterwards, when we call `fsdp2_load_full_state_dict`, us creating the state_dict would result into briefly having two copies of model state_dict on the GPU -> VRAM spike

        # We need to keep the original non-persistent buffers, as those MAY not be in the state_dict, resulting in them staying on meta device
        # Also, these buffers aren't getting sharded by default
        # We get the FQNs of all non-persistent buffers, to re-register them after
        non_persistent_buffer_fqns = get_non_persistent_buffers(
            model, recurse=True, fqns=True
        )
        original_non_persistent_buffers = copy.deepcopy(
            {k: v for k, v in model.named_buffers() if k in non_persistent_buffer_fqns}
        )
        # We move the model to meta device, as then sharding happens on meta device
        model = model.to(torch.device("meta"))
        # We need to re-tie the weights, not exactly sure why, but if we don't do this, reference to `lm_head/embed_tokens` stay hanging -> more VRAM usage
        # We assume `transformers` models have a `tie_weights` method if they support it
        if hasattr(model, "tie_weights"):
            model.tie_weights()

    is_peft_model = isinstance(model, PeftModel)

    # Patch PEFT's _LoraParameterProxy for DTensor compatibility if any
    # ParamWrapper modules exist (used for target_parameters / 3D expert params).
    if is_peft_model:
        from peft.tuners.lora.layer import ParamWrapper

        if any(isinstance(m, ParamWrapper) for m in model.modules()):
            patch_peft_param_wrapper_for_fsdp2()

    auto_wrap_policy = fsdp2_prepare_auto_wrap_policy(fsdp2_plugin, model)
    log_bias_dtype_mismatch = False
    if auto_wrap_policy is not None:
        for module in get_module_children_bottom_up(model)[:-1]:
            if is_peft_model and isinstance(module, LoraLayer):
                module_log_bias_mismatch = _process_lora_module_for_fsdp(
                    module, fsdp2_kwargs
                )
                log_bias_dtype_mismatch |= module_log_bias_mismatch
            if auto_wrap_policy(module) and not isinstance(module, FSDPModule):
                fully_shard(module, **fsdp2_kwargs)

    fully_shard(model, **fsdp2_kwargs)

    if log_bias_dtype_mismatch:
        LOG.warning(
            "Bias dtype mismatch detected in LoRA base linear layer. Bias parameters have been cast to weight dtype."
        )

    if fsdp2_plugin.cpu_ram_efficient_loading:
        fsdp2_load_full_state_dict(
            accelerator, model, original_sd, offload_to_cpu=offload_to_cpu
        )

    if fsdp2_plugin.cpu_ram_efficient_loading and not model_has_params4bit:
        # We re-register the buffers, as they may not be in the state_dict
        for fqn, buffer_tensor in original_non_persistent_buffers.items():
            buffer_tensor = buffer_tensor.to(accelerator.device)

            if "." in fqn:
                parent_fqn, local_buffer_name = fqn.rsplit(".", 1)
                parent_module = model.get_submodule(parent_fqn)
            else:
                local_buffer_name = fqn
                parent_module = model

            parent_module.register_buffer(
                local_buffer_name, buffer_tensor, persistent=False
            )

        # We need to tie the weights again, as call to `load_full_state_dict` breaks the tie
        # Needs to be called both here and above
        # removing this call makes the have slightly different loss
        # removing the call above leads to extra memory usage as explained in the comment above
        if hasattr(model, "tie_weights"):
            model.tie_weights()
    return model


def patch_tied_keys_for_meta_device():
    """Patch _adjust_tied_keys_with_tied_pointers to skip meta tensors.

    Meta tensors all share data_ptr()==0, causing every parameter to be incorrectly
    grouped as "tied". Skipping them is safe since they have no real storage.
    """
    from collections import defaultdict

    from transformers import PreTrainedModel

    def _patched_adjust_tied_keys_with_tied_pointers(self, missing_keys):
        param_pointers = defaultdict(list)
        for param_name, param_value in self.state_dict().items():
            if param_value.is_meta:
                continue
            param_pointers[param_value.data_ptr()].append(param_name)

        tied_param_names = [
            names
            for names in param_pointers.values()
            if len(names) > 1
            and not any(name in self.all_tied_weights_keys.keys() for name in names)
            and not all(name in missing_keys for name in names)
        ]

        tied_weights_keys_by_pointers = {
            param_name: group[0]
            for group in tied_param_names
            for param_name in group[1:]
        }
        self.all_tied_weights_keys.update(tied_weights_keys_by_pointers)

    PreTrainedModel._adjust_tied_keys_with_tied_pointers = (
        _patched_adjust_tied_keys_with_tied_pointers
    )


def patch_initialize_missing_keys_for_fsdp():
    """Patch _initialize_missing_keys to skip re-initialization on FSDP non-rank-0.

    When using cpu_ram_efficient_loading, non-rank-0 processes load weights on
    meta device and move them to CPU as empty tensors. Without this patch,
    initialize_weights() re-initializes ALL parameters (via guarded init
    functions), which is slow and uses extra RAM per process.

    The fix marks all params/buffers with _is_hf_initialized=True before calling
    the original method, so guarded init functions (init.normal_, init.zeros_,
    etc.) become no-ops on non-rank-0 processes. The real weights arrive later
    via FSDP broadcast from rank 0.

    Upstream fix: https://github.com/huggingface/transformers/pull/44473
    Remove this patch once transformers includes the fix in a stable release.
    """
    from transformers import PreTrainedModel
    from transformers.modeling_utils import is_fsdp_enabled, is_local_dist_rank_0

    if getattr(PreTrainedModel._initialize_missing_keys, "_axolotl_patched", False):
        return

    _original_initialize_missing_keys = PreTrainedModel._initialize_missing_keys

    def _patched_initialize_missing_keys(self, is_quantized: bool) -> None:
        if is_fsdp_enabled() and not is_local_dist_rank_0():
            for key in self.state_dict():
                try:
                    param_or_buffer = self.get_parameter_or_buffer(key)
                    param_or_buffer._is_hf_initialized = True
                except AttributeError:
                    pass  # may happen when handling pre-quantized weights
            self._is_hf_initialized = True

        _original_initialize_missing_keys(self, is_quantized)

    PreTrainedModel._initialize_missing_keys = _patched_initialize_missing_keys
    PreTrainedModel._initialize_missing_keys._axolotl_patched = True


def patch_accelerate_fsdp2():
    import accelerate

    accelerate.accelerator.fsdp2_prepare_model = fsdp2_prepare_model
    accelerate.Accelerator.get_state_dict = get_state_dict
    setattr(
        sys.modules["accelerate"],
        "Accelerator.get_state_dict",
        get_state_dict,
    )


================================================
FILE: src/axolotl/monkeypatch/accelerate/parallelism_config.py
================================================
"""
workaround to allow parallelism config for pure CP
"""

import os
import warnings

from accelerate import DistributedType


def _validate_accelerator(self, accelerator):
    _warnings = set()
    if not accelerator.multi_device and self.total_size == 1:
        # No distributed setup, valid parallelism config
        return

    # We need this to ensure DDP works
    if self.total_size == 1:
        self._set_size("dp_replicate", accelerator.num_processes)

    if self.total_size != accelerator.num_processes:
        raise ValueError(
            f"ParallelismConfig total_size ({self.total_size}) does not match "
            f"num_processes ({accelerator.num_processes}). Please adjust dp_replicate_size/ "
            f"dp_shard_size/tp_size/cp_size."
        )

    # allow parallelism config when not using fsdp if using pure context parallelism
    allow_parallelism_config = False

    if (
        self.cp_size > 1
        and self.dp_shard_size <= 1
        and os.environ.get("ACCELERATE_ALLOW_CP_STANDALONE", "false").lower() == "true"
    ):
        allow_parallelism_config = True

    if (
        self.total_size > 1
        and not allow_parallelism_config
        and not (accelerator.is_fsdp2 or accelerator.multi_device)
    ):
        raise ValueError(
            f"ParallelismConfig is only compatible DistributedType.FSDP (version 2) or DistributedType.Multi{{Device}}, but got {accelerator.distributed_type}."
        )

    for parallelism, size in self._sizes.items():
        if size == 1 and getattr(self, f"{parallelism}_handler", None) is not None:
            _warnings.add(
                f"ParallelismConfig.{parallelism}_handler is set, but {parallelism}_size is set to 1. This handler will be ignored."
            )

    if _warnings and accelerator.is_main_process:
        warnings.warn(
            "ParallelismConfig has the following warnings:\n" + "\n".join(_warnings),
            UserWarning,
            stacklevel=2,
        )


def patched_is_fsdp2(self) -> bool:
    """
    Patched version of is_fsdp2 that guards against a None fsdp_plugin.
    """
    # The new logic checks if fsdp_plugin exists before accessing its attributes
    return (
        self.distributed_type == DistributedType.FSDP
        and self.fsdp_plugin
        and self.fsdp_plugin.fsdp_version == 2
    )


def patch_parallelism_config():
    from accelerate.accelerator import AcceleratorState, ParallelismConfig

    ParallelismConfig._validate_accelerator = _validate_accelerator
    AcceleratorState.is_fsdp2 = property(patched_is_fsdp2)


def patch_prepare_cp():
    import contextlib

    from accelerate import Accelerator

    def patched_prepare_cp(self, *args):
        if self.parallelism_config.cp_backend == "deepspeed":
            return args

        @contextlib.contextmanager
        def _noop_cp_context(
            buffers=None, buffer_seq_dims=None, no_restore_buffers=None
        ):
            yield

        self._cp_context = _noop_cp_context
        return args

    Accelerator._prepare_cp = patched_prepare_cp


================================================
FILE: src/axolotl/monkeypatch/attention/__init__.py
================================================
"""
attention module for attention monkeypatches
"""

from transformers.integrations.flash_attention import flash_attention_forward


def patch_xformers_attn_over_fa2():
    from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

    from .xformers import xformers_attention_forward

    ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = xformers_attention_forward


def unpatch_xformers_attn_over_fa2():
    from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

    ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = flash_attention_forward()


================================================
FILE: src/axolotl/monkeypatch/attention/flash_attn_4.py
================================================
"""Transparently upgrade FA2 to FA4 when available on SM90+ hardware."""

import torch

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def _get_head_dims(model_config):
    """Extract (head_dim, head_dim_v) from a model config.

    Handles composite models (e.g. Qwen3.5 VL) via text_config and
    MLA models (DeepSeek/Kimi) that have separate Q/V head dimensions.
    """
    cfg = model_config
    if hasattr(cfg, "text_config"):
        cfg = cfg.text_config

    # MLA models: Q head_dim = qk_nope + qk_rope, V head_dim = v_head_dim
    if hasattr(cfg, "qk_nope_head_dim") and hasattr(cfg, "qk_rope_head_dim"):
        head_dim = cfg.qk_nope_head_dim + cfg.qk_rope_head_dim
        head_dim_v = getattr(cfg, "v_head_dim", head_dim)
        return head_dim, head_dim_v

    # Standard models
    if hasattr(cfg, "head_dim"):
        return cfg.head_dim, cfg.head_dim
    if hasattr(cfg, "hidden_size") and hasattr(cfg, "num_attention_heads"):
        head_dim = cfg.hidden_size // cfg.num_attention_heads
        return head_dim, head_dim

    return None, None


def patch_flash_attn_4(model_config=None):
    """Patch _lazy_imports to redirect FA2 imports to FA4 if available on supported hardware."""
    if not torch.cuda.is_available():
        return

    major, _ = torch.cuda.get_device_capability()
    # Matches flash_attn/cute/interface.py: arch / 10 in [9, 10, 11]
    if major not in (9, 10, 11):
        return

    try:
        from flash_attn.cute import (  # noqa: F401
            flash_attn_func,
            flash_attn_varlen_func,
        )
    except ImportError:
        LOG.info(
            "Flash Attention 4 is available for your GPU and offers faster training speeds. "
            "To enable: pip install flash-attn-4"
        )
        return

    # Validate head dimensions against FA4's own constraints
    head_dim = None
    if model_config is not None:
        head_dim, head_dim_v = _get_head_dims(model_config)
        if head_dim is not None:
            try:
                from flash_attn.cute.interface import _validate_head_dims
            except ImportError:
                LOG.warning(
                    "Could not import _validate_head_dims from flash_attn.cute.interface, "
                    "unable to verify head dimension compatibility, falling back to FA2"
                )
                return

            # alignment = 16 // element_size; bf16/fp16 = 2 bytes -> alignment = 8
            alignment = 8
            try:
                _validate_head_dims(head_dim, head_dim_v, major, alignment)
            except AssertionError as exc:
                LOG.warning(
                    "Model head dimensions not supported by FA4, "
                    "falling back to FA2: %s",
                    exc,
                )
                return

    import transformers.modeling_flash_attention_utils as fa_utils

    if getattr(fa_utils._lazy_imports, "_axolotl_patched", False):
        return

    def _patched_lazy_imports(
        implementation, attention_wrapper=None, allow_all_kernels=False
    ):
        return (
            flash_attn_func,
            flash_attn_varlen_func,
            fa_utils._pad_input,
            fa_utils._unpad_input,
        )

    _patched_lazy_imports._axolotl_patched = True
    fa_utils._lazy_imports = _patched_lazy_imports
    LOG.info(
        "Flash Attention 4 enabled (head_dim=%s)",
        head_dim if model_config else "unknown",
    )


================================================
FILE: src/axolotl/monkeypatch/attention/flex_attn.py
================================================
"""Flex attention monkey patch"""

import sys

import torch
import transformers
from packaging import version
from transformers.utils.import_utils import _torch_version, is_torch_less_or_equal

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def patch_flex_wrapper(**flex_attn_compile_kwargs):
    # TODO remove this patch when transformers#37285 is merged and in a release
    is_torch_2_6 = torch.__version__.startswith("2.6")

    if not is_torch_2_6:
        return

    from torch.nn.attention.flex_attention import flex_attention

    class WrappedFlexAttention:
        """
        We are doing a singleton class so that flex attention is compiled once when it's first called.
        """

        _instance = None
        _is_flex_compiled = False
        _compiled_flex_attention = None

        def __new__(cls, *args, **kwargs):
            if cls._instance is None:
                # Create a new instance if one doesn't already exist
                cls._instance = super().__new__(cls)
            return cls._instance

        @classmethod
        def del_singleton(cls):
            cls._instance = None

        @torch.compiler.disable(recursive=False)
        def __init__(self, training):
            """
            Initialize or update the singleton instance.
            """
            self.training = None
            if not self._is_flex_compiled or training != self.training:
                self.training = training
                if is_torch_less_or_equal("2.5.1"):
                    self._compiled_flex_attention = torch.compile(
                        flex_attention, dynamic=False
                    )
                # In PyTorch 2.6.0, there's a known issue with flex attention compilation which may
                # cause errors. The suggested fix is to compile with "max-autotune-no-cudagraphs"
                # see https://github.com/pytorch/pytorch/issues/146260 for training
                elif version.parse(_torch_version).base_version == "2.6.0" and training:
                    self._compiled_flex_attention = torch.compile(
                        flex_attention, dynamic=False, mode="max-autotune-no-cudagraphs"
                    )
                # Fallback, usually the most recent torch 2.7.x+ versions
                else:
                    LOG.info(
                        "Compiling flex attention with kwargs: %s. This may take a while...",
                        flex_attn_compile_kwargs,
                    )
                    self._compiled_flex_attention = torch.compile(
                        flex_attention,
                        **flex_attn_compile_kwargs,
                    )
                    LOG.info("Flex attention compiled successfully.")

                self._is_flex_compiled = True

        def __call__(self):
            return self._compiled_flex_attention

    transformers.integrations.flex_attention.WrappedFlexAttention = WrappedFlexAttention
    sys.modules[
        "transformers.integrations.flex_attention"
    ].WrappedFlexAttention = WrappedFlexAttention


================================================
FILE: src/axolotl/monkeypatch/attention/sage_attn.py
================================================
"""
Monkeypatch for SageAttention for use with transformers.

https://github.com/thu-ml/SageAttention/
"""

import torch
from transformers.integrations.sdpa_attention import repeat_kv

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

sageattn = None  # pylint: disable=invalid-name
sageattn_varlen = None  # pylint: disable=invalid-name


def _is_sageattn_available():
    """Determine if SageAttention is available"""
    try:
        import sageattention  # noqa: F401 # pylint: disable=unused-import

        return True
    except ImportError:
        return False


if _is_sageattn_available():
    # import sageattn here if available
    from sageattention import sageattn, sageattn_varlen


def _check_sageattn_imported():
    """Check if SageAttention is imported. Raises an ImportError if not."""
    if sageattn is None:
        raise ImportError(
            "SageAttention is not installed. Please install it from source: "
            "`pip install git+https://github.com/thu-ml/SageAttention.git@1718ddc06dbc694bcf3c6b49ac28c1921aa2d8bd`"
        )


def sage_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: torch.Tensor | None = None,
    dropout: float = 0.0,
    scaling: float | None = None,
    is_causal: bool | None = None,
    **kwargs,
) -> tuple[torch.Tensor, None]:
    """
    Forward pass for SageAttention compatible with transformers attention interfaces.

    https://github.com/thu-ml/SageAttention/
    """

    _check_sageattn_imported()

    if kwargs.get("output_attentions", False) or kwargs.get("head_mask") is not None:
        raise NotImplementedError(
            "SageAttention does not support `output_attentions=True` or `head_mask`."
        )

    # The base sageattn API does not support dropout.
    if dropout > 0.0:
        raise NotImplementedError("SageAttention does not support dropout.")

    # Handle Grouped-Query Attention (GQA) and Multi-Query Attention (MQA)
    if hasattr(module, "num_key_value_groups"):
        key = repeat_kv(key, module.num_key_value_groups)
        value = repeat_kv(value, module.num_key_value_groups)

    # Calculate is_causal following transformers
    assert is_causal is not False, "is_causal must be True or None"
    is_causal = True

    position_ids = kwargs.get("position_ids", None)
    query_length = query.shape[2]

    cu_seqlens_q = kwargs.get("cu_seqlens_q", None)
    cu_seqlens_k = kwargs.get("cu_seqlens_k", None)
    max_length_q = kwargs.get("max_length_q", None)
    max_length_k = kwargs.get("max_length_k", None)

    # Sample packing uses position_ids, so we check for it first
    if position_ids is not None and (
        max_length_q is not None
        or (query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all())
    ):
        # transpose inputs to NHD layout for use with FA2 utils
        query = query.transpose(1, 2)
        key = key.transpose(1, 2)
        value = value.transpose(1, 2)

        batch_size = query.size(0)

        from transformers.modeling_flash_attention_utils import (
            prepare_fa2_from_position_ids,
        )

        if cu_seqlens_q is None or cu_seqlens_k is None:
            query, key, value, indices_q, cu_seq_lens, max_seq_lens = (
                prepare_fa2_from_position_ids(query, key, value, position_ids)
            )

            cu_seqlens_q, cu_seqlens_k = cu_seq_lens
            max_length_q, max_length_k = max_seq_lens

        else:
            query = query.reshape(-1, query.size(-2), query.size(-1))
            key = key.reshape(-1, key.size(-2), key.size(-1))
            value = value.reshape(-1, value.size(-2), value.size(-1))

        attn_output_unpad = sageattn_varlen(
            q=query,
            k=key,
            v=value,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_length_q,
            max_seqlen_k=max_length_k,
            is_causal=is_causal,
            sm_scale=scaling,
            smooth_k=False,  # reduces loss 0 / nan grad norms
            tensor_layout="NHD",
        )

        attn_output = attn_output_unpad.view(
            batch_size, -1, attn_output_unpad.size(-2), attn_output_unpad.size(-1)
        )

    elif attention_mask is not None:
        # NOTE: When used without `pad_to_sequence_len`, the loss becomes unstable after a few steps.

        assert attention_mask.ndim == 2, "Attention mask must be 2D"

        from transformers.modeling_flash_attention_utils import (
            _upad_input,
        )

        # transpose inputs to NHD layout for use with FA2 utils
        query = query.transpose(1, 2)
        key = key.transpose(1, 2)
        value = value.transpose(1, 2)

        batch_size = query.shape[0]

        query, key, value, indices_q, cu_seq_lens, max_seq_lens = _upad_input(
            query, key, value, attention_mask, query_length
        )
        cu_seqlens_q, cu_seqlens_k = cu_seq_lens
        max_seqlen_q, max_seqlen_k = max_seq_lens

        attn_output_unpad = sageattn_varlen(
            q=query,
            k=key,
            v=value,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_q,
            max_seqlen_k=max_seqlen_k,
            is_causal=is_causal,
            sm_scale=scaling,
            tensor_layout="NHD",
        )

        from flash_attn.bert_padding import pad_input

        attn_output = pad_input(attn_output_unpad, indices_q, batch_size, query_length)
    else:
        # Use standard sageattn
        # The input layout for transformers models is (batch_size, num_heads, seq_len, head_dim),
        # which corresponds to SageAttention's "HND" layout.
        attn_output = sageattn(
            q=query,
            k=key,
            v=value,
            tensor_layout="HND",
            is_causal=is_causal,
            sm_scale=scaling,
        )

        # SageAttention with "HND" returns (batch, heads, seq_len, head_dim)
        # Transformers expects (batch, seq_len, heads, head_dim) for the output
        # So we need to transpose dimensions 1 and 2
        attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, None


def patch_sageattn():
    """Patch SageAttention for use with transformers."""

    _check_sageattn_imported()

    from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

    # Replace flash attention with sage attention
    ALL_ATTENTION_FUNCTIONS.register("flash_attention_2", sage_attention_forward)

    # Note: New method after transformers refactor to use ALL_MASK_ATTENTION_FUNCTIONS
    # Register sage_attention with the global attention interface
    # ALL_ATTENTION_FUNCTIONS.register("sage_attention", sage_attention_forward)

    # from transformers.masking_utils import ALL_MASK_ATTENTION_FUNCTIONS, flash_attention_mask

    # ALL_MASK_ATTENTION_FUNCTIONS.register("sage_attention", flash_attention_mask)

    LOG.info("SageAttention patched successfully")


================================================
FILE: src/axolotl/monkeypatch/attention/xformers.py
================================================
"""
xformers attention implementation for packing
"""

from typing import Optional

import torch
import xformers
import xformers.ops.fmha
from transformers.modeling_flash_attention_utils import (
    _upad_input,
)

from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids

xformers_attention = xformers.ops.fmha.memory_efficient_attention


def xformers_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    dropout: float = 0.0,
    scaling: Optional[float] = None,
    sliding_window: Optional[int] = None,
    softcap: Optional[float] = None,
    cu_seq_lens_q: Optional[torch.LongTensor] = None,
    cu_seq_lens_k: Optional[torch.LongTensor] = None,
    max_length_q: Optional[int] = None,
    max_length_k: Optional[int] = None,
    **kwargs,
):
    # Get dimensions
    # query: [batch, heads, seq_len, hidden_dim]
    batch_size = query.size(0)
    query_length = query.shape[2]
    key_length = key.shape[2]

    # Default causal mask
    attn_bias = xformers.ops.LowerTriangularMask()

    # Check if we have sliding window attention
    has_sliding_window = sliding_window is not None and sliding_window < query_length

    # Transpose dimensions for xformers (Q: [b, h, s, d] -> [b, s, h, d])
    query = query.transpose(1, 2)
    key = key.transpose(1, 2)
    value = value.transpose(1, 2)

    # Get GQA parameters
    num_attention_heads = module.config.num_attention_heads
    num_key_value_heads = module.config.num_key_value_heads
    head_dim = query.size(-1)
    is_gqa = num_attention_heads != num_key_value_heads
    n_groups = num_attention_heads // num_key_value_heads if is_gqa else 1

    # If position_ids is provided and check all examples do not contain only 1 sequence, If tensor in increasing
    # then we probably have one sequence, otherwise it is packed. Additionally check we are in pre-fill/training stage.
    # Use `flash_attn_varlen_func` to prevent cross-example attention and also allow padding free approach
    if position_ids is not None and (
        max_length_q is not None
        or (query_length != 1 and not (torch.diff(position_ids, dim=-1) >= 0).all())
    ):
        if cu_seq_lens_q is None or cu_seq_lens_k is None:
            cu_seq_lens_q = get_cu_seqlens_from_pos_ids(position_ids)[0]
            cu_seq_lens_q = cu_seq_lens_q.squeeze()
            seq_lengths = cu_seq_lens_q[1:] - cu_seq_lens_q[:-1]
            attn_bias = (
                xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens(
                    q_seqlen=seq_lengths.tolist(),
                )
            )
        else:
            query = query.reshape(-1, query.size(-2), query.size(-1))
            key = key.reshape(-1, key.size(-2), key.size(-1))
            value = value.reshape(-1, value.size(-2), value.size(-1))

        # Handle GQA
        if is_gqa:
            key = key.repeat_interleave(n_groups, dim=2)
            value = value.repeat_interleave(n_groups, dim=2)

    elif attention_mask is not None:
        query, key, value, _, cu_seq_lens, _ = _upad_input(
            query, key, value, attention_mask, query_length
        )
        cu_seq_lens_q, cu_seq_lens_k = cu_seq_lens
        seq_lengths = []
        for i in range(len(cu_seq_lens_q) - 1):
            seq_lengths.append(cu_seq_lens_q[i + 1] - cu_seq_lens_q[i])
        attn_bias = xformers.ops.fmha.attn_bias.BlockDiagonalCausalMask.from_seqlens(
            q_seqlen=seq_lengths,
            kv_seqlen=seq_lengths,
        )

        # Handle GQA
        if is_gqa:
            key = key.repeat_interleave(n_groups, dim=2)
            value = value.repeat_interleave(n_groups, dim=2)
    else:
        # Handle Group Query Attention (GQA) using view/expand approach from reference
        key = key.view(batch_size, key_length, num_key_value_heads, 1, head_dim)
        value = value.view(batch_size, key_length, num_key_value_heads, 1, head_dim)
        key = key.expand(
            batch_size, key_length, num_key_value_heads, n_groups, head_dim
        )
        value = value.expand(
            batch_size, key_length, num_key_value_heads, n_groups, head_dim
        )

        if module.training:
            key = key.reshape(batch_size, key_length, num_attention_heads, head_dim)
            value = value.reshape(batch_size, key_length, num_attention_heads, head_dim)

            if has_sliding_window:
                query = query.view(
                    1, batch_size * query_length, num_attention_heads, head_dim
                )
                key = key.view(
                    1, batch_size * key_length, num_attention_heads, head_dim
                )
                value = value.view(
                    1, batch_size * key_length, num_attention_heads, head_dim
                )
        else:
            query = query.view(
                batch_size, query_length, num_key_value_heads, n_groups, head_dim
            )

            # If we need a sliding window attention
            if has_sliding_window:
                query = query.view(
                    1,
                    batch_size * query_length,
                    num_key_value_heads,
                    n_groups,
                    head_dim,
                )
                key = key.view(
                    1, batch_size * key_length, num_key_value_heads, n_groups, head_dim
                )
                value = value.view(
                    1, batch_size * key_length, num_key_value_heads, n_groups, head_dim
                )

    # Run the xformers attention
    attn_output = xformers_attention(
        query,
        key,
        value,
        attn_bias=attn_bias,
    )

    attn_output = attn_output.view(
        batch_size, -1, attn_output.size(-2), attn_output.size(-1)
    )
    return attn_output, None


================================================
FILE: src/axolotl/monkeypatch/btlm_attn_hijack_flash.py
================================================
"""
Flash attention monkey patch for cerebras btlm model
"""

import importlib
from typing import Optional, Tuple

import torch
from accelerate import init_empty_weights
from flash_attn.flash_attn_interface import flash_attn_func
from transformers import AutoConfig, AutoModelForCausalLM

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def replace_btlm_attn_with_flash_attn(model_name="cerebras/btlm-3b-8k-base"):
    # this is a wonky hack to get the remotely loaded module
    model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    # we need to load the model here in order for modeling_btlm to be available
    with init_empty_weights():
        AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
    module_name = model_config.__class__.__module__.replace(
        ".configuration_btlm", ".modeling_btlm"
    )
    modeling_btlm = importlib.import_module(module_name)
    modeling_btlm.BTLMAttention._attn = flashattn_attn


def flashattn_attn(
    self,
    query: torch.Tensor,
    key: Optional[torch.Tensor] = None,
    value: Optional[torch.Tensor] = None,
    attention_mask: Optional[torch.Tensor] = None,
    head_mask: Optional[torch.Tensor] = None,
    position_bias: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor]]:
    softmax_scale = (
        1 / (key.size(-1) ** self.attn_scale_power) if self.scale_attn_weights else None
    )

    query = query.permute(0, 2, 1, 3)
    key = key.permute(0, 2, 1, 3)
    value = value.permute(0, 2, 1, 3)

    # Perform Flash attention
    attn_output = flash_attn_func(
        query,
        key,
        value,
        dropout_p=0.0,  # Assuming you have this attribute
        softmax_scale=softmax_scale,  # Set this if you have specific scaling in mind
        causal=not self.is_cross_attention,  # Assuming you have this attribute
        return_attn_probs=False,  # Set this based on your needs
    )

    # Optional: Apply head mask if it's not None
    if head_mask is not None:
        attn_output *= head_mask

    attn_output = attn_output.permute(0, 2, 1, 3)

    return attn_output, None  # We don't have explicit attn_weights in Flash attention


================================================
FILE: src/axolotl/monkeypatch/data/__init__.py
================================================


================================================
FILE: src/axolotl/monkeypatch/data/batch_dataset_fetcher.py
================================================
"""Monkey patches for the dataset fetcher to handle batches of packed indexes."""

import torch
from torch.utils.data._utils.fetch import _BaseDatasetFetcher
from torch.utils.data._utils.worker import _worker_loop

_ORIGINAL_MAP_DATASET_FETCHER = None
_ORIGINAL_WORKER_LOOP = None
_IS_PATCHED = False


class _MapDatasetFetcher(_BaseDatasetFetcher):
    """
    Custom dataset fetcher that handles nested batch structures from
    MultipackBatchSampler.
    """

    def fetch(self, possibly_batched_index):
        if isinstance(possibly_batched_index[0], list):
            # Handle nested structure from MultipackBatchSampler
            data = [None for i in possibly_batched_index]
            for i, possibly_batched_index_ in enumerate(possibly_batched_index):
                if self.auto_collation:
                    if (
                        hasattr(self.dataset, "__getitems__")
                        and self.dataset.__getitems__
                    ):
                        data[i] = self.dataset.__getitems__(possibly_batched_index_)
                    else:
                        data[i] = [self.dataset[idx] for idx in possibly_batched_index_]
                else:
                    data[i] = self.dataset[possibly_batched_index_]
        else:
            # Standard batch handling
            if self.auto_collation:
                if hasattr(self.dataset, "__getitems__") and self.dataset.__getitems__:
                    data = self.dataset.__getitems__(possibly_batched_index)
                else:
                    data = [self.dataset[idx] for idx in possibly_batched_index]
            else:
                data = self.dataset[possibly_batched_index]
        return self.collate_fn(data)


def patch_fetchers():
    """Apply patches to PyTorch's DataLoader components."""
    torch.utils.data._utils.fetch._MapDatasetFetcher = _MapDatasetFetcher
    torch.utils.data.dataloader._utils.fetch._MapDatasetFetcher = _MapDatasetFetcher


def patched_worker_loop(*args, **kwargs):
    """Worker loop that ensures patches are applied in worker processes."""
    patch_fetchers()
    return _worker_loop(*args, **kwargs)


def apply_multipack_dataloader_patch():
    """
    This patch allows DataLoader to correctly process batches that contain multiple bins
    of packed sequences.
    """
    # pylint: disable=global-statement
    global _ORIGINAL_MAP_DATASET_FETCHER, _ORIGINAL_WORKER_LOOP, _IS_PATCHED

    if _IS_PATCHED:
        return

    # Store original implementations
    _ORIGINAL_MAP_DATASET_FETCHER = torch.utils.data._utils.fetch._MapDatasetFetcher
    _ORIGINAL_WORKER_LOOP = torch.utils.data._utils.worker._worker_loop

    # Apply patches
    patch_fetchers()
    torch.utils.data._utils.worker._worker_loop = patched_worker_loop

    _IS_PATCHED = True


def remove_multipack_dataloader_patch():
    """Remove the monkeypatch and restore original PyTorch DataLoader behavior."""
    # pylint: disable=global-statement
    global _IS_PATCHED

    if not _IS_PATCHED:
        return

    if _ORIGINAL_MAP_DATASET_FETCHER:
        torch.utils.data._utils.fetch._MapDatasetFetcher = _ORIGINAL_MAP_DATASET_FETCHER
        torch.utils.data.dataloader._utils.fetch._MapDatasetFetcher = (
            _ORIGINAL_MAP_DATASET_FETCHER
        )

    if _ORIGINAL_WORKER_LOOP:
        torch.utils.data._utils.worker._worker_loop = _ORIGINAL_WORKER_LOOP

    _IS_PATCHED = False


================================================
FILE: src/axolotl/monkeypatch/deepspeed_utils.py
================================================
import importlib
import importlib.util

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def patch_checkpoint_wrapper_setattr():
    """
    Patch CheckpointWrapper to properly forward DeepSpeed attributes to wrapped modules.

    This fixes the issue where CheckpointWrapper doesn't forward ds_* attributes
    (like ds_grads_remaining) to the actual wrapped module, causing DeepSpeed
    ZeRO-3 to fail when gradient checkpointing is enabled.

    This issue occurs specifically with:
    - QLoRA + DeepSpeed ZeRO-3
    - gradient_checkpointing: true
    - activation_offloading: true

    References:
    - https://github.com/deepspeedai/DeepSpeed/issues/7203
    - https://github.com/deepspeedai/DeepSpeed/blob/38d1a9eb64c9e01e32eccc50b25ba18925287441/deepspeed/runtime/zero/parameter_offload.py#L424-L458
    - https://github.com/axolotl-ai-cloud/axolotl/pull/3102
    """

    try:
        from torch.distributed.algorithms._checkpoint.checkpoint_wrapper import (
            CheckpointWrapper,
        )

        # Check if already patched
        if hasattr(CheckpointWrapper, "_axolotl_setattr_patched"):
            LOG.debug("CheckpointWrapper already patched")
            return

        original_setattr = CheckpointWrapper.__setattr__

        def new_setattr(self, name: str, value) -> None:
            if name.startswith("ds_") and hasattr(self, "_checkpoint_wrapped_module"):
                setattr(self._checkpoint_wrapped_module, name, value)
                LOG.debug(
                    f"Forwarded {name} to wrapped module {type(self._checkpoint_wrapped_module).__name__}"
                )
            else:
                original_setattr(self, name, value)

        CheckpointWrapper.__setattr__ = new_setattr
        CheckpointWrapper._axolotl_setattr_patched = True

        LOG.info("CheckpointWrapper patched to forward DeepSpeed attributes")

    except ImportError as e:
        LOG.debug(f"CheckpointWrapper not available: {e}")
    except Exception as e:
        LOG.warning(f"Failed to patch CheckpointWrapper: {e}")


def apply_deepspeed_patches():
    """
    Apply DeepSpeed-related patches
    """
    if importlib.util.find_spec("deepspeed") is not None:
        patch_checkpoint_wrapper_setattr()
    else:
        LOG.debug("DeepSpeed not available, skipping patches")


================================================
FILE: src/axolotl/monkeypatch/fsdp2_qlora.py
================================================
"""
Monkeypatch to add Params4bit and Int8Params support to FSDP2. This enables QLoRA + FSDP2
and 8-bit LoRA + FSDP2, as well as our LoRA / QLoRA Triton kernels to work with FSDP2.

This patch modifies the _init_sharded_param and init_unsharded_param methods in FSDPParam
to handle bitsandbytes Params4bit and Int8Params parameters, preserving their quantization
metadata through the FSDP2 shard/unshard cycle.
"""

import importlib
import inspect

from axolotl.monkeypatch.utils import detab_code
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def apply_init_sharded_param_patch():
    """Apply patch to FSDPParam._init_sharded_param to support Params4bit."""
    if getattr(apply_init_sharded_param_patch, "_axolotl_patched", False):
        return
    from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam

    # Get original source
    original_source = inspect.getsource(FSDPParam._init_sharded_param)
    original_source, _ = detab_code(original_source)

    # Define the replacement
    original_param_creation = """    self.sharded_param = nn.Parameter(self.to_sharded_dtensor(sharded_param))
    self.sharded_param.requires_grad_(param.requires_grad)"""

    patched_param_creation = """    import bitsandbytes as bnb
    if isinstance(param, bnb.nn.modules.Params4bit):
        self.sharded_param = bnb.nn.modules.Params4bit(
            data=sharded_param,
            requires_grad=param.requires_grad,
            quant_state=param.quant_state,
            blocksize=param.blocksize,
            compress_statistics=param.compress_statistics,
            quant_type=param.quant_type,
            quant_storage=param.quant_storage,
            module=param.module,
            bnb_quantized=param.bnb_quantized,
        )
        self.sharded_param = self.to_sharded_dtensor(self.sharded_param)
    elif isinstance(param, bnb.nn.modules.Int8Params):
        self.sharded_param = bnb.nn.modules.Int8Params(
            data=sharded_param,
            requires_grad=param.requires_grad,
            has_fp16_weights=param.has_fp16_weights,
            CB=None,
            SCB=param.SCB,
        )
        self.sharded_param = self.to_sharded_dtensor(self.sharded_param)
    else:
        self.sharded_param = nn.Parameter(
            self.to_sharded_dtensor(sharded_param),
            requires_grad=param.requires_grad,
        )"""

    # Apply the replacement
    if original_param_creation in original_source:
        patched_source = original_source.replace(
            original_param_creation, patched_param_creation
        )
        patched_source = patched_source.replace(
            "def _init_sharded_param(",
            "def patched_init_sharded_param(",
            1,
        )

        # Load necessary imports
        module_name = FSDPParam.__module__
        module = importlib.import_module(module_name)

        items_to_import = []
        for item in dir(module):
            if item in patched_source:
                items_to_import.append(item)

        exec(  # nosec B102
            f"from {module_name} import ({', '.join(items_to_import)})",
            globals(),
        )
        exec(patched_source, globals())  # nosec B102

        # Replace the method
        FSDPParam._init_sharded_param = patched_init_sharded_param
        apply_init_sharded_param_patch._axolotl_patched = True
        LOG.info("Successfully applied FSDP _init_sharded_param patch")
    else:
        LOG.warning("Could not find target code for _init_sharded_param patching")


def apply_init_unsharded_param_patch():
    """Apply patch to FSDPParam.init_unsharded_param to support Params4bit."""
    if getattr(apply_init_unsharded_param_patch, "_axolotl_patched", False):
        return
    from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam

    # Get original source
    original_source = inspect.getsource(FSDPParam.init_unsharded_param)
    original_source, _ = detab_code(original_source)

    # Define the replacement
    original_param_creation = """        self._unsharded_param = nn.Parameter(
            unsharded_param, requires_grad=self.sharded_param.requires_grad
        )"""

    patched_param_creation = """        import bitsandbytes as bnb
        local_tensor = self.sharded_param._local_tensor
        if isinstance(local_tensor, bnb.nn.modules.Params4bit):
            self._unsharded_param = bnb.nn.modules.Params4bit(
                data=unsharded_param,
                requires_grad=self.sharded_param.requires_grad,
                quant_state=local_tensor.quant_state,
                blocksize=local_tensor.blocksize,
                compress_statistics=local_tensor.compress_statistics,
                quant_type=local_tensor.quant_type,
                quant_storage=local_tensor.quant_storage,
                module=local_tensor.module,
                bnb_quantized=local_tensor.bnb_quantized,
            )
        elif isinstance(local_tensor, bnb.nn.modules.Int8Params):
            self._unsharded_param = bnb.nn.modules.Int8Params(
                data=unsharded_param,
                requires_grad=self.sharded_param.requires_grad,
                has_fp16_weights=local_tensor.has_fp16_weights,
                CB=unsharded_param,
                SCB=local_tensor.SCB,
            )
        else:
            self._unsharded_param = nn.Parameter(
                unsharded_param, requires_grad=self.sharded_param.requires_grad
            )"""

    # Apply the replacement
    if original_param_creation in original_source:
        patched_source = original_source.replace(
            original_param_creation, patched_param_creation
        )
        patched_source = patched_source.replace(
            "def init_unsharded_param(",
            "def patched_init_unsharded_param(",
            1,
        )

        # Load necessary imports
        module_name = FSDPParam.__module__
        module = importlib.import_module(module_name)

        items_to_import = []
        for item in dir(module):
            if item in patched_source:
                items_to_import.append(item)

        exec(  # nosec B102
            f"from {module_name} import ({', '.join(items_to_import)})",
            globals(),
        )
        exec(patched_source, globals())  # nosec B102

        # Replace the method
        FSDPParam.init_unsharded_param = patched_init_unsharded_param
        apply_init_unsharded_param_patch._axolotl_patched = True
        LOG.info("Successfully applied FSDP init_unsharded_param patch")
    else:
        LOG.warning("Could not find target code for patching")


def apply_linear8bitlt_save_patch():
    """Patch Linear8bitLt._save_to_state_dict to handle DTensor-wrapped Int8Params.

    After FSDP2 sharding, Linear8bitLt.weight is a DTensor wrapping Int8Params.
    BnB's _save_to_state_dict accesses self.weight.SCB directly, but DTensor
    doesn't proxy custom attribute access to its _local_tensor. This patch
    temporarily unwraps the DTensor during saving so BnB can find the SCB attribute.
    """
    if getattr(apply_linear8bitlt_save_patch, "_axolotl_patched", False):
        return
    import bitsandbytes as bnb
    from torch.distributed.tensor import DTensor

    original_save = bnb.nn.Linear8bitLt._save_to_state_dict

    def _patched_save_to_state_dict(self, destination, prefix, keep_vars):
        # Use _parameters dict directly to bypass nn.Module.__setattr__ type check.
        weight = self._parameters["weight"]
        unwrapped = False
        if isinstance(weight, DTensor) and hasattr(weight, "_local_tensor"):
            self._parameters["weight"] = weight._local_tensor
            unwrapped = True
        try:
            original_save(self, destination, prefix, keep_vars)
        finally:
            if unwrapped:
                self._parameters["weight"] = weight

    bnb.nn.Linear8bitLt._save_to_state_dict = _patched_save_to_state_dict
    apply_linear8bitlt_save_patch._axolotl_patched = True
    LOG.info("Patched Linear8bitLt._save_to_state_dict for DTensor compatibility")


def apply_init_dtype_attrs_patch():
    """Prevent FSDP2 mixed precision from casting non-float quantized params.

    When mixed precision is enabled (e.g., bf16), FSDP2's init_dtype_attrs sets
    param_dtype=bf16 for ALL params. During all-gather, _to_dtype_if_needed casts
    the sharded param to param_dtype. For non-float params (uint8 packed 4-bit,
    int8 quantized) without FSDP2 extensions, this destroys the quantized data.

    Params4bit handles this via fsdp_pre/post_all_gather extensions, but our
    parametrize-based expert quantization uses plain nn.Parameter(uint8/int8)
    without extensions.
    """
    if getattr(apply_init_dtype_attrs_patch, "_axolotl_patched", False):
        return
    from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam

    original_init_dtype_attrs = FSDPParam.init_dtype_attrs

    def patched_init_dtype_attrs(self, mp_policy):
        original_init_dtype_attrs(self, mp_policy)
        # Skip casting non-float quantized params (uint8/int8) without FSDP2
        # extensions — the parametrization chain handles dequantization.
        if self.param_dtype is not None and not self.sharded_param.is_floating_point():
            local = self.sharded_param
            if hasattr(local, "_local_tensor"):
                local = local._local_tensor
            if not hasattr(local, "fsdp_pre_all_gather"):
                self.param_dtype = None

    FSDPParam.init_dtype_attrs = patched_init_dtype_attrs
    apply_init_dtype_attrs_patch._axolotl_patched = True
    LOG.info("Patched FSDPParam.init_dtype_attrs for non-float quantized params")


================================================
FILE: src/axolotl/monkeypatch/gradient_checkpointing/__init__.py
================================================
"""custom checkpointing utils"""

import importlib
from functools import partial

from packaging import version

from axolotl.monkeypatch.gradient_checkpointing.offload_cpu import (  # noqa: F401
    CPU_Offloaded_Gradient_Checkpointer,
)
from axolotl.monkeypatch.gradient_checkpointing.offload_disk import (
    Disco,
)

transformers_version = version.parse(importlib.metadata.version("transformers"))
if transformers_version > version.parse("4.51.3"):
    from transformers.modeling_layers import GradientCheckpointingLayer

    def uses_gc_layers(decoder_layer):
        return isinstance(decoder_layer.func.__self__, GradientCheckpointingLayer)

else:

    def uses_gc_layers(_):
        return False


def hf_grad_checkpoint_offload_wrapper(decoder_layer, *args, use_reentrant=None):
    if uses_gc_layers(decoder_layer):
        return CPU_Offloaded_Gradient_Checkpointer.apply(
            decoder_layer,
            *args,
        )

    return CPU_Offloaded_Gradient_Checkpointer.apply(
        (
            decoder_layer.func.__self__
            if isinstance(decoder_layer, partial)
            else decoder_layer.__self__
        ),
        *args,
    )


def hf_grad_checkpoint_disk_offload_wrapper(decoder_layer, *args, use_reentrant=None):
    if uses_gc_layers(decoder_layer):
        return Disco.apply(
            decoder_layer,
            *args,
        )

    return Disco.apply(
        (
            decoder_layer.func.__self__
            if isinstance(decoder_layer, partial)
            else decoder_layer.__self__
        ),
        *args,
    )


================================================
FILE: src/axolotl/monkeypatch/gradient_checkpointing/offload_cpu.py
================================================
"""CPU offloaded checkpointing"""

# Copyright 2023-present Daniel Han-Chen & the Unsloth team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import inspect

import torch
from packaging import version
from torch.utils.checkpoint import (
    set_device_states,
)

# support different pytorch versions
has_device_type = "device_type" in inspect.signature(set_device_states).parameters

torch_version = version.parse(torch.__version__)

if torch_version < version.parse("2.4.0"):
    torch_cuda_amp_custom_fwd = torch.cuda.amp.custom_fwd
    torch_cuda_amp_custom_bwd = torch.cuda.amp.custom_bwd
else:
    torch_cuda_amp_custom_fwd = torch.amp.custom_fwd(device_type="cuda")
    torch_cuda_amp_custom_bwd = torch.amp.custom_bwd(device_type="cuda")


class CPU_Offloaded_Gradient_Checkpointer(torch.autograd.Function):
    """
    Saves VRAM by smartly offloading to RAM.
    Tiny hit to performance, since we mask the movement via non blocking calls.
    """

    @staticmethod
    @torch_cuda_amp_custom_fwd
    def forward(ctx, forward_function, hidden_states, *args):
        saved_hidden_states = hidden_states.to("cpu", non_blocking=True)
        with torch.no_grad():
            output = forward_function(hidden_states, *args)
        ctx.save_for_backward(saved_hidden_states)
        ctx.forward_function = forward_function
        ctx.args = args
        return output

    @staticmethod
    @torch_cuda_amp_custom_bwd
    def backward(ctx, dY):
        (hidden_states,) = ctx.saved_tensors
        hidden_states = hidden_states.to("cuda", non_blocking=True).detach()
        hidden_states.requires_grad = True
        with torch.enable_grad():
            output = ctx.forward_function(hidden_states, *ctx.args)
            # Newer HF models (e.g. Qwen3MoE) using GradientCheckpointingLayer
            # return a plain tensor, not a tuple.  Older models return tuples
            # like (hidden_states, present_kv, ...).  Unwrap if needed.
            if isinstance(output, (tuple, list)):
                (output,) = output
        torch.autograd.backward(output, dY)
        return (
            None,
            hidden_states.grad,
        ) + (None,) * len(ctx.args)


================================================
FILE: src/axolotl/monkeypatch/gradient_checkpointing/offload_disk.py
================================================
"""
DISCO - DIsk-based Storage and Checkpointing with Optimized prefetching
"""

# Copyright 2025 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import atexit
import concurrent.futures
import os
import queue
import shutil
import tempfile
import threading
import time
import uuid
from collections import deque
from concurrent.futures import Future
from typing import Dict

import torch

from axolotl.utils.logging import get_logger

torch_cuda_amp_custom_fwd = torch.amp.custom_fwd(device_type="cuda")
torch_cuda_amp_custom_bwd = torch.amp.custom_bwd(device_type="cuda")

# Setup logger
logger = get_logger(__name__)


class DiskOffloadManager:
    """
    Manages offloaded tensors and handles prefetching in a separate thread.
    Includes synchronization to prevent race conditions.
    """

    def __init__(
        self,
        prefetch_size: int = 3,
        prefetch_to_gpu: bool = True,
        save_workers: int = 4,
    ):
        """
        Args:
            prefetch_size: Maximum number of tensors to prefetch in the background.
            prefetch_to_gpu: Whether to prefetch tensors directly to GPU memory.
            save_workers: Maximum number of concurrent save operations.
        """
        self.temp_dir = tempfile.mkdtemp(prefix="disco_")

        # Track tensor paths and their status
        self.tensor_paths: deque = deque()  # Ordered history of tensor paths (LIFO)
        self.file_locks: Dict[
            str, threading.Lock
        ] = {}  # Maps file_path -> threading.Lock()
        # Maps file_path -> status ("saving", "ready", "prefetching", "loaded", "deleted")
        self.file_status: Dict[str, str] = {}

        self.max_prefetch = prefetch_size
        self.prefetch_to_gpu = prefetch_to_gpu

        # Thread synchronization
        self.manager_lock = threading.RLock()  # Used for thread-safe operations

        # Prefetch queue and cache
        self.prefetch_queue: queue.Queue = queue.Queue()
        self.prefetch_cache: Dict[str, torch.Tensor] = {}  # Maps file_path -> tensor

        # Save queue and thread pool
        self.save_queue: queue.Queue = queue.Queue()
        self.save_pool = concurrent.futures.ThreadPoolExecutor(max_workers=save_workers)
        self.save_futures: Dict[str, Future] = {}
        self.save_semaphore = threading.Semaphore(
            save_workers * 2
        )  # Limit concurrent save operations

        # Start prefetch worker thread
        self.stop_event = threading.Event()
        # start multiple threads for prefetching
        self.prefetch_worker_count = 2
        self.prefetch_workers = []
        for _ in range(self.prefetch_worker_count):
            worker = threading.Thread(target=self._prefetch_worker, daemon=True)
            worker.start()
            self.prefetch_workers.append(worker)

        # Start save worker thread
        self.save_worker = threading.Thread(target=self._save_worker, daemon=True)
        self.save_worker.start()
        self.idx = 0

        atexit.register(self.cleanup)

    def _save_worker(self):
        """Background thread that processes the save queue"""
        while not self.stop_event.is_set():
            try:
                save_item = self.save_queue.get(timeout=0.5)
                if save_item is None:
                    continue

                tensor, file_path = save_item

                # Submit the save task to the thread pool
                future = self.save_pool.submit(
                    self._save_tensor_to_disk, tensor, file_path
                )
                with self.manager_lock:
                    self.save_futures[file_path] = future

                self.save_queue.task_done()

            except queue.Empty:
                time.sleep(0.01)  # Small sleep to prevent CPU spinning
                continue

    def _save_tensor_to_disk(self, tensor: torch.Tensor, file_path: str):
        """Actually save the tensor to disk"""
        try:
            # Save tensor to disk
            cpu_tensor = tensor.detach().cpu()
            torch.save(cpu_tensor, file_path)
            del cpu_tensor

            with self.manager_lock:
                # Mark file as ready
                self.file_status[file_path] = "ready"

            # Release semaphore
            self.save_semaphore.release()

            return True
        except FileNotFoundError as e:
            logger.error(f"Error saving tensor to {file_path}: {e}")
            with self.manager_lock:
                self.file_status[file_path] = "error"

            # Release semaphore
            self.save_semaphore.release()

            return False

    def _prefetch_worker(self):
        """Background thread that loads tensors from disk ahead of time"""
        while not self.stop_event.is_set():
            try:
                file_path = self.prefetch_queue.get(timeout=0.5)
                if file_path is None:
                    continue

                # Check if file is available and not already in cache
                with self.manager_lock:
                    if (
                        file_path not in self.file_status
                        or self.file_status[file_path] == "deleted"
                    ):
                        self.prefetch_queue.task_done()
                    if file_path in self.prefetch_cache:
                        self.prefetch_queue.task_done()
                        continue

                    # If file is still being saved, wait for it
                    if (
                        self.file_status[file_path] == "saving"
                        and file_path in self.save_futures
                    ):
                        # Re-queue this prefetch request with a little delay
                        self.prefetch_queue.task_done()
                        time.sleep(0.1)
                        self.prefetch_queue.put(file_path)
                        continue

                    # Mark file as being prefetched
                    self.file_status[file_path] = "prefetching"

                # Load tensor from disk and store in cache
                try:
                    if os.path.exists(file_path):
                        if self.prefetch_to_gpu:
                            tensor = torch.load(
                                file_path,
                                map_location=torch.device("cuda"),
                                weights_only=True,
                            )
                        else:
                            tensor = torch.load(file_path, weights_only=True)

                        with self.manager_lock:
                            self.prefetch_cache[file_path] = tensor
                            self.file_status[file_path] = "ready"
                    else:
                        with self.manager_lock:
                            if self.file_status.get(file_path) != "deleted":
                                logger.warning(
                                    f"Prefetch error: File not found {file_path}"
                                )
                                self.file_status[file_path] = "missing"

                except FileNotFoundError as e:
                    with self.manager_lock:
                        if self.file_status.get(file_path) != "deleted":
                            logger.warning(f"Prefetch error for {file_path}: {e}")
                            self.file_status[file_path] = "error"

                self.prefetch_queue.task_done()

            except queue.Empty:
                time.sleep(0.01)  # Small sleep to prevent CPU spinning
                continue

    def save_tensor(self, tensor: torch.Tensor):
        """Save tensor to disk asynchronously and return file path with thread-safe operations"""
        # Generate unique file path
        self.idx += 1
        file_path: str = os.path.join(
            self.temp_dir, f"{self.idx:06d}-{uuid.uuid4()}.pt"
        )

        with self.manager_lock:
            # Mark file as being saved
            self.file_locks[file_path] = threading.Lock()
            self.file_status[file_path] = "saving"
            # Add to history
            self.tensor_paths.append(file_path)

        # Acquire semaphore to limit concurrent save operations
        self.save_semaphore.acquire()
        # Queue tensor for saving in background
        self.save_queue.put((tensor.detach(), file_path))

        return file_path

    def wait_for_save(self, file_path, timeout=None) -> None:
        """Wait for a tensor to be saved to disk"""
        start_time = time.time()
        while timeout is None or time.time() - start_time < timeout:
            with self.manager_lock:
                if self.file_status.get(file_path) == "ready":
                    return
                if self.file_status.get(file_path) in ["error", "missing", "deleted"]:
                    return

                if file_path in self.save_futures:
                    future = self.save_futures[file_path]
                    if future.done():
                        return

            # Small sleep to prevent CPU spinning
            time.sleep(0.01)

        # Timeout
        logger.warning(f"Timeout waiting for tensor to be saved: {file_path}")
        return

    def load_tensor(self, file_path, target_device="cuda"):
        """Load tensor from disk or prefetch cache with proper synchronization"""
        # Wait for tensor to be saved if it's still in progress
        self.wait_for_save(file_path)

        tensor = None

        # Try to get from cache first
        with self.manager_lock:
            # Check if tensor is already in cache
            if file_path in self.prefetch_cache:
                tensor = self.prefetch_cache[file_path]
                del self.prefetch_cache[file_path]
                self.file_status[file_path] = "loaded"

        if tensor is not None:
            # Ensure tensor is on correct device
            if target_device != "cpu" and tensor.device.type == "cpu":
                tensor = tensor.to(target_device, non_blocking=True)
            return tensor

        # If not in cache, load directly from disk
        try:
            if not os.path.exists(file_path):
                logger.error(f"File not found for loading: {file_path}")
                raise FileNotFoundError(f"File not found: {file_path}")

            tensor = torch.load(file_path, weights_only=True)

            with self.manager_lock:
                self.file_status[file_path] = "loaded"

            if target_device != "cpu":
                tensor = tensor.to(target_device, non_blocking=True)

            return tensor

        except Exception as e:
            logger.error(f"Error loading tensor from {file_path}: {e}")
            raise

    def _safe_delete_file(self, file_path):
        """Safely delete a file with proper synchronization"""
        with self.manager_lock:
            # Make sure any save operation is completed
            if file_path in self.save_futures:
                future = self.save_futures[file_path]
                try:
                    if not future.done():
                        future.cancel()
                    del self.save_futures[file_path]
                except FileNotFoundError as e:
                    logger.warning(
                        f"Error canceling save operation for {file_path}: {e}"
                    )

            # Only delete if file exists and is not being prefetched
            status = self.file_status.get(file_path)
            if status in ["ready", "loaded", "error", "missing"]:
                try:
                    if os.path.exists(file_path):
                        os.remove(file_path)
                    self.file_status[file_path] = "deleted"
                    return True
                except FileNotFoundError as e:
                    logger.warning(f"Error deleting file {file_path}: {e}")
            return False

    def trigger_prefetch(self, n=None):
        """Trigger prefetching of the next N tensors with proper synchronization"""
        if n is None:
            n = self.max_prefetch

        prefetch_paths = []
        with self.manager_lock:
            # Find files that are ready to be prefetched (not already in cache or being prefetched)
            for path in reversed(self.tensor_paths):
                if (
                    path not in self.prefetch_cache
                    and self.file_status.get(path) == "ready"
                ):
                    prefetch_paths.append(path)
                    if len(prefetch_paths) >= n:
                        break

        # Queue files for prefetching
        for path in prefetch_paths:
            self.prefetch_queue.put(path)

    def cleanup_tensor(self, file_path: str):
        """Clean up a specific tensor file after it's been used"""
        with self.manager_lock:
            if file_path in self.tensor_paths:
                self.tensor_paths.remove(file_path)

            # Remove from prefetch cache if present
            if file_path in self.prefetch_cache:
                del self.prefetch_cache[file_path]

            # Remove from save futures if present
            if file_path in self.save_futures:
                future = self.save_futures[file_path]
                if not future.done():
                    future.cancel()
                del self.save_futures[file_path]

        # Try to delete the file
        self._safe_delete_file(file_path)

    def cleanup(self):
        """Clean up all temp files and stop prefetch thread with proper synchronization"""
        self.stop_event.set()

        # Cancel all pending save operations
        with self.manager_lock:
            for _, future in self.save_futures.items():
                if not future.done():
                    future.cancel()
            self.save_futures.clear()

        # Drain the save queue
        while not self.save_queue.empty():
            try:
                self.save_queue.get_nowait()
                self.save_queue.task_done()
            except queue.Empty:
                break

        # Shutdown the save pool
        self.save_pool.shutdown(wait=False)

        # Join the save worker thread
        if self.save_worker.is_alive():
            self.save_worker.join(timeout=2.0)

        # Join the prefetch worker threads
        for thread in self.prefetch_workers:
            if thread.is_alive():
                thread.join(timeout=2.0)

        # Clear cache and remove all temporary files
        with self.manager_lock:
            self.prefetch_cache.clear()
            paths_to_delete = list(self.tensor_paths)
            self.tensor_paths.clear()

        # Delete all temporary files
        for path in paths_to_delete:
            self._safe_delete_file(path)

        # Remove temp directory
        try:
            if os.path.exists(self.temp_dir):
                shutil.rmtree(self.temp_dir, ignore_errors=True)
        except FileNotFoundError as e:
            logger.warning(f"Error removing temporary directory {self.temp_dir}: {e}")


class Disco(torch.autograd.Function):
    """
    Disco: DIsk-based Storage and Checkpointing with Optimized prefetching
    Advanced disk-based gradient checkpointer with prefetching.
    """

    # Shared manager instance across all checkpointing operations
    _manager = None

    @staticmethod
    def get_instance(prefetch_size=1, prefetch_to_gpu=True, save_workers=4):
        """Get or create the offload manager"""
        if Disco._manager is None:
            Disco._manager = DiskOffloadManager(
                prefetch_size=prefetch_size,
                prefetch_to_gpu=prefetch_to_gpu,
                save_workers=save_workers,
            )
        return Disco._manager

    @staticmethod
    @torch_cuda_amp_custom_fwd
    def forward(
        ctx,
        forward_function,
        hidden_states,
        *args,
        prefetch_size=1,
        prefetch_to_gpu=True,
        save_workers=4,
    ):
        """Forward pass that offloads activations to disk asynchronously"""
        # Get or create the manager
        manager = Disco.get_instance(
            prefetch_size=prefetch_size,
            prefetch_to_gpu=prefetch_to_gpu,
            save_workers=save_workers,
        )

        # Save tensor to disk asynchronously
        file_path = manager.save_tensor(hidden_states)

        # Run forward pass immediately without waiting for save to complete
        with torch.no_grad():
            output = forward_function(hidden_states, *args)

        # Store what we need for backward
        ctx.save_for_backward(torch.tensor([0]))  # Dummy tensor
        ctx.file_path = file_path
        ctx.forward_function = forward_function
        ctx.args = args

        return output

    @staticmethod
    @torch_cuda_amp_custom_bwd
    def backward(ctx, *grad_outputs):
        """Backward pass that loads activations from disk with prefetching"""
        # Get the manager
        manager = Disco._manager

        # Trigger prefetching for future tensors
        # This happens at the start of backward, so should have time to complete
        manager.trigger_prefetch()

        # Load hidden states from disk or prefetch cache
        file_path = ctx.file_path
        try:
            # Ensure the file is saved before we try to load it
            manager.wait_for_save(file_path)

            hidden_states = manager.load_tensor(file_path)
            hidden_states.requires_grad = True

            # Compute gradients
            with torch.enable_grad():
                output = ctx.forward_function(hidden_states, *ctx.args)

                # Handle tuple outputs properly
                if isinstance(output, tuple):
                    if len(grad_outputs) == len(output):
                        torch.autograd.backward(output, grad_outputs)
                    else:
                        torch.autograd.backward(output, grad_outputs[0])
                else:
                    torch.autograd.backward(output, grad_outputs[0])

            # Clean up the file after we're done with it
            manager.cleanup_tensor(file_path)

            return (
                (
                    None,  # forward_function
                    hidden_states.grad,  # hidden_states grad
                )
                + (None,) * len(ctx.args)  # for each arg
                + (
                    None,  # prefetch_size
                    None,  # prefetch_to_gpu
                    None,  # save_workers
                )
            )

        except Exception as e:
            logger.error(f"Error in backward pass: {e}")
            # Clean up the file even on error
            manager.cleanup_tensor(file_path)
            raise


================================================
FILE: src/axolotl/monkeypatch/llama_attn_hijack_flash.py
================================================
"""Flash attention monkey patch for llama model"""

# copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/llama_flash_attn_monkey_patch.py

import importlib.util
import warnings
from typing import Optional, Tuple

import torch
import transformers
from einops import rearrange
from flash_attn.bert_padding import pad_input, unpad_input
from transformers.models.llama.modeling_llama import (
    LlamaMLP,
    apply_rotary_pos_emb,
    repeat_kv,
)

from axolotl.monkeypatch.utils import set_module_name
from axolotl.utils.logging import get_logger

try:
    from flash_attn.flash_attn_interface import (
        flash_attn_varlen_qkvpacked_func,
    )
except ImportError:
    from flash_attn.flash_attn_interface import (
        flash_attn_unpadded_qkvpacked_func as flash_attn_varlen_qkvpacked_func,
    )


LOG = get_logger(__name__)


def is_xformers_available() -> bool:
    return importlib.util.find_spec("xformers") is not None


def is_xformers_swiglu_available() -> bool:
    if not is_xformers_available():
        return False

    from xformers.ops.common import get_xformers_operator

    try:
        get_xformers_operator("swiglu_packedw")()
        return True
    except RuntimeError as exc:
        if "No such operator xformers::swiglu_packedw " in str(exc):
            return False
        return True


def replace_llama_mlp_with_swiglu(model):
    if is_xformers_swiglu_available():
        from axolotl.monkeypatch.xformers_ import FusedMLP
    else:
        raise RuntimeError("xformers SwiGLU not available for this environment")

    for name, module in model.named_modules():
        if isinstance(module, LlamaMLP):
            mlp = FusedMLP(
                module.config, module.gate_proj, module.up_proj, module.down_proj
            )
            set_module_name(model, name, mlp)


def patch_fa_llama_cross_entropy():
    LOG.info(
        "patching transformers.loss.loss_utils.fixed_cross_entropy with flash_attn.ops.triton.cross_entropy"
    )
    from flash_attn.ops.triton.cross_entropy import (
        cross_entropy_loss as flash_attn_cross_entropy_loss,
    )

    def fa2_fixed_cross_entropy(
        source,
        target,
        num_items_in_batch: int = None,
        ignore_index: int = -100,
        **kwargs,
    ):
        reduction = "sum" if num_items_in_batch is not None else "mean"
        loss, _ = flash_attn_cross_entropy_loss(
            source, target, ignore_index=ignore_index
        )
        if reduction == "sum":
            loss = loss.sum() / num_items_in_batch
        else:
            loss = loss.sum() / (target != ignore_index).sum()
        return loss

    transformers.loss.loss_utils.fixed_cross_entropy = fa2_fixed_cross_entropy


def patch_llama_rms_norm():
    try:
        from flash_attn.ops.rms_norm import RMSNorm

        class LlamaRMSNorm(RMSNorm):
            """Patched LLamaRMSNorm"""

            def __init__(self, hidden_size, eps=1e-6):
                super().__init__(hidden_size, eps=eps)

        LOG.info("patching with flash_attn.ops.rms_norm")
        transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm
    except ImportError:
        LOG.warning(
            "optimized flash-attention RMSNorm not found (run `pip install 'git+https://github.com/Dao-AILab/flash-attention.git#egg=dropout_layer_norm&subdirectory=csrc/layer_norm'`)"
        )


def replace_llama_attn_with_flash_attn(
    cross_entropy: Optional[bool] = False,
    rms_norm: Optional[bool] = False,
    use_shifted_sparse_attn: Optional[bool] = False,
):
    transformers.models.llama.modeling_llama.LlamaModel._prepare_decoder_attention_mask = _prepare_decoder_attention_mask
    if use_shifted_sparse_attn:
        transformers.models.llama.modeling_llama.LlamaAttention.forward = (
            flashattn_forward_with_s2attn
        )

    # skip only if explicitly disabled
    if cross_entropy:
        patch_fa_llama_cross_entropy()

    # skip only if explicitly disabled
    if rms_norm:
        patch_llama_rms_norm()


# Disable the transformation of the attention mask in LlamaModel as the flash attention
# requires the attention mask to be the same as the key_padding_mask
def _prepare_decoder_attention_mask(
    self,
    attention_mask,
    input_shape,
    inputs_embeds,
    past_key_values_length,
):
    # [bsz, seq_len]
    return attention_mask


GROUP_SIZE_RATIO = 1 / 4


def flashattn_forward_with_s2attn(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.Tensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    padding_mask: Optional[torch.LongTensor] = None,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    """Input shape: Batch x Time x Channel

    From: https://github.com/dvlab-research/LongLoRA/blob/main/llama_attn_replace.py

    attention_mask: [bsz, q_len]

    `cu_seqlens` will be ignored if provided
    `max_seqlen` will be ignored if provided
    """
    if output_attentions:
        warnings.warn(
            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead.",
            stacklevel=2,
        )

    bsz, q_len, _ = hidden_states.size()

    query_states = (
        self.q_proj(hidden_states)
        .view(bsz, q_len, self.num_heads, self.head_dim)
        .transpose(1, 2)
    )
    key_states = (
        self.k_proj(hidden_states)
        .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
        .transpose(1, 2)
    )
    value_states = (
        self.v_proj(hidden_states)
        .view(bsz, q_len, self.num_key_value_heads, self.head_dim)
        .transpose(1, 2)
    )
    # [bsz, q_len, nh, hd]
    # [bsz, nh, q_len, hd]

    cos, sin = self.rotary_emb(value_states, position_ids=position_ids)
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )

    # Past Key value support
    if past_key_value is not None:
        # reuse k, v, self_attention
        key_states = torch.cat([past_key_value[0], key_states], dim=2)
        value_states = torch.cat([past_key_value[1], value_states], dim=2)

    past_key_value = (key_states, value_states) if use_cache else None

    # repeat k/v heads if n_kv_heads < n_heads
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    # Flash attention codes from
    # https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/flash_attention.py

    # transform the data into the format required by flash attention
    qkv = torch.stack(
        [query_states, key_states, value_states], dim=2
    )  # [bsz, nh, 3, q_len, hd]
    qkv = qkv.transpose(1, 3)  # [bsz, q_len, 3, nh, hd]

    # We have disabled _prepare_decoder_attention_mask in LlamaModel
    # the attention_mask should be the same as the key_padding_mask

    key_padding_mask = attention_mask.repeat(2, 1)
    nheads = qkv.shape[-2]
    # shift

    group_size = int(q_len * GROUP_SIZE_RATIO)
    if q_len % group_size > 0:
        raise ValueError(
            f"q_len {q_len} should be divisible by group size {group_size}."
        )

    qkv = (
        qkv.reshape(bsz, q_len, 3, 2, self.num_heads // 2, self.head_dim)
        .permute(0, 3, 1, 2, 4, 5)
        .reshape(bsz * 2, q_len, 3, self.num_heads // 2, self.head_dim)
    )
    x = rearrange(qkv, "b s three h d -> b s (three h d)")
    x_unpad, indices, cu_q_lens, max_s = unpad_input(x, key_padding_mask)
    cu_q_len_tmp = torch.arange(
        0, max_s, group_size, device=key_padding_mask.device, dtype=cu_q_lens.dtype
    )
    cu_q_len_tmp = torch.stack([cu_q_len_tmp, cu_q_len_tmp + group_size // 2]).repeat(
        bsz, 1
    ) + cu_q_lens[:-1].unsqueeze(-1)
    cu_q_lens = torch.cat([cu_q_len_tmp, cu_q_lens[1:].unsqueeze(-1)], dim=-1).view(-1)

    x_unpad = rearrange(
        x_unpad, "nnz (three h d) -> nnz three h d", three=3, h=nheads // 2
    )
    output_unpad = flash_attn_varlen_qkvpacked_func(
        x_unpad, cu_q_lens, group_size, 0.0, softmax_scale=None, causal=True
    )
    output = rearrange(
        pad_input(
            rearrange(output_unpad, "nnz h d -> nnz (h d)"), indices, bsz * 2, q_len
        ),
        "b s (h d) -> b s h d",
        h=nheads // 2,
    )
    output = (
        output.reshape(bsz, 2, q_len, nheads // 2, self.head_dim)
        .transpose(1, 2)
        .reshape(bsz, q_len, nheads, self.head_dim)
    )
    return self.o_proj(rearrange(output, "b s h d -> b s (h d)")), None, past_key_value


================================================
FILE: src/axolotl/monkeypatch/llama_attn_hijack_xformers.py
================================================
"""
Directly copied the code from https://raw.githubusercontent.com/oobabooga/text-generation-webui/main/modules/llama_attn_hijack.py and made some adjustments
"""

import warnings
from typing import Optional, Tuple

import torch
import torch.nn.functional as F
import transformers.models.llama.modeling_llama
from transformers.models.llama.modeling_llama import apply_rotary_pos_emb, repeat_kv

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

try:
    import xformers.ops
except ImportError:
    LOG.error("xformers not found! Please install it before trying to use it.")


def hijack_llama_attention():
    transformers.models.llama.modeling_llama.LlamaAttention.forward = xformers_forward


def xformers_forward(
    self,
    hidden_states: torch.Tensor,
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: bool = False,
    use_cache: bool = False,
    padding_mask: Optional[torch.LongTensor] = None,
    **kwargs,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    bsz, q_len, _ = hidden_states.size()

    if not hasattr(self, "pretraining_tp"):
        self.pretraining_tp = 1

    if self.pretraining_tp > 1:
        key_value_slicing = (
            self.num_key_value_heads * self.head_dim
        ) // self.pretraining_tp
        query_slices = self.q_proj.weight.split(
            (self.num_heads * self.head_dim) // self.pretraining_tp, dim=0
        )
        key_slices = self.k_proj.weight.split(key_value_slicing, dim=0)
        value_slices = self.v_proj.weight.split(key_value_slicing, dim=0)

        query_states = [
            F.linear(hidden_states, query_slices[i]) for i in range(self.pretraining_tp)
        ]
        query_states = torch.cat(query_states, dim=-1)

        key_states = [
            F.linear(hidden_states, key_slices[i]) for i in range(self.pretraining_tp)
        ]
        key_states = torch.cat(key_states, dim=-1)

        value_states = [
            F.linear(hidden_states, value_slices[i]) for i in range(self.pretraining_tp)
        ]
        value_states = torch.cat(value_states, dim=-1)

    else:
        query_states = self.q_proj(hidden_states)
        key_states = self.k_proj(hidden_states)
        value_states = self.v_proj(hidden_states)

    query_states = query_states.view(
        bsz, q_len, self.num_heads, self.head_dim
    ).transpose(1, 2)
    key_states = key_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)
    value_states = value_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)
    # [bsz, q_len, nh, hd]
    # [bsz, nh, q_len, hd]

    cos, sin = self.rotary_emb(value_states)
    query_states, key_states = apply_rotary_pos_emb(
        query_states, key_states, cos, sin, position_ids
    )
    # [bsz, nh, t, hd]

    if past_key_value is not None:
        # reuse k, v, self_attention
        key_states = torch.cat([past_key_value[0], key_states], dim=2)
        value_states = torch.cat([past_key_value[1], value_states], dim=2)

    past_key_value = (key_states, value_states) if use_cache else None

    # repeat k/v heads if n_kv_heads < n_heads
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    if output_attentions:
        warnings.warn(
            "Output attentions is not supported for patched `LlamaAttention`, returning `None` instead.",
            stacklevel=2,
        )

    #
    # xformers-attn start
    #

    query_states = query_states.transpose(1, 2)
    key_states = key_states.transpose(1, 2)
    value_states = value_states.transpose(1, 2)

    # This is a nasty hack. We know attention_mask in transformers is either LowerTriangular or all Zeros.
    # We therefore check if one element in the upper triangular portion is zero. If it is, then the mask is all zeros.
    if attention_mask is None or attention_mask[0, 0, 0, 1] == 0:
        # input and output should be of form (bsz, q_len, num_heads, head_dim)
        attn_output = xformers.ops.memory_efficient_attention(
            query_states, key_states, value_states, attn_bias=None
        )
    else:
        # input and output should be of form (bsz, q_len, num_heads, head_dim)
        attn_output = xformers.ops.memory_efficient_attention(
            query_states,
            key_states,
            value_states,
            # attn_bias=attention_mask,
            attn_bias=xformers.ops.LowerTriangularMask(),
        )

    if attn_output.size() != (bsz, q_len, self.num_heads, self.head_dim):
        raise ValueError(
            f"`attn_output` should be of size {(bsz, q_len, self.num_heads, self.head_dim)}, but is"
            f" {attn_output.size()}"
        )
    attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

    #
    # xformers-attn end
    #

    if self.pretraining_tp > 1:
        attn_output = attn_output.split(self.hidden_size // self.pretraining_tp, dim=2)
        o_proj_slices = self.o_proj.weight.split(
            self.hidden_size // self.pretraining_tp, dim=1
        )
        attn_output = sum(
            F.linear(attn_output[i], o_proj_slices[i])
            for i in range(self.pretraining_tp)
        )
    else:
        attn_output = self.o_proj(attn_output)

    return attn_output, None, past_key_value


================================================
FILE: src/axolotl/monkeypatch/lora_kernels.py
================================================
"""Module for patching custom LoRA Triton kernels and `torch.autograd` functions."""

import importlib
import inspect
import logging
import types
from typing import Generator, Tuple, Type

import torch
from peft import PeftModelForCausalLM
from torch import nn
from transformers import AutoConfig

from axolotl.kernels.lora import (
    apply_lora_mlp_geglu,
    apply_lora_mlp_swiglu,
    apply_lora_o,
    apply_lora_qkv,
)
from axolotl.monkeypatch.utils import detab_code
from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

QKV_PATCHES = [
    (
        """
    query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
    key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
    value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
""".lstrip("\n"),
        """
    query_states, key_states, value_states = self.apply_qkv(hidden_states)
    query_states = query_states.view(hidden_shape).transpose(1, 2)
    key_states = key_states.view(hidden_shape).transpose(1, 2)
    value_states = value_states.view(hidden_shape).transpose(1, 2)
""".lstrip("\n"),
    ),
    (
        """
    query_states = self.q_norm(self.q_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
    key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
    value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
""".lstrip("\n"),
        """
    query_states, key_states, value_states = self.apply_qkv(hidden_states)
    query_states = self.q_norm(query_states.view(hidden_shape)).transpose(1, 2)
    key_states = self.k_norm(key_states.view(hidden_shape)).transpose(1, 2)
    value_states = value_states.view(hidden_shape).transpose(1, 2)
""".lstrip("\n"),
    ),
    (
        """
    query_states, gate = torch.chunk(
        self.q_proj(hidden_states).view(*input_shape, -1, self.head_dim * 2), 2, dim=-1
    )
    gate = gate.reshape(*input_shape, -1)

    query_states = self.q_norm(query_states.view(hidden_shape)).transpose(1, 2)
    key_states = self.k_norm(self.k_proj(hidden_states).view(hidden_shape)).transpose(1, 2)
    value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
""".lstrip("\n"),
        """
    query_states, key_states, value_states = self.apply_qkv(hidden_states)
    query_states, gate = torch.chunk(
        query_states.view(*input_shape, -1, self.head_dim * 2), 2, dim=-1
    )
    gate = gate.reshape(*input_shape, -1)

    query_states = self.q_norm(query_states.view(hidden_shape)).transpose(1, 2)
    key_states = self.k_norm(key_states.view(hidden_shape)).transpose(1, 2)
    value_states = value_states.view(hidden_shape).transpose(1, 2)
""".lstrip("\n"),
    ),
]

ORIGINAL_O_CODE = """
    attn_output = self.o_proj(attn_output)
""".lstrip("\n")

PATCHED_O_CODE = """
    attn_output = self.apply_o(attn_output)
""".lstrip("\n")

SUPPORTED_ACTIVATIONS = ["silu", "gelu"]
APPLY_FN_MAPPING = {
    "silu": apply_lora_mlp_swiglu,
    "gelu": apply_lora_mlp_geglu,
}


def original_apply_qkv(
    self: nn.Module, hidden_states: torch.Tensor
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Original implementation of QKV projection without optimizations.

    Args:
        self: The attention module instance.
        hidden_states: Input tensor of shape [batch_size, seq_len, hidden_dim].

    Returns:
        A tuple `(query_states, key_states, value_states)` containing the projected
            states for query, key, and value.
    """
    query_states = self.q_proj(hidden_states)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)

    return query_states, key_states, value_states


def original_apply_o(self: nn.Module, hidden_states: torch.Tensor) -> torch.Tensor:
    """
    Original implementation of output projection without optimizations.

    Args:
        self: The attention module instance.
        hidden_states: Input tensor of shape `[`batch_size, seq_len, hidden_dim]`.

    Returns:
        The output projection result.
    """
    attn_output = self.o_proj(hidden_states)

    return attn_output


def get_attention_cls_from_config(cfg: DictDefault) -> Type[nn.Module]:
    """
    Get the appropriate attention class by inspecting the model config.
    Uses dynamic import to support any model architecture that follows
    the standard transformers naming convention.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.

    Returns:
        The appropriate attention class for the model.

    Raises:
        ValueError: If `base_model` not specified or attention class cannot be imported
        ImportError: If the model module or attention class doesn't exist
    """
    if "base_model" not in cfg:
        raise ValueError("base_model must be specified in config")

    # Get model config without loading the model
    model_config = AutoConfig.from_pretrained(cfg["base_model"])
    model_type = model_config.model_type

    # Special case for model_type = "qwen2"
    if model_type == "qwen2":
        from transformers.models.qwen2.modeling_qwen2 import Qwen2Attention

        return Qwen2Attention

    if model_type == "qwen3_vl":
        from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLTextAttention

        return Qwen3VLTextAttention

    if model_type == "mllama":
        from transformers.models.mllama.modeling_mllama import MllamaTextSelfAttention

        return MllamaTextSelfAttention

    if model_type == "llama4":
        from transformers.models.llama4.modeling_llama4 import Llama4TextAttention

        return Llama4TextAttention

    if model_type == "mistral3":
        from transformers.models.mistral.modeling_mistral import MistralAttention

        return MistralAttention

    if model_type == "gemma3_text":
        from transformers.models.gemma3.modeling_gemma3 import Gemma3Attention

        return Gemma3Attention

    try:
        # Dynamically import the module and attention class
        module_path = f"transformers.models.{model_type}.modeling_{model_type}"
        model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type)
        module = __import__(module_path, fromlist=[f"{model_cls_prefix}Attention"])
        attention_cls = getattr(module, f"{model_cls_prefix}Attention")

        return attention_cls
    except (ImportError, AttributeError) as e:
        raise ValueError(
            f"Axolotl could not import attention class for model_type: {model_type}. "
            "Please raise an Issue and turn off lora kernels to continue training. "
            f"Error: {str(e)}"
        ) from e


def patch_self_attn_lora(cfg: DictDefault):
    """
    Given an `axolotl` config, this method patches the inferred attention class forward
    pass with optimized LoRA implementations.

    It modifies the attention class to use optimized QKV and output projections. The
    original implementation is preserved and can be restored if needed.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.

    Raises:
        AssertionError: If the required code blocks are not found in the attention
            implementation.
    """
    attention_cls = get_attention_cls_from_config(cfg)

    # Check if already patched
    if hasattr(attention_cls, "_original_forward"):
        LOG.info(f"{attention_cls.__name__} already patched")
        return

    self_attn_forward = inspect.getsource(attention_cls.forward)
    attention_cls._original_forward = self_attn_forward
    self_attn_forward, _ = detab_code(self_attn_forward)

    assert any(qkv_options[0] in self_attn_forward for qkv_options in QKV_PATCHES), (
        "Original QKV code not found"
    )
    assert ORIGINAL_O_CODE in self_attn_forward, "Original O code not found"

    for qkv_orig, qkv_patched in QKV_PATCHES:
        if qkv_orig in self_attn_forward:
            self_attn_forward = self_attn_forward.replace(
                qkv_orig,
                qkv_patched,
            )
            break
    self_attn_forward = self_attn_forward.replace(ORIGINAL_O_CODE, PATCHED_O_CODE)
    self_attn_forward = self_attn_forward.replace(
        "def forward(",
        "def axolotl_attn_forward(",
        1,
    )

    # Load necessary imports
    module_name = attention_cls.__module__
    module = importlib.import_module(module_name)

    items_to_import = []
    for item in dir(module):
        if item in self_attn_forward:
            items_to_import.append(item)

    exec(
        f"from {module_name} import ({', '.join(items_to_import)})",
        globals(),
    )
    exec(self_attn_forward, globals())

    LOG.info(f"Patched attention class with LoRA optims: {attention_cls.__name__}")
    attention_cls.forward = axolotl_attn_forward


def find_self_attn_in_layer(
    layer: nn.Module,
) -> Generator[Tuple[nn.Module], None, None]:
    # general case of most models
    if hasattr(layer, "self_attn"):
        if all(
            hasattr(layer.self_attn, proj)
            for proj in ["q_proj", "k_proj", "v_proj", "o_proj"]
        ):
            yield layer.self_attn


def find_mlp_in_layer(
    layer: nn.Module,
) -> Generator[Tuple[nn.Module, nn.Module, nn.Module, nn.Module], None, None]:
    # general case of most models
    if hasattr(layer, "mlp"):
        if all(
            hasattr(layer.mlp, proj) for proj in ["gate_proj", "up_proj", "down_proj"]
        ):
            yield layer.mlp.gate_proj, layer.mlp.up_proj, layer.mlp.down_proj, layer.mlp
    # llama4 linearized experts
    if hasattr(layer, "feedforward") and hasattr(layer.feedforward, "shared_expert"):
        mlp = layer.feedforward.shared_expert
        yield mlp.gate_proj, mlp.up_proj, mlp.down_proj, mlp
    if hasattr(layer, "feedforward") and hasattr(layer.feedforward, "experts"):
        if all(
            hasattr(layer.feedforward.experts, proj)
            for proj in ["gate_projs", "up_projs", "down_projs"]
        ):
            for gate_proj, up_proj, down_proj in zip(
                layer.feedforward.experts.gate_projs,
                layer.feedforward.experts.up_projs,
                layer.feedforward.experts.down_projs,
                strict=False,
            ):
                yield (
                    gate_proj,
                    up_proj,
                    down_proj,
                    FakeMLP(gate_proj, up_proj, down_proj),
                )


def get_layers(model: PeftModelForCausalLM) -> list[nn.Module]:
    """
    Get the layers of the model. Handles text-only and multimodal models.

    Args:
        model: A PEFT model.

    Returns:
        A list of layers.
    """
    pretrained_model = model.model

    # check for multimodal models first
    if hasattr(pretrained_model, "language_model"):
        return pretrained_model.language_model.layers
    if hasattr(pretrained_model, "model"):
        if hasattr(pretrained_model.model, "language_model"):
            return pretrained_model.model.language_model.layers
        return pretrained_model.model.layers

    raise NotImplementedError(
        f"Model type {model.config.model_type} is not supported yet. Please create an Issue."
    )


def apply_lora_kernel_patches(
    model: PeftModelForCausalLM, cfg: DictDefault
) -> PeftModelForCausalLM:
    """
    Applies optimized Triton kernel patches to a PEFT model.

    Patches a PEFT model with optimized implementations for MLP and attention
    computations. The optimizations include custom Triton kernels for activation
    functions and specialized autograd functions for LoRA computations.

    Args:
        model: A PEFT model to be patched with optimized kernels.
        cfg: Dictionary mapping `axolotl` config keys to values.

    Returns:
        PeftModelForCausalLM: The patched model with optimized kernels.

    Raises:
        TypeError: If the provided model is not a `PeftModelForCausalLM`.
        NotImplementedError: If the model type is not supported.
        AssertionError: If multiple adapters are active (currently unsupported).

    Note:
        The optimizations require LoRA adapters with no dropout and no bias terms. The
            function will skip patching if these conditions aren't met.
    """
    if not isinstance(model, PeftModelForCausalLM):
        raise TypeError("Model must be a PeftModelForCausalLM")

    # Get active LoRA adapter config
    if hasattr(model, "active_adapters"):
        assert len(model.active_adapters) == 1, (
            "Axolotl currently does not support LoRA Triton kernels for multiple adapters"
        )
        active_adapter = model.active_adapters[0]
    else:
        active_adapter = model.active_adapter
    lora_config = model.model.peft_config[active_adapter]

    # Only patch if conditions are met
    can_patch = lora_config.lora_dropout == 0 and lora_config.bias == "none"

    if not can_patch:
        LOG.warning("Cannot patch layers - requires no dropout and no bias")
        LOG.warning("Please specify `lora_dropout: 0` in your axolotl config file")
        return model

    # This needs to be reset after patching
    original_level = LOG.getEffectiveLevel()
    LOG.setLevel(logging.INFO)

    # Choose activation based on model type
    activation = None
    text_config = (
        model.config.get_text_config()
        if hasattr(model.config, "get_text_config")
        else model.config
    )
    if hasattr(text_config, "hidden_act"):
        activation = text_config.hidden_act
    elif hasattr(text_config, "hidden_activation"):
        activation = text_config.hidden_activation

    # map activation to supported activation
    if "gelu" in activation:
        # gemma3 uses gelu_pytorch_tanh
        activation = "gelu"

    if activation not in SUPPORTED_ACTIVATIONS:
        raise NotImplementedError(f"Activation {activation} is not supported")

    layers = get_layers(model)

    # Patch each layer
    for layer in layers:
        # Add QKV, O fallback implementations to start
        # These will be overwritten later (if some conditions apply)
        for self_attn in find_self_attn_in_layer(layer):
            self_attn.apply_qkv = types.MethodType(original_apply_qkv, self_attn)
            self_attn.apply_o = types.MethodType(original_apply_o, self_attn)

            if cfg.lora_qkv_kernel:
                # Query, key, value patching
                layer_modules = [
                    getattr(self_attn, linear_proj)
                    for linear_proj in ["q_proj", "k_proj", "v_proj"]
                ]
                can_patch_qkv = all(
                    hasattr(module, "lora_A")
                    and len(getattr(module, "lora_magnitude_vector", []) or []) == 0
                    for module in layer_modules
                )

                if can_patch_qkv:
                    # Add optimized implementation
                    self_attn.apply_qkv = types.MethodType(apply_lora_qkv, self_attn)
                else:
                    LOG.warning_once(
                        "Cannot patch some attention QKV projections - requires LoRA "
                        "adapters and no lora_magnitude_vector (DoRA)"
                    )
            if cfg.lora_o_kernel:
                # Output patching
                layer_modules = [
                    getattr(self_attn, linear_proj) for linear_proj in ["o_proj"]
                ]
                can_patch_o = all(
                    hasattr(module, "lora_A")
                    and len(getattr(module, "lora_magnitude_vector", []) or []) == 0
                    for module in layer_modules
                )

                if can_patch_o:
                    self_attn.apply_o = types.MethodType(apply_lora_o, self_attn)
                else:
                    LOG.warning_once(
                        "Cannot patch some attention output projection - requires LoRA "
                        "adapters and no lora_magnitude_vector (DoRA)"
                    )
        for gate_proj, up_proj, down_proj, mlp in find_mlp_in_layer(layer):
            if cfg.lora_mlp_kernel:
                # MLP patching
                can_patch_mlp = all(
                    hasattr(proj, "lora_A")
                    and len(getattr(proj, "lora_magnitude_vector", []) or []) == 0
                    for proj in (gate_proj, up_proj, down_proj)
                )

                if can_patch_mlp:
                    apply_fn = APPLY_FN_MAPPING[activation]
                    layer.mlp.forward = types.MethodType(apply_fn, mlp)
                else:
                    LOG.warning_once(
                        "Cannot patch some MLP layers - requires LoRA adapters and no "
                        "lora_magnitude_vector (DoRA)"
                    )

    LOG.setLevel(original_level)

    return model


class FakeMLP(nn.Module):
    """
    placeholder MLP for triton patching
    """

    gate_proj: nn.Linear
    up_proj: nn.Linear
    down_proj: nn.Linear

    def __init__(self, gate_proj, up_proj, down_proj):
        super().__init__()
        self.gate_proj = gate_proj
        self.up_proj = up_proj
        self.down_proj = down_proj


================================================
FILE: src/axolotl/monkeypatch/loss/__init__.py
================================================


================================================
FILE: src/axolotl/monkeypatch/loss/chunked.py
================================================
"""
chunked ce loss
"""

from typing import List, Optional

import torch
import torch.nn.functional as F


# copied and modified from torchtune.modules.loss.CEWithChunkedOutputLoss
class CEWithChunkedOutputLoss(torch.nn.Module):
    """
    Cross-entropy with chunked outputs that saves memory by only upcasting one chunk at a time.

    For more details, please refer to: https://github.com/pytorch/torchtune/pull/1390
    """

    def __init__(self, num_output_chunks: int = 8, ignore_index: int = -100):
        super().__init__()
        self.num_output_chunks = num_output_chunks
        self.ignore_index = ignore_index

    def compute_cross_entropy(
        self,
        logits: torch.Tensor,
        labels: torch.Tensor,
        normalize: bool = True,
    ) -> torch.Tensor:
        """
        Upcast logits to fp32 and compute cross entropy loss.
        """
        return F.cross_entropy(
            logits.float(), labels, ignore_index=self.ignore_index, reduction="sum"
        )

    def forward(
        self, logits: List[torch.Tensor], labels: torch.Tensor, reduction="sum"
    ) -> torch.Tensor:
        """
        Args:
            logits (List[torch.Tensor]): List of chunked logits of length
                ``self.num_output_chunks``, where each chunk has shape
                ``(batch_size, num_tokens / num_output_chunks, vocab_size)``.
            labels (torch.Tensor): Ground truth labels of shape ``(batch_size, num_tokens)``.
            reduction (str): The reduction to apply to the output.

        Returns:
            torch.Tensor: Cross entropy loss of shape (1,).
        """

        total_elements = (labels != self.ignore_index).sum()

        # chunk and reshape labels (bsz, num_tokens, vocab) -> [(bsz*num_tokens/num_chunks, vocab)]
        labels = [
            target_chunk.reshape(-1)
            for target_chunk in labels.chunk(self.num_output_chunks, dim=1)
        ]
        # reshape logits [(bsz, num_tokens/num_chunks, vocab)] -> [(bsz*num_tokens/num_chunks, vocab)]
        logits = [
            logit_chunk.reshape(-1, logit_chunk.size(-1)) for logit_chunk in logits
        ]

        # compute one chunk at a time
        total_loss = 0.0
        for logits_chunk, labels_chunk in zip(logits, labels, strict=False):
            total_loss += self.compute_cross_entropy(logits_chunk, labels_chunk)

        if reduction == "sum":
            return total_loss
        return total_loss / total_elements


def _build_chunked_ce_loss_fn(num_output_chunks: int = 8, ignore_index: int = -100):
    loss_fn_ce = CEWithChunkedOutputLoss(num_output_chunks, ignore_index)
    loss_fn_ce.compute_cross_entropy = torch.compile(
        loss_fn_ce.compute_cross_entropy, backend="inductor"
    )
    return loss_fn_ce


def get_causal_lm_loss(num_output_chunks: int = 8, ignore_index: int = -100):
    loss_fn_ce = _build_chunked_ce_loss_fn(num_output_chunks, ignore_index)

    def chunked_fix_cross_entropy(
        source,
        target,
        num_items_in_batch: int = None,
        ignore_index: int = -100,
        **kwargs,
    ):
        reduction = "sum" if num_items_in_batch is not None else "mean"
        logit_chunks = [
            chunk for chunk in source.chunk(loss_fn_ce.num_output_chunks, dim=1)
        ]
        loss = loss_fn_ce(logit_chunks, target, reduction=reduction)
        if reduction == "sum":
            loss = loss / num_items_in_batch
        return loss

    def for_causal_lm_chunked_loss(
        logits,
        labels,
        vocab_size: int = None,
        num_items_in_batch: Optional[int] = None,
        ignore_index: int = -100,
        shift_labels: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> torch.Tensor:
        # skip the upcast to float since we handle that in the chunking loss
        if shift_labels is None:
            # Shift so that tokens < n predict n
            labels = F.pad(labels, (0, 1), value=ignore_index)
            shift_labels = labels[..., 1:].contiguous()

        # Skip Flattening the tokens
        # Enable model parallelism
        shift_labels = shift_labels.to(logits.device)
        loss = chunked_fix_cross_entropy(
            logits, shift_labels, num_items_in_batch, ignore_index, **kwargs
        )
        return loss

    return for_causal_lm_chunked_loss


def patch_chunked_ce_loss_fn(num_output_chunks: int = 8, ignore_index: int = -100):
    import transformers.loss.loss_utils

    for_causal_lm_chunked_loss = get_causal_lm_loss(num_output_chunks, ignore_index)
    transformers.loss.loss_utils.ForCausalLMLoss = for_causal_lm_chunked_loss
    transformers.loss.loss_utils.LOSS_MAPPING["ForCausalLM"] = (
        for_causal_lm_chunked_loss
    )


================================================
FILE: src/axolotl/monkeypatch/loss/eaft.py
================================================
"""
eaft (entropy-aware focal training) loss implementation
weights examples by entropy approximation from top-k logits

Reference: https://github.com/ymxyll/LlamaFactory-EAFT/blob/e2ce19e8efcc226450ee8f2b81dfe4e69f1f945d/src/llamafactory/train/trainer_utils.py
"""

import torch
import torch.nn.functional as F


def eaft_loss(outputs, labels, num_items_in_batch=None, alpha=1.0, k=20):
    """
    compute eaft loss with entropy weighting

    args:
        outputs: model outputs containing logits
        labels: target labels for computing loss
        num_items_in_batch: for sample packing support
        alpha: exponent for entropy weighting (default 1.0)
        k: number of top logits for entropy approximation (default 20)
    """
    logits = outputs.logits

    shift_logits = logits[..., :-1, :].contiguous()
    shift_labels = labels[..., 1:].contiguous()

    vocab_size = shift_logits.size(-1)
    shift_logits_view = shift_logits.view(-1, vocab_size)
    shift_labels_view = shift_labels.view(-1)

    mask = shift_labels_view != -100

    with torch.no_grad():
        top_k_logits, _ = torch.topk(
            shift_logits_view[mask].float(), k=min(k, vocab_size), dim=-1
        )
        top_k_probs = F.softmax(top_k_logits, dim=-1)
        entropy = -(top_k_probs * torch.log(top_k_probs + 1e-10)).sum(dim=-1)
        weights = torch.pow(entropy, alpha)

    loss_fct = torch.nn.CrossEntropyLoss(reduction="none")
    per_token_loss = loss_fct(shift_logits_view[mask], shift_labels_view[mask])
    weighted_loss = per_token_loss * weights

    if num_items_in_batch is not None:
        loss = weighted_loss.sum() / num_items_in_batch
    else:
        loss = weighted_loss.mean()

    return loss


================================================
FILE: src/axolotl/monkeypatch/mistral_attn_hijack_flash.py
================================================
"""Flash attention monkey patch for mistral model"""

from functools import partial

import transformers

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def patch_mistral_cross_entropy():
    from flash_attn.losses.cross_entropy import CrossEntropyLoss

    LOG.info("patching with flash_attn.losses.cross_entropy")
    transformers.models.mistral.modeling_mistral.CrossEntropyLoss = partial(
        CrossEntropyLoss, inplace_backward=True
    )


================================================
FILE: src/axolotl/monkeypatch/mixtral/__init__.py
================================================
"""
Patches to support multipack for mixtral
"""

import torch


def patch_mixtral_moe_forward_zero3() -> None:
    import torch.nn.functional as F

    def mlp_forward(self, hidden_states):
        current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(
            hidden_states
        )
        current_hidden_states = self.w2(current_hidden_states)
        return current_hidden_states

    # Ref. https://huggingface.co/deepseek-ai/deepseek-moe-16b-base/blob/main/modeling_deepseek.py
    def moe_forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        batch_size, sequence_length, hidden_dim = hidden_states.shape
        hidden_states = hidden_states.view(-1, hidden_dim)
        # router_logits: (batch * sequence_length, n_experts)
        router_logits = self.gate(hidden_states)

        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        topk_weight, topk_idx = torch.topk(
            routing_weights, self.top_k, dim=-1, sorted=False
        )
        topk_weight /= topk_weight.sum(dim=-1, keepdim=True)
        # we cast back to the input dtype
        topk_weight = topk_weight.to(hidden_states.dtype)

        hidden_states = hidden_states.repeat_interleave(self.top_k, dim=0)
        y = torch.empty_like(hidden_states)
        flat_topk_idx = topk_idx.view(-1)
        for i in range(self.num_experts):
            expert = self.experts[i]
            y[flat_topk_idx == i] = expert(hidden_states[flat_topk_idx == i])
        y = (y.view(*topk_weight.shape, -1) * topk_weight.unsqueeze(-1)).sum(dim=1)
        final_hidden_states = y.reshape(batch_size, sequence_length, hidden_dim)
        return final_hidden_states, router_logits

    from transformers.models.mixtral.modeling_mixtral import (
        MixtralBlockSparseTop2MLP,
        MixtralSparseMoeBlock,
    )

    MixtralBlockSparseTop2MLP.forward = mlp_forward
    MixtralSparseMoeBlock.forward = moe_forward


================================================
FILE: src/axolotl/monkeypatch/models/__init__.py
================================================


================================================
FILE: src/axolotl/monkeypatch/models/apertus/__init__.py
================================================


================================================
FILE: src/axolotl/monkeypatch/models/apertus/activation.py
================================================
"""Monkeypatch for Apertus to dtype mismatch in XIELU act"""

from torch import Tensor


def patch_apertus_xielu_activation():
    try:
        from transformers.activations import XIELUActivation
    except ImportError as err:
        raise ImportError(
            "Cannot import XIELUActivation. "
            "Please make sure to update your transformers version >= 4.56.1."
        ) from err

    from transformers.activations import logger

    # Store the original method
    old_fn = XIELUActivation._xielu_cuda

    def _xielu_cuda_fixed(self, x: Tensor) -> Tensor:
        """Firewall function to prevent torch.compile from seeing .item() calls"""
        original_shape = x.shape
        # CUDA kernel expects 3D tensors, reshape if needed
        while x.dim() < 3:
            x = x.unsqueeze(0)
        if x.dim() > 3:
            x = x.view(-1, 1, x.size(-1))
        if original_shape != x.shape:
            logger.warning_once(
                "Warning: xIELU input tensor expects 3 dimensions but got (shape: %s). Reshaping to (shape: %s).",
                original_shape,
                x.shape,
            )
        result = self._xielu_cuda_obj.forward(
            x,
            self.alpha_p.to(x.dtype),
            self.alpha_n.to(x.dtype),
            # Temporary until xIELU CUDA fully implemented -> self.{beta,eps}.item()
            self._beta_scalar,
            self._eps_scalar,
            self.with_vector_loads,
        )
        return result.view(original_shape)

    # Apply the patch
    XIELUActivation._xielu_cuda = _xielu_cuda_fixed

    def unpatch():
        """Restore the original method"""
        XIELUActivation._xielu_cuda = old_fn

    return unpatch


================================================
FILE: src/axolotl/monkeypatch/models/kimi_linear/__init__.py
================================================


================================================
FILE: src/axolotl/monkeypatch/models/kimi_linear/configuration_kimi.py
================================================
"""
Kimi-Linear configuration.

Source: https://huggingface.co/moonshotai/Kimi-Linear-48B-A3B-Instruct/blob/main/configuration_kimi.py
Revision: 6e163f3
"""

from typing import Optional

from transformers.configuration_utils import PretrainedConfig


class KimiLinearConfig(PretrainedConfig):
    model_type = "kimi_linear"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        model_type="kimi_linear",
        vocab_size=163840,
        hidden_size=4096,
        head_dim=None,
        intermediate_size=11008,
        num_hidden_layers=32,
        num_attention_heads=32,
        num_key_value_heads=None,
        hidden_act="silu",
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        rope_theta=10000.0,
        rope_scaling=None,
        tie_word_embeddings=False,
        moe_intermediate_size: Optional[int] = None,
        moe_renormalize: bool = True,
        moe_router_activation_func: str = "sigmoid",
        num_experts: Optional[int] = None,
        num_experts_per_token: Optional[int] = None,
        num_shared_experts: int = 0,
        routed_scaling_factor: float = 1.0,
        first_k_dense_replace: int = 0,
        moe_layer_freq: int = 1,
        use_grouped_topk: bool = True,
        num_expert_group: int = 1,
        topk_group: int = 1,
        q_lora_rank: Optional[int] = None,
        kv_lora_rank: Optional[int] = None,
        qk_nope_head_dim: Optional[int] = None,
        qk_rope_head_dim: Optional[int] = None,
        v_head_dim: Optional[int] = None,
        mla_use_nope: Optional[bool] = False,
        num_nextn_predict_layers: int = 0,
        linear_attn_config: Optional[dict] = None,
        router_aux_loss_coef: float = 0.01,
        **kwargs,
    ):
        self.model_type = model_type
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.head_dim = (
            head_dim if head_dim is not None else hidden_size // num_attention_heads
        )
        self.intermediate_size = intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads

        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling

        self.q_lora_rank = q_lora_rank
        self.kv_lora_rank = kv_lora_rank
        self.qk_nope_head_dim = qk_nope_head_dim
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.mla_use_nope = mla_use_nope
        # moe config
        self.num_experts = num_experts
        self.num_experts_per_token = num_experts_per_token
        self.moe_renormalize = moe_renormalize
        self.num_shared_experts = num_shared_experts
        self.routed_scaling_factor = routed_scaling_factor
        self.moe_router_activation_func = moe_router_activation_func
        assert self.moe_router_activation_func in ("softmax", "sigmoid")
        self.moe_intermediate_size = moe_intermediate_size
        self.first_k_dense_replace = first_k_dense_replace
        self.moe_layer_freq = moe_layer_freq
        self.use_grouped_topk = use_grouped_topk
        self.num_expert_group = num_expert_group
        self.topk_group = topk_group
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.router_aux_loss_coef = router_aux_loss_coef

        if linear_attn_config is not None:
            assert linear_attn_config["kda_layers"] is not None
            assert linear_attn_config["full_attn_layers"] is not None
        self.linear_attn_config = linear_attn_config

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )

    @property
    def is_mla(self):
        return (
            self.q_lora_rank is not None
            or self.kv_lora_rank is not None
            or self.qk_nope_head_dim is not None
            or self.qk_rope_head_dim is not None
            or self.v_head_dim is not None
            or self.mla_use_nope is True
        )

    @property
    def is_moe(self):
        return self.num_experts is not None

    @property
    def is_linear_attn(self) -> bool:
        return not (
            self.linear_attn_config is None
            or (
                isinstance(self.linear_attn_config, dict)
                and self.linear_attn_config["kda_layers"] is not None
                and len(self.linear_attn_config["kda_layers"]) == 0
            )
        )

    def is_kda_layer(self, layer_idx: int):
        return (
            self.linear_attn_config is not None
            and (layer_idx + 1) in self.linear_attn_config["kda_layers"]
        )


================================================
FILE: src/axolotl/monkeypatch/models/kimi_linear/modeling_kimi.py
================================================
"""
Adapted Kimi-Linear modeling to enable MoE differentiable.

Source: https://huggingface.co/moonshotai/Kimi-Linear-48B-A3B-Instruct/blob/main/modeling_kimi.py
Revision: 6e163f3
"""

import math
from collections.abc import Callable
from typing import Any, List, Optional, Tuple, Union

import torch
import torch.nn.functional as F
import transformers
from einops import rearrange
from packaging import version
from torch import nn
from transformers.activations import ACT2FN
from transformers.cache_utils import Cache
from transformers.generation import GenerationMixin
from transformers.masking_utils import create_causal_mask
from transformers.modeling_flash_attention_utils import FlashAttentionKwargs
from transformers.modeling_outputs import (
    BaseModelOutputWithPast,
    CausalLMOutputWithPast,
    MoeCausalLMOutputWithPast,
)
from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
from transformers.processing_utils import Unpack
from transformers.pytorch_utils import ALL_LAYERNORM_LAYERS
from transformers.utils import (
    TransformersKwargs,
    can_return_tuple,
    logging,
)
from transformers.utils.generic import OutputRecorder

try:
    from fla.layers.utils import get_unpad_data, index_first_axis, pad_input
    from fla.modules import FusedRMSNormGated, ShortConvolution
    from fla.ops.kda import chunk_kda, fused_recurrent_kda
    from fla.ops.kda.gate import fused_kda_gate
except ImportError as err:
    raise ImportError(
        "Plese run `pip uninstall fla-core flash-linear-attention -y && pip install git+https://github.com/fla-org/flash-linear-attention@v0.4.0`"
    ) from err

from axolotl.monkeypatch.models.kimi_linear.configuration_kimi import KimiLinearConfig

assert version.parse(transformers.__version__) >= version.parse("4.56.0"), (
    "Please upgrade transformers to >= 4.56.0"
)

logger = logging.get_logger(__name__)


def load_balancing_loss_func(
    gate_logits: Union[torch.Tensor, tuple[torch.Tensor], None],
    num_experts: Optional[int] = None,
    top_k=2,
    attention_mask: Optional[torch.Tensor] = None,
) -> Union[torch.Tensor, int]:
    """Standard Switch Transformer load balancing loss."""
    if gate_logits is None or not isinstance(gate_logits, tuple):
        return 0

    # Concatenate all layer logits
    concatenated_gate_logits = torch.cat(
        [layer_gate for layer_gate in gate_logits], dim=0
    )

    routing_weights = F.softmax(concatenated_gate_logits, dim=-1)
    _, selected_experts = torch.topk(routing_weights, top_k, dim=-1)
    expert_mask = F.one_hot(selected_experts, num_experts)

    tokens_per_expert = torch.mean(expert_mask.float(), dim=0)
    router_prob_per_expert = torch.mean(routing_weights, dim=0)

    overall_loss = torch.sum(tokens_per_expert * router_prob_per_expert.unsqueeze(0))
    return overall_loss * num_experts


class KimiDynamicCache:
    """
    Dynamic cache for Kimi model.
    Inspired by Qwen3-Next
    """

    is_compileable = False

    def __init__(self, config: KimiLinearConfig):
        super().__init__()
        self.config = config

        if config.linear_attn_config is not None:
            self.layer_types = []
            for i in range(config.num_hidden_layers):
                if config.is_kda_layer(i):
                    self.layer_types.append("linear_attention")
                else:
                    self.layer_types.append("full_attention")
        else:
            self.layer_types = ["full_attention"] * config.num_hidden_layers

        self.transformer_layers = [
            i
            for i in range(config.num_hidden_layers)
            if self.layer_types[i] == "full_attention"
        ]

        linear_layers = [
            i
            for i in range(config.num_hidden_layers)
            if self.layer_types[i] == "linear_attention"
        ]
        self.last_linear_layer = linear_layers[-1] if linear_layers else -1

        self.conv_states = [None for _ in range(config.num_hidden_layers)]
        self.recurrent_states = [None for _ in range(config.num_hidden_layers)]
        self.key_cache = [None for _ in range(config.num_hidden_layers)]
        self.value_cache = [None for _ in range(config.num_hidden_layers)]

    def __len__(self):
        return len(self.layer_types)

    def update(
        self,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        layer_idx: int,
        cache_kwargs: Optional[dict[str, Any]] = None,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        if self.key_cache[layer_idx] is None:
            self.key_cache[layer_idx] = key_states
            self.value_cache[layer_idx] = value_states
        else:
            self.key_cache[layer_idx] = torch.cat(
                [self.key_cache[layer_idx], key_states], dim=2
            )
            self.value_cache[layer_idx] = torch.cat(
                [self.value_cache[layer_idx], value_states], dim=2
            )

        return self.key_cache[layer_idx], self.value_cache[layer_idx]

    def reorder_cache(self, beam_idx: torch.LongTensor):
        """Reorders the cache for beam search, given the selected beam indices."""
        for layer_idx in range(len(self.key_cache)):
            if self.key_cache[layer_idx] is not None:
                device = self.key_cache[layer_idx].device
                beam_idx = beam_idx.to(device)
                self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(
                    0, beam_idx
                )
                self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(
                    0, beam_idx
                )

            if self.conv_states[layer_idx] is not None:
                device = self.conv_states[layer_idx][0].device
                beam_idx = beam_idx.to(device)
                q_conv, k_conv, v_conv = self.conv_states[layer_idx]
                self.conv_states[layer_idx] = (
                    q_conv.index_select(0, beam_idx),
                    k_conv.index_select(0, beam_idx),
                    v_conv.index_select(0, beam_idx),
                )
                self.recurrent_states[layer_idx] = self.recurrent_states[
                    layer_idx
                ].index_select(0, beam_idx)

    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
        # take any layer that contains cache and not empty tensor
        layer_idx = (
            self.transformer_layers[0]
            if layer_idx not in self.transformer_layers
            else layer_idx
        )
        if len(self.key_cache) <= layer_idx or self.key_cache[layer_idx] is None:
            return 0
        return self.key_cache[layer_idx].shape[-2]

    def get_mask_sizes(
        self, cache_position: torch.Tensor, layer_idx: int
    ) -> tuple[int, int]:
        """
        Return a tuple (kv_length, kv_offset) corresponding to the length and offset that will be returned for
        the given layer at `layer_idx`.
        The masks are then prepared according to the given lengths (kv_length, kv_offset) and patterns for each layer.
        """
        kv_offset = 0
        query_length = cache_position.shape[0]
        past_seen_tokens = self.get_seq_length(layer_idx)
        kv_length = query_length + past_seen_tokens
        return kv_length, kv_offset

    @property
    def has_previous_state(self):
        """We have a previous state if the last linear (conv) layer was already updated."""
        if self.last_linear_layer == -1:
            return False
        return self.conv_states[self.last_linear_layer] is not None


class KimiRMSNorm(nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        KimiRMSNorm is equivalent to T5LayerNorm
        """
        super().__init__()
        self.weight = nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        input_dtype = hidden_states.dtype
        hidden_states = hidden_states.to(torch.float32)
        variance = hidden_states.pow(2).mean(-1, keepdim=True)
        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
        return self.weight * hidden_states.to(input_dtype)


ALL_LAYERNORM_LAYERS.append(KimiRMSNorm)


class KimiBlockSparseMLP(nn.Module):
    def __init__(
        self, config: KimiLinearConfig, hidden_size=None, intermediate_size=None
    ):
        super().__init__()
        self.config = config
        self.ffn_dim = (
            config.intermediate_size if intermediate_size is None else intermediate_size
        )
        self.hidden_dim = config.hidden_size if hidden_size is None else hidden_size

        self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)  # gate
        self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False)  # down
        self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False)  # up

        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states):
        current_hidden_states = self.act_fn(self.w1(hidden_states)) * self.w3(
            hidden_states
        )
        current_hidden_states = self.w2(current_hidden_states)
        return current_hidden_states


class KimiMLP(nn.Module):
    def __init__(
        self, config: KimiLinearConfig, hidden_size=None, intermediate_size=None
    ):
        super().__init__()
        self.config = config
        self.hidden_size = config.hidden_size if hidden_size is None else hidden_size
        self.intermediate_size = (
            config.intermediate_size if intermediate_size is None else intermediate_size
        )
        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, x):
        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
        return down_proj


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def eager_attention_forward(
    module: nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask: Optional[torch.Tensor],
    scaling: float,
    dropout: float = 0.0,
    **kwargs: Unpack[TransformersKwargs],
):
    key_states = repeat_kv(key, module.num_key_value_groups)
    value_states = repeat_kv(value, module.num_key_value_groups)

    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
    if attention_mask is not None:
        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
        attn_weights = attn_weights + causal_mask

    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(
        query.dtype
    )
    attn_weights = nn.functional.dropout(
        attn_weights, p=dropout, training=module.training
    )
    attn_output = torch.matmul(attn_weights, value_states)
    attn_output = attn_output.transpose(1, 2).contiguous()

    return attn_output, attn_weights


class KimiMLAAttention(nn.Module):
    """
    Multi-Latent Attention adapted from deepseek-v3
    """

    def __init__(self, config: KimiLinearConfig, layer_idx: int):
        nn.Module.__init__(self)
        self.config = config
        self.layer_idx = layer_idx
        self.hidden_size = config.hidden_size
        self.num_heads = config.num_attention_heads
        self.num_key_value_heads = config.num_key_value_heads
        self.num_key_value_groups = self.num_heads // self.num_key_value_heads

        self.rope_theta = config.rope_theta
        self.attention_dropout = getattr(config, "attention_dropout", 0.0)

        try:
            self.q_lora_rank = config.q_lora_rank
            self.qk_rope_head_dim = config.qk_rope_head_dim
            self.kv_lora_rank = config.kv_lora_rank
            self.v_head_dim = config.v_head_dim
            self.qk_nope_head_dim = config.qk_nope_head_dim
            self.q_head_dim = self.qk_nope_head_dim + self.qk_rope_head_dim
            self.use_nope = config.mla_use_nope
            self.scaling = self.q_head_dim ** (-0.5)
        except Exception as e:
            raise ValueError(
                f"Kimi MLA config is not found or not properly formatted: {e}"
            ) from e

        assert self.q_lora_rank is None
        self.q_proj = nn.Linear(
            self.hidden_size,
            self.num_heads * self.q_head_dim,
            bias=False,
        )
        self.kv_a_proj_with_mqa = nn.Linear(
            self.hidden_size,
            self.kv_lora_rank + self.qk_rope_head_dim,
            bias=False,
        )
        self.kv_a_layernorm = KimiRMSNorm(self.kv_lora_rank)
        self.kv_b_proj = nn.Linear(
            self.kv_lora_rank,
            self.num_heads
            * (self.q_head_dim - self.qk_rope_head_dim + self.v_head_dim),
            bias=False,
        )
        self.o_proj = nn.Linear(
            self.num_heads * self.v_head_dim,
            self.hidden_size,
            bias=False,
        )
        self.is_causal = True
        assert self.use_nope

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        past_key_values: Optional[Cache] = None,
        **kwargs,
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
        batch_size, seq_length = hidden_states.shape[:-1]
        query_shape = (batch_size, seq_length, -1, self.q_head_dim)
        key_shape = (
            batch_size,
            seq_length,
            -1,
            self.qk_nope_head_dim + self.v_head_dim,
        )

        q_states = self.q_proj(hidden_states)
        q_states = q_states.view(query_shape).transpose(1, 2)
        q_pass, q_rot = torch.split(
            q_states, [self.qk_nope_head_dim, self.qk_rope_head_dim], dim=-1
        )

        compressed_kv = self.kv_a_proj_with_mqa(hidden_states)
        k_pass, k_rot = torch.split(
            compressed_kv, [self.kv_lora_rank, self.qk_rope_head_dim], dim=-1
        )

        k_pass = (
            self.kv_b_proj(self.kv_a_layernorm(k_pass)).view(key_shape).transpose(1, 2)
        )
        k_pass, value_states = torch.split(
            k_pass, [self.qk_nope_head_dim, self.v_head_dim], dim=-1
        )

        k_rot = k_rot.view(batch_size, 1, seq_length, self.qk_rope_head_dim)
        k_rot = k_rot.expand(*k_pass.shape[:-1], -1)

        query_states = torch.cat((q_pass, q_rot), dim=-1)
        key_states = torch.cat((k_pass, k_rot), dim=-1)

        if past_key_values is not None:
            key_states, value_states = past_key_values.update(
                key_states, value_states, self.layer_idx
            )

        if (
            self.config._attn_implementation == "flash_attention_2"
            and self.q_head_dim != self.v_head_dim
        ):
            value_states = F.pad(value_states, [0, self.q_head_dim - self.v_head_dim])

        attention_interface: Callable = eager_attention_forward
        if self.config._attn_implementation != "eager":
            attention_interface = ALL_ATTENTION_FUNCTIONS[
                self.config._attn_implementation
            ]

        attn_output, _ = attention_interface(
            self,
            query_states,
            key_states,
            value_states,
            attention_mask,
            dropout=0.0 if not self.training else self.attention_dropout,
            scaling=self.scaling,
            **kwargs,
        )

        if (
            self.config._attn_implementation == "flash_attention_2"
            and self.q_head_dim != self.v_head_dim
        ):
            attn_output = attn_output[:, :, :, : self.v_head_dim]

        attn_output = attn_output.reshape(batch_size, seq_length, -1).contiguous()
        attn_output = self.o_proj(attn_output)
        return attn_output


class KimiDeltaAttention(nn.Module):
    def __init__(self, config: KimiLinearConfig, layer_idx: int):
        super().__init__()
        self.config = config
        self.mode = "chunk"

        self.hidden_size = config.hidden_size
        self.conv_size = config.linear_attn_config["short_conv_kernel_size"]
        self.head_dim = config.linear_attn_config["head_dim"]
        self.num_heads = config.linear_attn_config["num_heads"]
        self.head_k_dim = self.head_dim
        self.num_k_heads = self.num_heads

        self.layer_idx = layer_idx

        assert self.mode in ["chunk", "fused_recurrent"], (
            f"Not suppoerted mode `{self.mode}`."
        )

        projection_k_size = self.head_k_dim * self.num_k_heads
        projection_size = self.head_dim * self.num_heads

        self.q_proj = nn.Linear(self.hidden_size, projection_k_size, bias=False)
        self.k_proj = nn.Linear(self.hidden_size, projection_k_size, bias=False)
        self.v_proj = nn.Linear(self.hidden_size, projection_size, bias=False)

        self.q_conv1d = ShortConvolution(
            hidden_size=projection_k_size,
            kernel_size=self.conv_size,
            activation="silu",
        )
        self.k_conv1d = ShortConvolution(
            hidden_size=projection_k_size, kernel_size=self.conv_size, activation="silu"
        )
        self.v_conv1d = ShortConvolution(
            hidden_size=projection_size, kernel_size=self.conv_size, activation="silu"
        )

        self.A_log = torch.nn.Parameter(
            torch.log(
                torch.empty(self.num_heads, dtype=torch.float32).uniform_(1, 16)
            ).view(1, 1, -1, 1)
        )

        self.f_a_proj = nn.Linear(self.hidden_size, self.head_dim, bias=False)
        self.f_b_proj = nn.Linear(self.head_dim, projection_size, bias=False)

        self.dt_bias = nn.Parameter(torch.empty(projection_size, dtype=torch.float32))

        self.b_proj = nn.Linear(self.hidden_size, self.num_heads, bias=False)

        self.g_a_proj = nn.Linear(self.hidden_size, self.head_dim, bias=False)
        self.g_b_proj = nn.Linear(self.head_dim, projection_size, bias=False)

        self.o_norm = FusedRMSNormGated(
            self.head_dim, eps=config.rms_norm_eps, activation="sigmoid"
        )
        self.o_proj = nn.Linear(projection_size, self.hidden_size, bias=False)

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        cache_params: Optional[KimiDynamicCache] = None,
        **kwargs: Unpack[dict],
    ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
        if attention_mask is not None:
            if attention_mask.dim() != 2:
                attention_mask = kwargs.get("padding_mask", None)

            if attention_mask is not None and attention_mask.dim() != 2:
                raise ValueError(
                    "attention_mask must be a 0-1 matrix of shape [batch_size, seq_len] "
                    "(0 = padding). 3D masks are not supported here."
                )
        use_cache = cache_params is not None
        batch_size, q_len, _ = hidden_states.shape
        mode = "fused_recurrent" if q_len <= 64 else self.mode
        if self.training:
            assert mode == "chunk", "Only chunk mode is supported in training."

        cu_seqlens = kwargs.get("cu_seqlens", None)
        indices = None
        if attention_mask is not None:
            indices, cu_seqlens, _ = get_unpad_data(attention_mask[:, -q_len:])
            hidden_states = index_first_axis(
                rearrange(hidden_states, "b s ... -> (b s) ..."), indices
            ).unsqueeze(0)

        conv_state_q, conv_state_k, conv_state_v = None, None, None
        recurrent_state = None
        if cache_params is not None:
            if cache_params.conv_states[self.layer_idx] is not None:
                conv_state_q, conv_state_k, conv_state_v = cache_params.conv_states[
                    self.layer_idx
                ]
            recurrent_state = cache_params.recurrent_states[self.layer_idx]
        q, conv_state_q = self.q_conv1d(
            x=self.q_proj(hidden_states),
            cache=conv_state_q,
            output_final_state=use_cache,
            cu_seqlens=cu_seqlens,
        )
        k, conv_state_k = self.k_conv1d(
            x=self.k_proj(hidden_states),
            cache=conv_state_k,
            output_final_state=use_cache,
            cu_seqlens=cu_seqlens,
        )
        v, conv_state_v = self.v_conv1d(
            x=self.v_proj(hidden_states),
            cache=conv_state_v,
            output_final_state=use_cache,
            cu_seqlens=cu_seqlens,
        )
        g = self.f_b_proj(self.f_a_proj(hidden_states))
        g = fused_kda_gate(g, self.A_log, self.head_dim, g_bias=self.dt_bias)
        beta = self.b_proj(hidden_states).float().sigmoid()

        q, k = map(
            lambda x: rearrange(x, "... (h d) -> ... h d", d=self.head_k_dim), (q, k)
        )
        v = rearrange(v, "... (h d) -> ... h d", d=self.head_dim)

        if mode == "chunk":
            o, recurrent_state = chunk_kda(
                q=q,
                k=k,
                v=v,
                g=g,
                beta=beta,
                initial_state=recurrent_state,
                output_final_state=True,
                use_qk_l2norm_in_kernel=True,
                cu_seqlens=cu_seqlens,
            )
        else:
            o, recurrent_state = fused_recurrent_kda(
                q=q,
                k=k,
                v=v,
                g=g,
                beta=beta,
                initial_state=recurrent_state,
                output_final_state=True,
                use_qk_l2norm_in_kernel=True,
                cu_seqlens=cu_seqlens,
            )
        if cache_params is not None:
            cache_params.recurrent_states[self.layer_idx] = recurrent_state
            cache_params.conv_states[self.layer_idx] = (
                conv_state_q,
                conv_state_k,
                conv_state_v,
            )

        g = self.g_b_proj(self.g_a_proj(hidden_states))
        g = rearrange(g, "... (h d) -> ... h d", d=self.head_dim)
        o = self.o_norm(o, g)

        o = rearrange(o, "b t h d -> b t (h d)")
        o = self.o_proj(o)
        if attention_mask is not None:
            o = pad_input(o.squeeze(0), indices, batch_size, q_len)

        return o


class KimiMoEGate(nn.Module):
    """
    MoE Gate that returns router logits.
    Routing decisions are made in KimiSparseMoeBlock.
    """

    def __init__(self, config: KimiLinearConfig):
        super().__init__()
        self.config = config
        self.num_experts = config.num_experts
        self.gating_dim = config.hidden_size

        self.weight = nn.Parameter(torch.empty((self.num_experts, self.gating_dim)))
        self.e_score_correction_bias = nn.Parameter(torch.zeros((self.num_experts,)))
        self.reset_parameters()

    def reset_parameters(self) -> None:
        import torch.nn.init as init

        init.kaiming_uniform_(self.weight, a=math.sqrt(5))

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Args:
            hidden_states: [batch_size, seq_len, hidden_dim]

        Returns:
            router_logits: [batch_size * seq_len, num_experts]
        """
        _, _, h = hidden_states.shape
        hidden_states = hidden_states.view(-1, h)
        router_logits = F.linear(
            hidden_states.type(torch.float32), self.weight.type(torch.float32), None
        )
        return router_logits

    # def forward(self, hidden_states):
    #     bsz, seq_len, h = hidden_states.shape
    #     # compute gating score
    #     hidden_states = hidden_states.view(-1, h)
    #     logits = F.linear(
    #         hidden_states.type(torch.float32), self.weight.type(
    #             torch.float32), None
    #     )
    #     if self.moe_router_activation_func == "sigmoid":
    #         scores = logits.sigmoid()
    #     elif self.moe_router_activation_func == "softmax":
    #         scores = logits.softmax(dim=1)
    #     else:
    #         raise NotImplementedError(
    #             f"insupportable scoring function for MoE gating: {self.moe_router_activation_func}"
    #         )

    #     # select top-k experts
    #     assert not self.training
    #     scores_for_choice = scores.view(bsz * seq_len, -1)
    #     scores_for_choice += self.e_score_correction_bias.unsqueeze(0)
    #     group_scores = (
    #         scores_for_choice.view(
    #             bsz * seq_len, self.num_expert_group, -1).topk(2, dim=-1)[0].sum(dim=-1)
    #     )  # [n, num_expert_group]
    #     group_idx = torch.topk(
    #         group_scores, k=self.topk_group, dim=-1, sorted=False
    #     )[
    #         1
    #     ]  # [n, top_k_group]
    #     group_mask = torch.zeros_like(group_scores)  # [n, num_expert_group]
    #     group_mask.scatter_(1, group_idx, 1)  # [n, num_expert_group]
    #     score_mask = (
    #         group_mask.unsqueeze(-1)
    #         .expand(
    #             bsz * seq_len, self.num_expert_group, self.num_experts // self.num_expert_group
    #         )
    #         .reshape(bsz * seq_len, -1)
    #     )  # [n, e]
    #     tmp_scores = scores_for_choice.masked_fill(
    #         ~score_mask.bool(), 0.0)  # [n, e]
    #     _, topk_idx = torch.topk(
    #         tmp_scores, k=self.top_k, dim=-1, sorted=False
    #     )
    #     topk_weight = scores.gather(1, topk_idx)

    #     # norm gate to sum 1
    #     if self.top_k > 1 and self.moe_renormalize:
    #         denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
    #         topk_weight = topk_weight / denominator
    #     # must multiply the scaling factor
    #     topk_weight = topk_weight * self.routed_scaling_factor

    #     return topk_idx, topk_weight


# class KimiSparseMoeBlock(nn.Module):
#     """
#     Adapted from Deepseek-V3's MOE implementation
#     The namings are consistent with Kimi's version.
#     """

#     def __init__(self, config: KimiLinearConfig):
#         super().__init__()
#         self.config = config
#         self.hidden_dim = config.hidden_size
#         self.num_experts = config.num_experts
#         self.top_k = config.num_experts_per_token
#         self.moe_renormalize = config.moe_renormalize

#         self.ep_size = 1
#         self.experts_per_rank = config.num_experts
#         self.ep_rank = 0
#         self.experts = nn.ModuleList(
#             [
#                 KimiBlockSparseMLP(
#                     config, intermediate_size=config.moe_intermediate_size
#                 )
#                 for _ in range(config.num_experts)
#             ]
#         )
#         self.gate = KimiMoEGate(config)
#         if config.num_shared_experts is not None:
#             intermediate_size = config.moe_intermediate_size * config.num_shared_experts
#             self.shared_experts = KimiMLP(
#                 config=config, intermediate_size=intermediate_size
#             )

#     def forward(self, hidden_states):
#         identity = hidden_states
#         orig_shape = hidden_states.shape
#         topk_idx, topk_weight = self.gate(hidden_states)
#         hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
#         flat_topk_idx = topk_idx.view(-1)
#         if not self.training:
#             y = self.moe_infer(hidden_states, topk_idx,
#                                topk_weight).view(*orig_shape)
#         else:
#             raise NotImplementedError(
#                 "Training mode is not supported in KimiSparseMoeBlock")
#         if self.config.num_shared_experts is not None:
#             y = y + self.shared_experts(identity)
#         return y

#     @torch.no_grad()
#     def moe_infer(self, x, topk_ids, topk_weight):
#         cnts = topk_ids.new_zeros((topk_ids.shape[0], len(self.experts)))
#         cnts.scatter_(1, topk_ids, 1)
#         tokens_per_expert = cnts.sum(dim=0)
#         idxs = topk_ids.view(-1).argsort()
#         sorted_tokens = x[idxs // topk_ids.shape[1]]

#         tokens_per_expert = tokens_per_expert.cpu().numpy()

#         outputs = []
#         start_idx = 0
#         for i, num_tokens in enumerate(tokens_per_expert):
#             end_idx = start_idx + num_tokens
#             if num_tokens == 0:
#                 continue
#             expert = self.experts[i + self.ep_rank * self.experts_per_rank]
#             tokens_for_this_expert = sorted_tokens[start_idx:end_idx]
#             expert_out = expert(tokens_for_this_expert)
#             outputs.append(expert_out)
#             start_idx = end_idx

#         outs = torch.cat(outputs, dim=0) if len(
#             outputs) else sorted_tokens.new_empty(0)

#         new_x = torch.empty_like(outs)
#         new_x[idxs] = outs
#         final_out = (
#             new_x.view(*topk_ids.shape, -1)
#             .type(topk_weight.dtype)
#             .mul_(topk_weight.unsqueeze(dim=-1))
#             .sum(dim=1)
#             .type(new_x.dtype)
#         )
#         return final_out


# Replace the KimiSparseMoeBlock class with this new version
class KimiSparseMoeBlock(nn.Module):
    """
    MoE block adapted from Deepseek-V3.
    Returns only hidden_states - router_logits captured by OutputRecorder.
    """

    def __init__(self, config: KimiLinearConfig):
        super().__init__()
        self.config = config
        self.hidden_dim = config.hidden_size
        self.num_experts = config.num_experts
        self.top_k = config.num_experts_per_token
        self.moe_renormalize = config.moe_renormalize
        self.routed_scaling_factor = config.routed_scaling_factor
        self.num_expert_group = getattr(config, "num_expert_group", 1)
        self.topk_group = getattr(config, "topk_group", 1)

        self.experts = nn.ModuleList(
            [
                KimiBlockSparseMLP(
                    config, intermediate_size=config.moe_intermediate_size
                )
                for _ in range(config.num_experts)
            ]
        )
        self.gate = KimiMoEGate(config)

        if config.num_shared_experts is not None:
            intermediate_size = config.moe_intermediate_size * config.num_shared_experts
            self.shared_experts = KimiMLP(
                config=config, intermediate_size=intermediate_size
            )

    def route_tokens_to_experts(
        self,
        router_logits: torch.Tensor,
    ) -> tuple[torch.Tensor, torch.Tensor]:
        """
        Compute routing decisions from router logits.

        Args:
            router_logits: [num_tokens, num_experts]

        Returns:
            topk_idx: [num_tokens, top_k]
            topk_weight: [num_tokens, top_k]
        """
        num_tokens = router_logits.shape[0]

        if self.training:
            # Training: use softmax for standard aux loss compatibility
            scores = F.softmax(router_logits, dim=-1, dtype=torch.float32)
            topk_weight, topk_idx = torch.topk(scores, self.top_k, dim=-1, sorted=False)
        else:
            # Inference: use original sigmoid + group selection
            scores = router_logits.sigmoid()
            scores_for_choice = scores + self.gate.e_score_correction_bias.unsqueeze(0)

            # Group-based selection
            group_scores = (
                scores_for_choice.view(num_tokens, self.num_expert_group, -1)
                .topk(2, dim=-1)[0]
                .sum(dim=-1)
            )
            group_idx = torch.topk(
                group_scores, k=self.topk_group, dim=-1, sorted=False
            )[1]
            group_mask = torch.zeros_like(group_scores)
            group_mask.scatter_(1, group_idx, 1)
            score_mask = (
                group_mask.unsqueeze(-1)
                .expand(
                    num_tokens,
                    self.num_expert_group,
                    self.num_experts // self.num_expert_group,
                )
                .reshape(num_tokens, -1)
            )
            tmp_scores = scores_for_choice.masked_fill(~score_mask.bool(), 0.0)
            _, topk_idx = torch.topk(tmp_scores, k=self.top_k, dim=-1, sorted=False)
            topk_weight = scores.gather(1, topk_idx)

        # Normalize and scale
        if self.top_k > 1 and self.moe_renormalize:
            denominator = topk_weight.sum(dim=-1, keepdim=True) + 1e-20
            topk_weight = topk_weight / denominator
        topk_weight = topk_weight * self.routed_scaling_factor

        return topk_idx, topk_weight

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Forward pass returning only hidden_states.
        Router logits are captured by OutputRecorder for aux loss.
        """
        identity = hidden_states
        batch_size, seq_len, hidden_dim = hidden_states.shape
        num_tokens = batch_size * seq_len

        # Flatten for routing
        hidden_states_flat = hidden_states.view(num_tokens, hidden_dim)

        # Get router logits - OutputRecorder captures this!
        router_logits = self.gate(hidden_states)

        # Get routing decisions
        topk_idx, topk_weight = self.route_tokens_to_experts(router_logits)

        if self.training:
            final_hidden_states = self._training_forward(
                hidden_states_flat, topk_idx, topk_weight, num_tokens, hidden_dim
            )
        else:
            final_hidden_states = self._inference_forward(
                hidden_states_flat, topk_idx, topk_weight
            )

        final_hidden_states = final_hidden_states.view(batch_size, seq_len, hidden_dim)

        # Add shared experts if present
        if self.config.num_shared_experts is not None:
            final_hidden_states = final_hidden_states + self.shared_experts(identity)

        return final_hidden_states

    def _training_forward(
        self,
        hidden_states: torch.Tensor,
        topk_idx: torch.Tensor,
        topk_weight: torch.Tensor,
        num_tokens: int,
        hidden_dim: int,
    ) -> torch.Tensor:
        """
        Differentiable training forward using scatter-gather pattern.
        """
        # Flatten expert indices: [num_tokens * top_k]
        flat_topk_idx = topk_idx.view(-1)

        # Sort by expert index to group tokens going to same expert
        sorted_indices = torch.argsort(flat_topk_idx)
        inverse_permutation = torch.argsort(sorted_indices)

        # Each token appears top_k times (once per expert choice)
        token_indices = torch.arange(
            num_tokens, device=hidden_states.device
        ).repeat_interleave(self.top_k)

        # Gather tokens and weights in sorted order
        shuffled_tokens = hidden_states[token_indices[sorted_indices]]
        shuffled_weights = topk_weight.view(-1)[sorted_indices].unsqueeze(-1)

        # Count tokens per expert
        tokens_per_expert = F.one_hot(flat_topk_idx, num_classes=self.num_experts).sum(
            dim=0
        )

        # Process each expert's batch
        expert_outputs = []
        current_pos = 0
        for i in range(self.num_experts):
            num_tokens_for_expert = tokens_per_expert[i].item()
            if num_tokens_for_expert == 0:
                continue

            expert_input = shuffled_tokens[
                current_pos : current_pos + num_tokens_for_expert
            ]
            expert_output = self.experts[i](expert_input)
            expert_outputs.append(expert_output)
            current_pos += num_tokens_for_expert

        # Concatenate all outputs
        if expert_outputs:
            concatenated_outputs = torch.cat(expert_outputs, dim=0)
        else:
            concatenated_outputs = torch.zeros(
                num_tokens * self.top_k,
                hidden_dim,
                device=hidden_states.device,
                dtype=hidden_states.dtype,
            )

        # Apply weights while still in sorted order
        weighted_outputs = concatenated_outputs * shuffled_weights

        # Unsort back to original token order
        unshuffled_outputs = weighted_outputs[inverse_permutation]

        # Sum contributions from all top_k experts for each token
        final_hidden_states = unshuffled_outputs.view(
            num_tokens, self.top_k, hidden_dim
        ).sum(dim=1)

        return final_hidden_states

    @torch.no_grad()
    def _inference_forward(
        self,
        hidden_states: torch.Tensor,
        topk_idx: torch.Tensor,
        topk_weight: torch.Tensor,
    ) -> torch.Tensor:
        """
        Optimized inference forward (original implementation).
        """
        cnts = topk_idx.new_zeros((topk_idx.shape[0], len(self.experts)))
        cnts.scatter_(1, topk_idx, 1)
        tokens_per_expert = cnts.sum(dim=0)
        idxs = topk_idx.view(-1).argsort()
        sorted_tokens = hidden_states[idxs // topk_idx.shape[1]]

        tokens_per_expert_list = tokens_per_expert.cpu().numpy()

        outputs = []
        start_idx = 0
        for i, num_tokens in enumerate(tokens_per_expert_list):
            end_idx = start_idx + num_tokens
            if num_tokens == 0:
                continue
            expert = self.experts[i]
            tokens_for_expert = sorted_tokens[start_idx:end_idx]
            expert_out = expert(tokens_for_expert)
            outputs.append(expert_out)
            start_idx = end_idx

        outs = torch.cat(outputs, dim=0) if outputs else sorted_tokens.new_empty(0)

        new_x = torch.empty_like(outs)
        new_x[idxs] = outs
        final_out = (
            new_x.view(*topk_idx.shape, -1)
            .type(topk_weight.dtype)
            .mul_(topk_weight.unsqueeze(dim=-1))
            .sum(dim=1)
            .type(new_x.dtype)
        )
        return final_out


class KimiDecoderLayer(nn.Module):
    def __init__(self, config: KimiLinearConfig, layer_idx: int):
        super().__init__()
        self.hidden_size = config.hidden_size
        self.config = config
        if config.is_kda_layer(layer_idx):
            self.is_linear_attn = True
            self.self_attn = KimiDeltaAttention(config=config, layer_idx=layer_idx)
        elif config.is_mla:
            self.is_linear_attn = False
            self.self_attn = KimiMLAAttention(config=config, layer_idx=layer_idx)
        else:
            raise NotImplementedError
        if (
            config.num_experts is not None
            and layer_idx >= config.first_k_dense_replace
            and layer_idx % getattr(config, "moe_layer_freq", 1) == 0
        ):
            self.block_sparse_moe = KimiSparseMoeBlock(config)
        else:
            self.mlp = KimiMLP(config)
        self.input_layernorm = KimiRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
        self.post_attention_layernorm = KimiRMSNorm(
            config.hidden_size, eps=config.rms_norm_eps
        )

    def forward(
        self,
        hidden_states: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[torch.Tensor]] = None,
        output_attentions: Optional[bool] = False,
        use_cache: Optional[bool] = False,
        **kwargs: Unpack[FlashAttentionKwargs],
    ) -> Tuple[
        torch.FloatTensor, Optional[Tuple[torch.FloatTensor, torch.FloatTensor]]
    ]:
        """
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
            use_cache (`bool`, *optional*):
                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
                (see `past_key_values`).
            past_key_value (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
        """

        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Self Attention
        if self.is_linear_attn is False:
            hidden_states = self.self_attn(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                **kwargs,
            )
        else:
            hidden_states = self.self_attn(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                cache_params=past_key_values,
                output_attentions=output_attentions,
                use_cache=use_cache,
                **kwargs,
            )
        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        if hasattr(self, "block_sparse_moe"):
            hidden_states = self.block_sparse_moe(hidden_states)
        else:
            hidden_states = self.mlp(hidden_states)
        hidden_states = residual + hidden_states

        return hidden_states


class KimiPreTrainedModel(PreTrainedModel):
    config_class = KimiLinearConfig
    base_model_prefix = "model"
    supports_gradient_checkpointing = True
    _no_split_modules = ["KimiDecoderLayer"]
    _skip_keys_device_placement = "past_key_values"
    _supports_flash_attn_2 = True
    _can_record_outputs = {
        "router_logits": OutputRecorder(KimiMoEGate, index=0),
        "hidden_states": KimiDecoderLayer,
        "attentions": KimiMLAAttention,
    }
    _is_stateful = True

    def _init_weights(self, module):
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()


class KimiLinearModel(KimiPreTrainedModel):
    def __init__(self, config: KimiLinearConfig):
        super().__init__(config)
        self.padding_idx = config.pad_token_id
        self.vocab_size = config.vocab_size

        self.embed_tokens = nn.Embedding(
            config.vocab_size, config.hidden_size, self.padding_idx
        )
        self.layers = nn.ModuleList(
            [
                KimiDecoderLayer(config, layer_idx)
                for layer_idx in range(config.num_hidden_layers)
            ]
        )
        self.norm = KimiRMSNorm(config.hidden_size, eps=config.rms_norm_eps)

        if getattr(config, "_attn_implementation", None) is not None:
            if config._attn_implementation != "flash_attention_2":
                logger.warning_once(
                    f"Ignoring the provided attention implementation {config._attn_implementation}"
                )
                logger.warning_once("Using flash_attention_2 backend instead.")
                config._attn_implementation = "flash_attention_2"
        else:
            config._attn_implementation = "flash_attention_2"

        self._use_flash_attention_2 = config._attn_implementation == "flash_attention_2"
        self.gradient_checkpointing = False
        # Initialize weights and apply final processing
        self.post_init()

    def _update_linear_attn_mask(self, attention_mask, cache_position):
        """
        NOTE: Left-padding is used for linear attention mask.
        No need for zeroing states when
            1. Cached forward
            2. Attending to all inputs
        """
        linear_attn_mask = attention_mask
        if cache_position[0] > 0 or (
            attention_mask is not None and torch.all(attention_mask == 1)
        ):
            linear_attn_mask = None
        return linear_attn_mask

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        cache_position: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        use_cache = use_cache if use_cache is not None else self.config.use_cache

        if (input_ids is None) and (inputs_embeds is None):
            raise ValueError(
                "You must specify exactly one of input_ids or inputs_embeds"
            )

        # Get inputs_embeds
        if inputs_embeds is None:
            inputs_embeds = self.embed_tokens(input_ids)

        if use_cache and past_key_values is None:
            past_key_values = KimiDynamicCache(config=self.config)

        if cache_position is None:
            past_seen_tokens = (
                past_key_values.get_seq_length() if past_key_values is not None else 0
            )
            cache_position: torch.Tensor = torch.arange(
                past_seen_tokens,
                past_seen_tokens + inputs_embeds.shape[1],
                device=inputs_embeds.device,
            )

        if position_ids is None:
            position_ids = cache_position.unsqueeze(0)

        causal_mask = create_causal_mask(
            config=self.config,
            input_embeds=inputs_embeds,
            attention_mask=attention_mask,
            cache_position=cache_position,
            past_key_values=past_key_values,
            position_ids=position_ids,
        )
        linear_attn_mask = self._update_linear_attn_mask(attention_mask, cache_position)

        hidden_states = inputs_embeds
        if past_key_values is not None:
            assert isinstance(past_key_values, KimiDynamicCache)

        for decoder_layer in self.layers:
            layer_mask = (
                linear_attn_mask if decoder_layer.is_linear_attn else causal_mask
            )

            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=layer_mask,
                past_key_values=past_key_values,
                cache_position=cache_position,
                **kwargs,
            )

        hidden_states = self.norm(hidden_states)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            past_key_values=past_key_values,
        )


class KimiLinearForCausalLM(KimiPreTrainedModel, GenerationMixin):
    _tied_weights_keys = ["lm_head.weight"]

    def __init__(self, config):
        super().__init__(config)
        self.model = KimiLinearModel(config)
        self.vocab_size = config.vocab_size
        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)

        # Initialize weights and apply final processing
        self.post_init()

    @can_return_tuple
    def forward(
        self,
        input_ids: torch.LongTensor = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[List[torch.FloatTensor]] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        generation_mode: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs: Unpack[TransformersKwargs],
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        r"""
        Args:
            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Returns:

        Example:

        ```python
        >>> from transformers import AutoTokenizer, KimiLinearForCausalLM

        >>> model = KimiLinearForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER)

        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
        ```"""
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            use_cache=use_cache,
            cache_position=cache_position,
            **kwargs,
        )

        logits = outputs[0]
        if generation_mode:
            logits = logits[:, -1:]
        logits = self.lm_head(logits)

        loss = None
        if labels is not None:
            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)

        aux_loss = None
        if kwargs.get("output_router_logits", False):
            aux_loss = load_balancing_loss_func(
                outputs.router_logits,
                num_experts=self.config.num_experts,
                top_k=self.config.num_experts_per_token,
                attention_mask=attention_mask,
            )
            if loss is not None:
                loss = loss + self.config.router_aux_loss_coef * aux_loss

        return MoeCausalLMOutputWithPast(
            loss=loss,
            aux_loss=aux_loss,
            logits=logits,
            past_key_values=outputs.past_key_values,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )


================================================
FILE: src/axolotl/monkeypatch/models/kimi_linear/patch_kimi_linear.py
================================================
import importlib.resources
import importlib.util
import sys
from pathlib import Path

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

KIMI_PATCH_PACKAGE = "axolotl.monkeypatch.models.kimi_linear"


def get_patch_file_path(package_dot_path: str, filename: str) -> Path:
    """
    Gets the absolute path to a patch file using importlib.resources.files.
    """
    try:
        return importlib.resources.files(package_dot_path) / filename
    except ModuleNotFoundError:
        return None


def _load_local_module(module_name: str, filename: str):
    """Helper to load a local module if not already loaded."""
    if module_name in sys.modules:
        return sys.modules[module_name]

    patch_path = get_patch_file_path(KIMI_PATCH_PACKAGE, filename)
    if patch_path and patch_path.exists():
        spec = importlib.util.spec_from_file_location(module_name, patch_path)
        module = importlib.util.module_from_spec(spec)
        sys.modules[module_name] = module
        spec.loader.exec_module(module)
        return module
    return None


def _patch_get_class_in_module():
    """
    Core patch function that hijacks Transformers' dynamic module loading.
    """
    from transformers.dynamic_module_utils import get_class_in_module

    if hasattr(get_class_in_module, "_axolotl_patched"):
        return

    original_get_class_in_module = get_class_in_module

    # Mapping of module path patterns to (module_name, filename)
    KIMI_MODULE_MAP = {
        "configuration_kimi": ("configuration_kimi", "configuration_kimi.py"),
        "modeling_kimi": ("modeling_kimi", "modeling_kimi.py"),
        "tokenization_kimi": ("tokenization_kimi", "tokenization_kimi.py"),
    }

    def patched_get_class_in_module(class_name, module_path, **kwargs):
        """Patched version that returns our local modules instead of remote ones."""
        for pattern, (module_name, filename) in KIMI_MODULE_MAP.items():
            if pattern in module_path:
                module = _load_local_module(module_name, filename)
                if module:
                    return getattr(module, class_name)
                break  # Pattern matched but file not found, fall through

        return original_get_class_in_module(class_name, module_path, **kwargs)

    import transformers.dynamic_module_utils

    transformers.dynamic_module_utils.get_class_in_module = patched_get_class_in_module
    patched_get_class_in_module._axolotl_patched = True


def patch_kimi():
    """
    Apply all Kimi patches.
    Must be called BEFORE loading config/tokenizer/model.
    """
    _patch_get_class_in_module()
    LOG.info("Kimi patches applied successfully!")


# Keep these for backward compatibility if needed
patch_kimi_config = patch_kimi
patch_kimi_tokenizer = patch_kimi
patch_kimi_model = patch_kimi


================================================
FILE: src/axolotl/monkeypatch/models/kimi_linear/tokenization_kimi.py
================================================
"""
Adapted Kimi-Linear tokenizer to use proper template defaults and misc fixes.

Source: https://huggingface.co/moonshotai/Kimi-Linear-48B-A3B-Instruct/blob/main/tokenization_kimi.py
Revision: 919416f
"""

import os
from logging import getLogger
from pathlib import Path
from shutil import copyfile
from typing import (
    Any,
    Dict,
    Iterator,
    List,
    Optional,
    Tuple,
    Union,
    cast,
)

import tiktoken
from tiktoken.load import load_tiktoken_bpe
from tokenizers import AddedToken
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode
from transformers.tokenization_utils import PreTrainedTokenizer

logger = getLogger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}


class TikTokenTokenizer(PreTrainedTokenizer):
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            The path to the Tiktoken model file.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|begin_of_text|>",`):
            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|end_of_text|>"`):
            The end of sequence token.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_249|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead. The second to last item in special_tokens.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_250|>"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (list of `str`, *optional*):
            A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
            skipped when decoding if `skip_special_tokens` is set to `True`.
    """

    vocab_files_names = VOCAB_FILES_NAMES

    model_input_names = ["input_ids", "attention_mask"]

    special_tokens: Dict[str, int]

    num_reserved_special_tokens = 256

    pat_str = "|".join(
        [
            r"""[\p{Han}]+""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""\p{N}{1,3}""",
            r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
            r"""\s*[\r\n]+""",
            r"""\s+(?!\S)""",
            r"""\s+""",
        ]
    )

    def __init__(
        self,
        vocab_file,
        bos_token: Union[str, AddedToken] = "[BOS]",  # nosec: B107
        eos_token: Union[str, AddedToken] = "[EOS]",  # nosec: B107
        unk_token: Union[str, AddedToken, None] = None,
        pad_token: Union[str, AddedToken, None] = None,
        additional_special_tokens: List[str] = None,
        added_tokens_decoder: Optional[dict] = None,
        **kwargs,
    ):
        assert os.path.isfile(vocab_file), vocab_file

        if additional_special_tokens is None:
            additional_special_tokens = [
                "<|im_end|>",
                "<|im_user|>",
                "<|im_assistant|>",
                "<|start_header_id|>",
                "<|end_header_id|>",
                "[EOT]",
                "<|im_system|>",
                "<|im_middle|>",
            ]

        special_tokens_mapping = {
            i: added_tokens_decoder[i].content for i in added_tokens_decoder
        }

        self.vocab_file = vocab_file
        mergeable_ranks = load_tiktoken_bpe(vocab_file)
        num_base_tokens = len(mergeable_ranks)
        self.special_tokens = {
            special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
            for i in range(
                num_base_tokens, num_base_tokens + self.num_reserved_special_tokens + 2
            )
        }

        self.model = tiktoken.Encoding(
            name=Path(vocab_file).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )
        logger.info(f"Reloaded tiktoken model from {vocab_file}")

        self.n_words: int = self.model.n_vocab
        # BOS / EOS token IDs
        self.bos_id: int = self.special_tokens[str(bos_token)]
        self.eos_id: int = self.special_tokens[str(eos_token)]
        logger.info(
            f"#words: {self.n_words} - BOS ID: {self.bos_id} - EOS ID: {self.eos_id}"
        )

        self.pad_id: int = self.special_tokens[str(pad_token)]
        self.unk_id: int = self.special_tokens[str(unk_token)]

        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

        self.decoder = {}
        for i in range(self.n_words):
            # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
            decoding = "".join(
                [
                    self.byte_encoder[ord(char)]
                    for char in self.model.decode_single_token_bytes(i).decode(
                        "latin-1"
                    )
                ]
            )
            self.decoder[i] = decoding

        self.encoder = {}
        for i in range(self.n_words):
            if i in self.decoder:
                self.encoder[self.decoder[i]] = i

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
        self.all_special_ids_set = set(self.all_special_ids)

    def encode(
        self, text: str, allow_special_tokens: bool = True, **kwargs
    ) -> List[int]:
        """
        Encodes a string into a list of token IDs.

        Args:
            text (str): The input string to be encoded.

        Returns:
            list[int]: A list of token IDs.
        """
        # If there are other args, we should call super().encode because there are a lot of code
        # to handle those args. supper().encode finally will call _tokenize and _convert_token_to_id.
        # NOTE: our encode method is not compatible with the super().encode method,
        #   e.g. split_special_tokens' default is True in our encode method.
        if len(kwargs) > 0:
            # logger.warning(f"Calling super().encode with {kwargs}")
            return super().encode(text, **kwargs)

        assert type(text) is str

        # The tiktoken tokenizer can handle <=400k chars without
        # pyo3_runtime.PanicException.
        TIKTOKEN_MAX_ENCODE_CHARS = 400_000

        # https://github.com/openai/tiktoken/issues/195
        # Here we iterate over subsequences and split if we exceed the limit
        # of max consecutive non-whitespace or whitespace characters.
        MAX_NO_WHITESPACES_CHARS = 25_000

        texts = self.pre_tokenizer_process(text)

        all_substrs = []
        for text in texts:
            substrs = (
                substr
                for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
                for substr in self._split_whitespaces_or_nonwhitespaces(
                    text[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
                )
            )
            all_substrs.extend(substrs)

        t: List[int] = []
        for substr in all_substrs:
            if allow_special_tokens:
                t.extend(
                    # we should consider special token as a common token
                    self.model.encode(
                        substr,
                        allowed_special="all",
                    )
                )
            else:
                t.extend(
                    # we should consider special token as a common token
                    self.model.encode(
                        substr,
                        disallowed_special=(),
                    )
                )

        return t

    def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
        """
        Decodes a list of token IDs into a string.

        Args:
            token_ids (List[int]): The list of token IDs to be decoded.

        Returns:
            str: The decoded string.
        """
        # If there are other args, we should call super().decode because there are a lot of code
        # to handle those args. supper().encode finally will call convert_tokens_to_string and _convert_id_to_token.
        if len(kwargs) > 0:
            return super().decode(token_ids, **kwargs)

        if type(token_ids) is int:
            token_ids = [token_ids]

        return self.model.decode(cast(List[int], token_ids))

    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        """
        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
        consecutive whitespaces or consecutive non-whitespaces.
        """
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
        slice_start = 0

        for i in range(len(s)):
            is_now_space = s[i].isspace()

            if current_slice_is_space ^ is_now_space:
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]

    def pre_tokenizer_process(self, text: str) -> List[str]:
        """
        pre-tokenizes the input text into a list of tokens.
        This method is used to split the input text into smaller chunks for internal processing.
        """
        return [text]

    """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """

    @property
    def vocab_size(self) -> int:
        return self.n_words

    def get_vocab(self) -> Dict[str, int]:
        return self.encoder

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        return [self.decoder[t] for t in self.encode(text)]

    def _convert_token_to_id(self, token: str) -> int:
        return self.encoder.get(token, self.unk_id)

    def _convert_id_to_token(self, index: int) -> str:
        return self.decoder.get(index)

    @staticmethod
    def clean_up_tokenization(out_string: str) -> str:
        return out_string

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        text = "".join(tokens)
        text = bytearray([self.byte_decoder[c] for c in text]).decode(
            "utf-8", "replace"
        )
        return text

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            raise ValueError(
                f"vocabulary path ({save_directory}) should be a directory"
            )
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "")
            + VOCAB_FILES_NAMES["vocab_file"],
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(
            out_vocab_file
        ) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

    def apply_chat_template(
        self,
        conversation,
        tools: Optional[list[dict]] = None,
        tokenize: bool = True,
        add_generation_prompt: bool = False,
        **kwargs,
    ):
        tools = deep_sort_dict(tools)
        return super().apply_chat_template(
            conversation,
            tools=tools,
            tokenize=tokenize,
            add_generation_prompt=add_generation_prompt,
            **kwargs,
        )


def deep_sort_dict(obj: Any) -> Any:
    if isinstance(obj, dict):
        return {k: deep_sort_dict(v) for k, v in sorted(obj.items())}
    if isinstance(obj, list):
        return [deep_sort_dict(item) for item in obj]
    return obj


================================================
FILE: src/axolotl/monkeypatch/models/llama4/__init__.py
================================================


================================================
FILE: src/axolotl/monkeypatch/models/llama4/modeling.py
================================================
"""
Modified Llama-4 text experts modeling for linearized experts for improved LoRA support
"""

import sys

import torch
from torch import nn
from transformers import Llama4Config
from transformers.activations import ACT2FN


class Llama4TextExperts(nn.Module):
    """
    Modified Llama-4 text experts modeling for linearized experts
    """

    def __init__(self, config: Llama4Config):
        super().__init__()
        self.num_experts = config.num_local_experts
        self.intermediate_size = config.intermediate_size
        self.hidden_size = config.hidden_size
        self.expert_dim = self.intermediate_size

        # Replace fused gate_up_proj with separate Linear modules
        self.gate_projs = nn.ModuleList(
            [
                nn.Linear(self.hidden_size, self.expert_dim, bias=False)
                for _ in range(self.num_experts)
            ]
        )

        self.up_projs = nn.ModuleList(
            [
                nn.Linear(self.hidden_size, self.expert_dim, bias=False)
                for _ in range(self.num_experts)
            ]
        )

        # Replace down_proj Parameter with Linear modules
        self.down_projs = nn.ModuleList(
            [
                nn.Linear(self.expert_dim, self.hidden_size, bias=False)
                for _ in range(self.num_experts)
            ]
        )

        self.act_fn = ACT2FN[config.hidden_act]

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Forward method using separate Linear layers for each expert.

        Args:
            hidden_states (torch.Tensor): (num_experts * batch_size, hidden_size)
                The input should be organized by expert

        Returns:
            torch.Tensor: (num_experts * batch_size, hidden_size)
        """
        # Reshape to separate by expert
        hidden_states = hidden_states.view(self.num_experts, -1, self.hidden_size)
        # batch_size_per_expert = hidden_states.size(1)

        # Initialize output tensor
        next_states = torch.zeros_like(hidden_states)

        # Process each expert separately
        for i in range(self.num_experts):
            # Get input for this expert
            expert_input = hidden_states[
                i
            ]  # Shape: (batch_size_per_expert, hidden_size)

            # Apply gate and up projections
            gate = self.gate_projs[i](
                expert_input
            )  # Shape: (batch_size_per_expert, expert_dim)
            up = self.up_projs[i](
                expert_input
            )  # Shape: (batch_size_per_expert, expert_dim)

            # Apply activation and down projection
            next_states[i] = self.down_projs[i](up * self.act_fn(gate))

        # Flatten back to original shape
        return next_states.view(-1, self.hidden_size)


def patch_llama4_linearized_modeling():
    """
    Patch Llama4TextExperts to use separate Linear layers for each expert.
    """
    from transformers.models.llama4 import modeling_llama4

    old_lamma_4_text_experts = modeling_llama4.Llama4TextExperts
    modeling_llama4.Llama4TextExperts = Llama4TextExperts
    sys.modules["transformers.models.llama4"].Llama4TextExperts = Llama4TextExperts

    def unpatch():
        modeling_llama4.Llama4TextExperts = old_lamma_4_text_experts
        sys.modules[
            "transformers.models.llama4"
        ].Llama4TextExperts = old_lamma_4_text_experts

    return unpatch


================================================
FILE: src/axolotl/monkeypatch/models/mistral3/__init__.py
================================================


================================================
FILE: src/axolotl/monkeypatch/models/mistral3/mistral_common_tokenizer.py
================================================
"""
Monkeypatch to fix inefficient tensor conversion in MistralCommonBackend.apply_chat_template
"""

import importlib
import inspect

from axolotl.monkeypatch.utils import detab_code
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def apply_mistral_tokenizer_image_patch():
    """Apply patch to MistralCommonBackend.apply_chat_template to fix image tensor conversion."""
    from transformers.tokenization_mistral_common import MistralCommonBackend

    # Get original source
    original_source = inspect.getsource(MistralCommonBackend.apply_chat_template)
    original_source, _ = detab_code(original_source)

    # Define the replacement
    original_tensor_conversion = (
        "                    pixel_values = torch.tensor(images)"
    )

    patched_tensor_conversion = """                    if isinstance(images, list) and len(images) > 0 and isinstance(images[0], np.ndarray):
                        pixel_values = torch.tensor(np.array(images))
                    else:
                        pixel_values = torch.tensor(images)"""

    # Apply the replacement
    if original_tensor_conversion in original_source:
        patched_source = original_source.replace(
            original_tensor_conversion, patched_tensor_conversion
        )
        patched_source = patched_source.replace(
            "def apply_chat_template(",
            "def patched_apply_chat_template(",
            1,
        )

        # Load necessary imports from the module
        module_name = MistralCommonBackend.__module__
        module = importlib.import_module(module_name)

        # Detect what needs to be imported
        items_to_import = []
        for item in dir(module):
            if item in patched_source and not item.startswith("_"):
                items_to_import.append(item)

        # Execute imports in global scope
        if items_to_import:
            exec(  # nosec B102
                f"from {module_name} import ({', '.join(items_to_import)})",
                globals(),
            )

        # Also need standard imports that might be used
        exec("import numpy as np", globals())  # nosec B102
        exec("import torch", globals())  # nosec B102
        exec("from typing import Union, Optional, List, Dict, Any, Callable", globals())  # nosec B102
        exec("from pathlib import Path", globals())  # nosec B102

        # Import other dependencies that might be needed
        try:
            exec("from transformers.utils import is_torch_available", globals())  # nosec B102
            exec(
                "from transformers.tokenization_utils_base import BatchEncoding, PaddingStrategy, TensorType",
                globals(),
            )  # nosec B102
            exec("from transformers.utils import logging", globals())  # nosec B102
            exec("logger = logging.get_logger(__name__)", globals())  # nosec B102
        except ImportError as e:
            LOG.warning(f"Could not import some dependencies: {e}")

        # Execute the patched source
        exec(patched_source, globals())  # nosec B102

        # Replace the method
        MistralCommonBackend.apply_chat_template = patched_apply_chat_template
        LOG.info("Successfully applied MistralCommonBackend tensor conversion patch")
    else:
        LOG.warning("Could not find target code for MistralCommonBackend patching")


================================================
FILE: src/axolotl/monkeypatch/models/pixtral/__init__.py
================================================


================================================
FILE: src/axolotl/monkeypatch/models/pixtral/modeling_flash_attention_utils.py
================================================
"""Monkeypatch for FA utils to accept 1D position_ids from Pixtral's position_ids_in_meshgrid"""

import torch


def apply_patch_is_packed_sequence():
    """Apply patch to FA utils to accept 1D position_ids from Pixtral's position_ids_in_meshgrid"""
    from transformers import modeling_flash_attention_utils

    def fixed_is_packed_sequence(position_ids, batch_size):
        """
        Check the position ids whether packed sequences are indicated or not
            1. Position ids exist
            2. Flattened sequences only are supported
            3. Compile-friendly `not (torch.diff(position_ids, dim=-1) >= 0).all()`, i.e. we have multiple increasing sequences
        """
        if position_ids is None:
            return False

        if position_ids.ndim == 1:
            position_ids = position_ids.unsqueeze(0)  # [N] -> [1, N]

        increasing_position_sequences = (
            torch.arange(position_ids.shape[1], device=position_ids.device)
            + position_ids.min()
        )
        return (
            batch_size == 1
            and (increasing_position_sequences - position_ids).abs().sum().bool().item()
        )

    # Store original method
    old_fn = modeling_flash_attention_utils._is_packed_sequence

    # Apply the patch
    modeling_flash_attention_utils._is_packed_sequence = fixed_is_packed_sequence

    def unpatch():
        """Restore the original method"""
        modeling_flash_attention_utils._is_packed_sequence = old_fn

    return unpatch


================================================
FILE: src/axolotl/monkeypatch/models/qwen3_5/__init__.py
================================================


================================================
FILE: src/axolotl/monkeypatch/models/qwen3_5/modeling.py
================================================
"""Monkeypatch for Qwen3_5 and Qwen3_5Moe models to pass position_ids to linear attention."""

import importlib
from typing import Optional, Tuple

import torch
import torch.nn.functional as F

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

try:
    from fla.modules.convolution import (
        causal_conv1d as fla_causal_conv1d,  # FLA >= 0.4.1
    )
except ImportError:
    try:
        from fla.modules.conv import causal_conv1d as fla_causal_conv1d  # FLA < 0.4.1
    except ImportError:
        fla_causal_conv1d = None


def get_cu_seqlens(position_ids):
    """
    Compute cumulative sequence lengths from position_ids for FLA varlen kernels.

    Adapted from transformers.modeling_flash_attention_utils.prepare_fa_kwargs_from_position_ids.
    https://github.com/huggingface/transformers/blob/0f1b128d3359a26bd18be99c26d7f04fb3cba914/src/transformers/modeling_flash_attention_utils.py#L316

    Qwen3.5 uses MRoPE: position_ids arrive as [axes, B, T]. All axes carry the
    same temporal positions, so axis 0 is used to recover the [B, T] layout.
    See: https://github.com/huggingface/transformers/blob/main/src/transformers/models/qwen3_5/modeling_qwen3_5.py
    """
    if position_ids.ndim == 3:
        position_ids = position_ids[0]

    tensor_kwargs = {"dtype": torch.int32, "device": position_ids.device}
    position_ids = position_ids.view(-1)
    indices_q = (position_ids == 0).nonzero().view(-1)
    return torch.cat(
        (
            indices_q.to(**tensor_kwargs),
            torch.tensor(position_ids.size(), **tensor_kwargs),
        )
    )


def _inject_fla_kernels(module) -> None:
    """Inject FLA kernels into a modeling module, bypassing is_flash_linear_attention_available."""
    try:
        from fla.modules import FusedRMSNormGated
        from fla.ops.gated_delta_rule import (
            chunk_gated_delta_rule,
            fused_recurrent_gated_delta_rule,
        )

        module.FusedRMSNormGated = FusedRMSNormGated
        module.chunk_gated_delta_rule = chunk_gated_delta_rule
        module.fused_recurrent_gated_delta_rule = fused_recurrent_gated_delta_rule
        module.is_fast_path_available = True
    except ImportError:
        module.chunk_gated_delta_rule = None
        module.fused_recurrent_gated_delta_rule = None
        module.FusedRMSNormGated = None


def _patched_decoder_forward(
    self,
    hidden_states: torch.Tensor,
    position_embeddings: Tuple[torch.Tensor, torch.Tensor],
    attention_mask: Optional[torch.Tensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values=None,
    cache_position: Optional[torch.LongTensor] = None,
    **kwargs,
) -> torch.FloatTensor:
    """Decoder layer forward that passes position_ids through to linear attention."""
    residual = hidden_states
    hidden_states = self.input_layernorm(hidden_states)

    if self.layer_type == "linear_attention":
        hidden_states = self.linear_attn(
            hidden_states=hidden_states,
            cache_params=past_key_values,
            cache_position=cache_position,
            attention_mask=attention_mask,
            position_ids=position_ids,
        )
    elif self.layer_type == "full_attention":
        hidden_states, _ = self.self_attn(
            hidden_states=hidden_states,
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            cache_position=cache_position,
            position_embeddings=position_embeddings,
            **kwargs,
        )

    hidden_states = residual + hidden_states

    residual = hidden_states
    hidden_states = self.post_attention_layernorm(hidden_states)
    hidden_states = self.mlp(hidden_states)
    if isinstance(hidden_states, tuple):  # MoE returns (hidden_states, router_logits)
        hidden_states, _ = hidden_states
    hidden_states = residual + hidden_states

    return hidden_states


def _make_qwen3_5_gated_delta_forward(apply_mask_fn):
    """Factory for patched Qwen3_5/Qwen3_5Moe GatedDeltaNet forward with packing support."""

    def patched_forward(
        self,
        hidden_states: torch.Tensor,
        cache_params=None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ):
        hidden_states = apply_mask_fn(hidden_states, attention_mask)

        batch_size, seq_len, _ = hidden_states.shape

        use_precomputed_states = (
            cache_params is not None
            and cache_params.has_previous_state
            and seq_len == 1
            and cache_position is not None
        )

        cu_seqlens = None
        if not use_precomputed_states and position_ids is not None:
            cu_seqlens = get_cu_seqlens(position_ids=position_ids)

        if cache_params is not None:
            conv_state = cache_params.conv_states[self.layer_idx]
            recurrent_state = cache_params.recurrent_states[self.layer_idx]

        # mixed_qkv stays [B, T, D]; only transposed inside paths that require [B, D, T]
        mixed_qkv = self.in_proj_qkv(hidden_states)  # [B, T, D]

        z = self.in_proj_z(hidden_states)
        z = z.reshape(batch_size, seq_len, -1, self.head_v_dim)

        b = self.in_proj_b(hidden_states)
        a = self.in_proj_a(hidden_states)

        if use_precomputed_states:
            mixed_qkv = self.causal_conv1d_update(
                mixed_qkv.transpose(1, 2),
                conv_state,
                self.conv1d.weight.squeeze(1),
                self.conv1d.bias,
                self.activation,
            ).transpose(1, 2)
        else:
            if cache_params is not None:
                mixed_qkv_t = mixed_qkv.transpose(1, 2)
                cache_params.conv_states[self.layer_idx] = F.pad(
                    mixed_qkv_t,
                    (self.conv_kernel_size - mixed_qkv_t.shape[-1], 0),
                )

            if fla_causal_conv1d is not None and cu_seqlens is not None:
                # FLA varlen kernel for packed sequences; input must be contiguous [B, T, D]
                mixed_qkv, _ = fla_causal_conv1d(
                    x=mixed_qkv,
                    weight=self.conv1d.weight.squeeze(1),
                    bias=self.conv1d.bias,
                    activation=self.activation,
                    cu_seqlens=cu_seqlens,
                )
            else:
                if cu_seqlens is not None and fla_causal_conv1d is None:
                    raise RuntimeError(
                        "Packed sequences require fla.modules.convolution.causal_conv1d "
                        "(cu_seqlens support). Install flash-linear-attention or disable packing."
                    )
                mixed_qkv = F.silu(
                    self.conv1d(mixed_qkv.transpose(1, 2))[:, :, :seq_len]
                ).transpose(1, 2)

        query, key, value = torch.split(
            mixed_qkv,
            [self.key_dim, self.key_dim, self.value_dim],
            dim=-1,
        )
        query = query.reshape(batch_size, seq_len, -1, self.head_k_dim)
        key = key.reshape(batch_size, seq_len, -1, self.head_k_dim)
        value = value.reshape(batch_size, seq_len, -1, self.head_v_dim)

        beta = b.sigmoid()
        g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
        if self.num_v_heads // self.num_k_heads > 1:
            query = query.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)
            key = key.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)

        if not use_precomputed_states:
            core_attn_out, last_recurrent_state = self.chunk_gated_delta_rule(
                query,
                key,
                value,
                g=g.to(dtype=query.dtype),
                beta=beta,
                initial_state=None,
                output_final_state=cache_params is not None,
                use_qk_l2norm_in_kernel=True,
                # torch_chunk_gated_delta_rule fallback does not accept cu_seqlens
                **({"cu_seqlens": cu_seqlens} if cu_seqlens is not None else {}),
            )
        else:
            core_attn_out, last_recurrent_state = self.recurrent_gated_delta_rule(
                query,
                key,
                value,
                g=g.to(dtype=query.dtype),
                beta=beta,
                initial_state=recurrent_state,
                output_final_state=cache_params is not None,
                use_qk_l2norm_in_kernel=True,
            )

        if cache_params is not None:
            cache_params.recurrent_states[self.layer_idx] = last_recurrent_state

        core_attn_out = core_attn_out.reshape(-1, self.head_v_dim)
        z = z.reshape(-1, self.head_v_dim)
        core_attn_out = self.norm(core_attn_out, z)
        core_attn_out = core_attn_out.reshape(batch_size, seq_len, -1)

        return self.out_proj(core_attn_out)

    return patched_forward


def _apply_packing_patches(model_type: str, cls_prefix: str, forward_factory) -> None:
    module_name = f"transformers.models.{model_type}.modeling_{model_type}"

    try:
        module = importlib.import_module(module_name)
    except ImportError:
        LOG.warning(f"{model_type} not found in transformers, skipping packing patches")
        return

    _inject_fla_kernels(module)
    getattr(module, f"{cls_prefix}DecoderLayer").forward = _patched_decoder_forward
    gated_cls = getattr(module, f"{cls_prefix}GatedDeltaNet")
    gated_cls.forward = forward_factory(module.apply_mask_to_padding_states)

    LOG.info(
        f"Applied {cls_prefix} packing patch "
        f"(fla_causal_conv1d={'available' if fla_causal_conv1d else 'unavailable'})"
    )


def patch_qwen3_5_modeling_packing():
    _apply_packing_patches("qwen3_5", "Qwen3_5", _make_qwen3_5_gated_delta_forward)


def patch_qwen3_5_moe_modeling_packing():
    _apply_packing_patches(
        "qwen3_5_moe", "Qwen3_5Moe", _make_qwen3_5_gated_delta_forward
    )


def patch_qwen3_5_vlm_flash_attention():
    """
    Patch _is_packed_sequence to handle Qwen3.5's 3-D MRoPE position_ids.

    transformers passes position_ids as [axes, B, T] to decoder layers, but
    _is_packed_sequence only handles 2-D tensors and mis-classifies the 3-D
    shape as a packed-sequence indicator, causing CUDA errors in the varlen path.
    """
    try:
        import transformers.modeling_flash_attention_utils as fa_utils

        _original = fa_utils._is_packed_sequence

        def _patched(position_ids, batch_size):
            if position_ids is not None and position_ids.ndim != 2:
                return False
            return _original(position_ids, batch_size)

        fa_utils._is_packed_sequence = _patched
        LOG.info("Applied Qwen3.5 VLM flash-attention patch (3-D MRoPE position_ids)")
    except Exception as exc:  # pragma: no cover
        LOG.warning(f"Failed to apply Qwen3.5 VLM flash-attention patch: {exc}")


================================================
FILE: src/axolotl/monkeypatch/models/qwen3_next/__init__.py
================================================
"""Qwen3_Next model monkeypatches."""


================================================
FILE: src/axolotl/monkeypatch/models/qwen3_next/modeling.py
================================================
"""Monkeypatch for Qwen3_Next model to pass position_ids to linear attention."""

from typing import Optional, Tuple

import torch
import torch.nn.functional as F

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

try:
    from fla.modules.convolution import causal_conv1d as fla_causal_conv1d
except ImportError:
    fla_causal_conv1d = None


def get_cu_seqlens(position_ids):
    """
    Adapted from transformers.modeling_flash_attention_utils.prepare_fa_kwargs_from_position_ids.

    https://github.com/huggingface/transformers/blob/0f1b128d3359a26bd18be99c26d7f04fb3cba914/src/transformers/modeling_flash_attention_utils.py#L316
    """
    tensor_kwargs = {"dtype": torch.int32, "device": position_ids.device}

    position_ids = position_ids.view(-1)
    indices_q = (position_ids == 0).nonzero().view(-1)

    cu_seq_lens_q = torch.cat(
        (
            indices_q.to(**tensor_kwargs),
            torch.tensor(position_ids.size(), **tensor_kwargs),
        )
    )

    return cu_seq_lens_q


def patch_qwen3_next_decoder_layer():
    """Patch Qwen3NextDecoderLayer to pass position_ids to linear attention."""
    try:
        from transformers.models.qwen3_next.modeling_qwen3_next import (
            Qwen3NextDecoderLayer,
        )
    except ImportError:
        LOG.warning("Qwen3Next model not found, skipping patch")
        return

    # Store original forward method
    original_decoder_forward = Qwen3NextDecoderLayer.forward

    def patched_decoder_forward(
        self,
        hidden_states: torch.Tensor,
        position_embeddings: Tuple[torch.Tensor, torch.Tensor],
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Tuple[torch.Tensor]] = None,
        cache_position: Optional[torch.LongTensor] = None,
        **kwargs,
    ) -> torch.FloatTensor:
        residual = hidden_states

        hidden_states = self.input_layernorm(hidden_states)

        # Token Mixer
        if self.layer_type == "linear_attention":
            hidden_states = self.linear_attn(
                hidden_states=hidden_states,
                cache_params=past_key_values,
                cache_position=cache_position,
                attention_mask=attention_mask,
                position_ids=position_ids,
            )
        elif self.layer_type == "full_attention":
            # Self Attention
            hidden_states, _ = self.self_attn(
                hidden_states=hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_values=past_key_values,
                cache_position=cache_position,
                position_embeddings=position_embeddings,
                **kwargs,
            )

        hidden_states = residual + hidden_states

        # Fully Connected
        residual = hidden_states
        hidden_states = self.post_attention_layernorm(hidden_states)
        hidden_states = self.mlp(hidden_states)
        # For the MoE layers, we need to unpack
        if isinstance(hidden_states, Tuple):
            hidden_states, _ = hidden_states
        hidden_states = residual + hidden_states

        return hidden_states

    # Apply the patches
    Qwen3NextDecoderLayer.forward = patched_decoder_forward

    def unpatch():
        """Restore the original forward method"""
        Qwen3NextDecoderLayer.forward = original_decoder_forward

    return unpatch


def patch_qwen3_next_gateddelta_layer():
    """Patch Qwen3NextGatedDeltaNet to parse cu_seqlens and pass to chunk_gated_delta_rule"""
    try:
        from transformers.models.qwen3_next.modeling_qwen3_next import (
            Qwen3NextDynamicCache,
            Qwen3NextGatedDeltaNet,
            apply_mask_to_padding_states,
        )
    except ImportError:
        LOG.warning("Qwen3Next model not found, skipping patch")
        return

    # Store original forward method
    original_gated_delta_net_forward = Qwen3NextGatedDeltaNet.forward

    def patched_gated_delta_net_forward(
        self,
        hidden_states: torch.Tensor,
        cache_params: Optional[Qwen3NextDynamicCache] = None,
        cache_position: Optional[torch.LongTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
    ):
        hidden_states = apply_mask_to_padding_states(hidden_states, attention_mask)

        # Set up dimensions for reshapes later
        batch_size, seq_len, _ = hidden_states.shape

        use_precomputed_states = (
            cache_params is not None
            and cache_params.has_previous_state
            and seq_len == 1
            and cache_position is not None
        )

        # Compute cu_seqlens early for use by both causal_conv1d and chunk_gated_delta_rule
        cu_seqlens = None
        if not use_precomputed_states and position_ids is not None:
            cu_seqlens = get_cu_seqlens(position_ids=position_ids)

        # getting projected states from cache if it exists
        if cache_params is not None:
            conv_state = cache_params.conv_states[self.layer_idx]
            recurrent_state = cache_params.recurrent_states[self.layer_idx]

        projected_states_qkvz = self.in_proj_qkvz(hidden_states)
        projected_states_ba = self.in_proj_ba(hidden_states)
        query, key, value, z, b, a = self.fix_query_key_value_ordering(
            projected_states_qkvz, projected_states_ba
        )
        query, key, value = (
            x.reshape(x.shape[0], x.shape[1], -1) for x in (query, key, value)
        )

        mixed_qkv = torch.cat((query, key, value), dim=-1)  # [B, T, D]

        if use_precomputed_states:
            # Inference single-token path: causal_conv1d_update expects [B, D, T]
            mixed_qkv = mixed_qkv.transpose(1, 2)
            mixed_qkv = self.causal_conv1d_update(
                mixed_qkv,
                conv_state,
                self.conv1d.weight.squeeze(1),
                self.conv1d.bias,
                self.activation,
            )
            mixed_qkv = mixed_qkv.transpose(1, 2)
        else:
            if cache_params is not None:
                # Cache state expects [B, D, T] for the inference update path
                mixed_qkv_t = mixed_qkv.transpose(1, 2)
                conv_state = F.pad(
                    mixed_qkv_t,
                    (self.conv_kernel_size - mixed_qkv_t.shape[-1], 0),
                )
                cache_params.conv_states[self.layer_idx] = conv_state

            if fla_causal_conv1d is not None:
                # FLA Triton causal_conv1d: [B, T, D] in/out, with cu_seqlens support
                mixed_qkv, _ = fla_causal_conv1d(
                    x=mixed_qkv,
                    weight=self.conv1d.weight.squeeze(1),
                    bias=self.conv1d.bias,
                    activation=self.activation,
                    cu_seqlens=cu_seqlens,
                )
            else:
                # PyTorch fallback (no cu_seqlens support)
                if cu_seqlens is not None and cu_seqlens.shape[0] > batch_size + 1:
                    raise RuntimeError(
                        "Packed sequences require fla.modules.convolution.causal_conv1d "
                        "(cu_seqlens support). Install flash-linear-attention or disable packing."
                    )
                LOG.warning_once(
                    "FLA causal_conv1d not available. Falling back to PyTorch conv1d."
                )
                mixed_qkv = mixed_qkv.transpose(1, 2)
                mixed_qkv = F.silu(self.conv1d(mixed_qkv)[:, :, :seq_len])
                mixed_qkv = mixed_qkv.transpose(1, 2)

        # mixed_qkv is [B, T, D] in all paths
        query, key, value = torch.split(
            mixed_qkv,
            [
                self.key_dim,
                self.key_dim,
                self.value_dim,
            ],
            dim=-1,
        )
        query = query.reshape(query.shape[0], query.shape[1], -1, self.head_k_dim)
        key = key.reshape(key.shape[0], key.shape[1], -1, self.head_k_dim)
        value = value.reshape(value.shape[0], value.shape[1], -1, self.head_v_dim)

        beta = b.sigmoid()
        # If the model is loaded in fp16, without the .float() here, A might be -inf
        g = -self.A_log.float().exp() * F.softplus(a.float() + self.dt_bias)
        if self.num_v_heads // self.num_k_heads > 1:
            query = query.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)
            key = key.repeat_interleave(self.num_v_heads // self.num_k_heads, dim=2)

        if not use_precomputed_states:
            core_attn_out, last_recurrent_state = self.chunk_gated_delta_rule(
                query,
                key,
                value,
                g=g,
                beta=beta,
                initial_state=None,
                output_final_state=cache_params is not None,
                use_qk_l2norm_in_kernel=True,
                cu_seqlens=cu_seqlens,
            )

        else:
            core_attn_out, last_recurrent_state = self.recurrent_gated_delta_rule(
                query,
                key,
                value,
                g=g,
                beta=beta,
                initial_state=recurrent_state,
                output_final_state=cache_params is not None,
                use_qk_l2norm_in_kernel=True,
            )

        # Update cache
        if cache_params is not None:
            cache_params.recurrent_states[self.layer_idx] = last_recurrent_state

        z_shape_og = z.shape
        # reshape input data into 2D tensor
        core_attn_out = core_attn_out.reshape(-1, core_attn_out.shape[-1])
        z = z.reshape(-1, z.shape[-1])
        core_attn_out = self.norm(core_attn_out, z)
        core_attn_out = core_attn_out.reshape(z_shape_og)
        core_attn_out = core_attn_out.reshape(
            core_attn_out.shape[0], core_attn_out.shape[1], -1
        )

        output = self.out_proj(core_attn_out)
        return output

    # Apply the patches
    Qwen3NextGatedDeltaNet.forward = patched_gated_delta_net_forward

    def unpatch():
        """Restore the original forward method"""
        Qwen3NextGatedDeltaNet.forward = original_gated_delta_net_forward

    return unpatch


def patch_qwen3_next_imports():
    """Patch Qwen3Next imports to use try/except instead of is_flash_linear_attention_available."""
    try:
        import transformers.models.qwen3_next.modeling_qwen3_next as qwen3_modeling
    except ImportError:
        LOG.warning("Qwen3Next model not found, skipping import patch")
        return

    # Save original values for unpatch
    original_FusedRMSNormGated = getattr(qwen3_modeling, "FusedRMSNormGated", None)
    original_chunk_gated_delta_rule = getattr(
        qwen3_modeling, "chunk_gated_delta_rule", None
    )
    original_fused_recurrent_gated_delta_rule = getattr(
        qwen3_modeling, "fused_recurrent_gated_delta_rule", None
    )
    original_is_fast_path_available = getattr(
        qwen3_modeling, "is_fast_path_available", False
    )

    try:
        from fla.modules import FusedRMSNormGated
        from fla.ops.gated_delta_rule import (
            chunk_gated_delta_rule,
            fused_recurrent_gated_delta_rule,
        )

        qwen3_modeling.FusedRMSNormGated = FusedRMSNormGated
        qwen3_modeling.chunk_gated_delta_rule = chunk_gated_delta_rule
        qwen3_modeling.fused_recurrent_gated_delta_rule = (
            fused_recurrent_gated_delta_rule
        )

        # Force is_fast_path_available to be True
        # fla has triton kernels for causal_conv1d
        qwen3_modeling.is_fast_path_available = True
    except ImportError:
        qwen3_modeling.chunk_gated_delta_rule = None
        qwen3_modeling.fused_recurrent_gated_delta_rule = None
        qwen3_modeling.FusedRMSNormGated = None

    def unpatch():
        """Restore the original import values"""
        qwen3_modeling.FusedRMSNormGated = original_FusedRMSNormGated
        qwen3_modeling.chunk_gated_delta_rule = original_chunk_gated_delta_rule
        qwen3_modeling.fused_recurrent_gated_delta_rule = (
            original_fused_recurrent_gated_delta_rule
        )
        qwen3_modeling.is_fast_path_available = original_is_fast_path_available

    return unpatch


def patch_qwen3_next_modeling_packing():
    """Apply all Qwen3Next model patches."""
    patch_qwen3_next_imports()
    patch_qwen3_next_decoder_layer()
    patch_qwen3_next_gateddelta_layer()

    LOG.info("Applied Qwen3Next patch for packing")


================================================
FILE: src/axolotl/monkeypatch/models/voxtral/__init__.py
================================================


================================================
FILE: src/axolotl/monkeypatch/models/voxtral/modeling.py
================================================
"""Monkeypatch for voxtral to fix leaf node and dtype mismatch"""

from typing import Optional, Union

import torch
from transformers.cache_utils import Cache
from transformers.modeling_outputs import CausalLMOutputWithPast


def patch_voxtral_conditional_generation_forward():
    from transformers.models.voxtral.modeling_voxtral import (
        VoxtralForConditionalGeneration,
    )

    # Store the original forward method
    old_forward = VoxtralForConditionalGeneration.forward

    def _forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        input_features: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.Tensor] = None,
        position_ids: Optional[torch.LongTensor] = None,
        past_key_values: Optional[Cache] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        labels: Optional[torch.LongTensor] = None,
        use_cache: Optional[bool] = None,
        cache_position: Optional[torch.LongTensor] = None,
        logits_to_keep: Union[int, torch.Tensor] = 0,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        if inputs_embeds is None:
            inputs_embeds = self.get_input_embeddings()(input_ids)

        if input_features is not None:
            audio_embeds = self.get_audio_embeds(input_features)

            # Cast audio_embeds to match inputs_embeds dtype
            audio_embeds = audio_embeds.to(inputs_embeds.dtype)

            # replace text-audio token placeholders with audio embeddings
            audio_token_mask = input_ids == self.config.audio_token_id

            inputs_embeds = inputs_embeds.clone()
            inputs_embeds[audio_token_mask] = audio_embeds

        outputs = self.language_model(
            attention_mask=attention_mask,
            position_ids=position_ids,
            past_key_values=past_key_values,
            inputs_embeds=inputs_embeds,
            labels=labels,
            use_cache=use_cache,
            cache_position=cache_position,
            logits_to_keep=logits_to_keep,
            **kwargs,
        )
        return outputs

    # Apply the patch
    VoxtralForConditionalGeneration.forward = _forward

    def unpatch():
        """Restore the original forward method"""
        VoxtralForConditionalGeneration.forward = old_forward

    return unpatch


================================================
FILE: src/axolotl/monkeypatch/moe_quant.py
================================================
"""Loading-time quantization for MoE expert weights stored as 3D nn.Parameter tensors."""

import bitsandbytes as bnb
import torch
import torch.nn.utils.parametrize as P

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

_moe_load_state = {
    "count": 0,
    "mode": "4bit",
    "quant_type": "nf4",
    "compress_statistics": True,
    "patched": False,
    # Module path → param names in definition order, captured before quantization.
    # Without this, alphabetical loading order would mismatch merge order.
    "expert_param_order": {},
}


class Bnb8bitParametrization(torch.nn.Module):
    """Dequantizes int8 row-wise quantized data on access."""

    def __init__(self, row_stats: torch.Tensor):
        super().__init__()
        self.register_buffer("row_stats", row_stats)

    @torch.no_grad()
    def forward(self, quantized_param: torch.Tensor) -> torch.Tensor:
        """Flatten 3D+ to 2D for BnB's dequant, then reshape back."""
        orig_shape = quantized_param.shape
        if quantized_param.ndim > 2:
            quantized_param = quantized_param.reshape(-1, orig_shape[-1])
        result = bnb.functional.int8_vectorwise_dequant(quantized_param, self.row_stats)
        return result.reshape(orig_shape)


def _enable_parametrization_cache(module, inputs):
    P._cache_enabled += 1


def _disable_parametrization_cache(module, inputs, output):
    P._cache_enabled -= 1
    if not P._cache_enabled:
        P._cache = {}


def replace_parameter_8bit(module, param_name):
    """Replace a module parameter with an 8-bit quantized version using parametrization."""
    original_param = getattr(module, param_name)
    int8_data, row_stats, _ = bnb.functional.int8_vectorwise_quant(
        original_param.data.to(torch.float16)
    )

    setattr(module, param_name, torch.nn.Parameter(int8_data, requires_grad=False))
    del original_param

    P.register_parametrization(
        module, param_name, Bnb8bitParametrization(row_stats), unsafe=True
    )

    # Cache dequantized values during forward to avoid redundant dequantization.
    if not getattr(module, "_axolotl_8bit_hooks_registered", False):
        module.register_forward_pre_hook(_enable_parametrization_cache)
        module.register_forward_hook(_disable_parametrization_cache)
        module._axolotl_8bit_hooks_registered = True


def patch_moe_quantization_on_load(cfg):
    """Patch transformers' weight loading to quantize MoE expert params on-the-fly."""
    mode = "8bit" if getattr(cfg, "load_in_8bit", False) else "4bit"
    _moe_load_state["mode"] = mode
    _moe_load_state["count"] = 0
    _moe_load_state["expert_param_order"] = {}

    if _moe_load_state["patched"]:
        LOG.debug("MoE loading-time quantization patch already active")
        return

    import transformers.core_model_loading
    import transformers.modeling_utils

    if mode == "4bit":
        from bitsandbytes.nn.parametrize import replace_parameter_4bit

        quant_type = getattr(cfg, "bnb_4bit_quant_type", None) or "nf4"
        compress_statistics = getattr(cfg, "bnb_4bit_use_double_quant", None)
        if compress_statistics is None:
            compress_statistics = True

        _moe_load_state["quant_type"] = quant_type
        _moe_load_state["compress_statistics"] = compress_statistics

    # Disable caching_allocator_warmup — it pre-allocates a huge tensor at bf16
    # size for all params, defeating our on-load quantization VRAM savings.
    def _noop_warmup(*args, **kwargs):
        pass

    transformers.modeling_utils.caching_allocator_warmup = _noop_warmup

    original_set_param = transformers.core_model_loading.set_param_for_module

    def _patched_set_param_for_module(model, target_name, param_value, *args, **kwargs):
        original_set_param(model, target_name, param_value, *args, **kwargs)

        if param_value.ndim >= 3 and param_value.is_cuda:
            mod_path, _, pname = target_name.rpartition(".")
            mod = model.get_submodule(mod_path) if mod_path else model
            if not isinstance(mod, (bnb.nn.Linear4bit, bnb.nn.Linear8bitLt)):
                if "expert" not in target_name.lower():
                    LOG.debug(
                        "Skipping non-expert 3D param: %s (shape=%s)",
                        target_name,
                        list(param_value.shape),
                    )
                    return

                # Record definition order before parametrizations override it
                # with alphabetical order.
                if mod_path not in _moe_load_state["expert_param_order"]:
                    _moe_load_state["expert_param_order"][mod_path] = list(
                        mod._parameters.keys()
                    )

                if _moe_load_state["mode"] == "4bit":
                    replace_parameter_4bit(
                        mod,
                        pname,
                        compress_statistics=_moe_load_state["compress_statistics"],
                        quant_type=_moe_load_state["quant_type"],
                    )
                else:
                    replace_parameter_8bit(mod, pname)
                _moe_load_state["count"] += 1

                # Release the bf16 tensor so CUDA memory is freed immediately.
                param_value.data = torch.empty(0, device="cpu")
                torch.cuda.empty_cache()

    transformers.core_model_loading.set_param_for_module = _patched_set_param_for_module
    _moe_load_state["patched"] = True


def get_moe_quantized_count():
    """Return the number of expert parameters quantized during loading."""
    return _moe_load_state["count"]


def patch_peft_target_parameters_matching():
    """Fix PEFT's _inject_parameters for target_parameters on quantized MoE experts.

    1. Expands short suffixes to full module paths for parametrized modules.
    2. Iterates params in definition order (not alphabetical order) so saved
       adapters are compatible with standard PEFT, vLLM, etc.
    """
    if getattr(patch_peft_target_parameters_matching, "_axolotl_patched", False):
        return

    from contextlib import nullcontext

    from peft.tuners.tuners_utils import BaseTuner, BaseTunerLayer
    from peft.utils.integrations import init_empty_weights
    from peft.utils.other import _get_submodules

    def _patched_inject_parameters(
        self, peft_config, model, adapter_name, low_cpu_mem_usage
    ):
        original_targets = list(peft_config.target_parameters)
        expanded = set(original_targets)

        # Expand short suffixes to full paths for parametrized modules.
        for module_name, module in model.named_modules():
            if not hasattr(module, "parametrizations"):
                continue
            for target in original_targets:
                mod_path, _, param_name = target.rpartition(".")
                if (
                    module_name == mod_path or module_name.endswith("." + mod_path)
                ) and hasattr(module, param_name):
                    expanded.add(f"{module_name}.{param_name}")

        target_names_set = expanded

        def strip_base_layer_from_name(module_name):
            name = ".base_layer"
            while name in module_name:
                prefix, _, suffix = module_name.rpartition(name)
                module_name = prefix + suffix
            return module_name

        def create_and_replace_param(module_name, key, param_name):
            parent, target, target_name = _get_submodules(model, module_name)
            unwrapped_module_name = strip_base_layer_from_name(module_name)
            unwrapped_module = model.get_submodule(unwrapped_module_name)
            if (
                isinstance(unwrapped_module, BaseTunerLayer)
                and unwrapped_module.__class__.__name__ != "ParamWrapper"
            ):
                raise ValueError(
                    f"Trying to wrap an `nn.Parameter` of layer "
                    f"'{unwrapped_module_name}' of type "
                    f"{type(target).__name__}, which is not a valid target. "
                    f"Make sure that this layer is not also targeted with "
                    f"`target_modules`."
                )
            self._check_target_module_compatiblity(peft_config, model, target_name)
            ctx = init_empty_weights if low_cpu_mem_usage else nullcontext
            with ctx():
                self._create_and_replace(
                    peft_config,
                    adapter_name,
                    target,
                    target_name,
                    parent,
                    current_key=key,
                    parameter_name=param_name.rpartition(".")[-1],
                )

        # Use definition order (not alphabetical order) for parametrized modules
        # so ParamWrapper nesting matches vanilla PEFT on a plain model.
        expert_param_order = _moe_load_state.get("expert_param_order", {})

        for module_name, module in model.named_modules():
            if hasattr(module, "parametrizations"):
                stored_order = expert_param_order.get(module_name)
                if stored_order is not None:
                    params_iter = [
                        p for p in stored_order if p in module.parametrizations
                    ]
                else:
                    # Fallback for paths that bypass model loading (e.g. unit tests).
                    params_iter = list(module.parametrizations.keys())
                for param_name in params_iter:
                    key = f"{module_name}.{param_name}"
                    if (key in target_names_set) or any(
                        key.endswith(f".{t}") for t in target_names_set
                    ):
                        create_and_replace_param(module_name, key, param_name)
                        self.targeted_parameter_names.append(key)
            else:
                unwrapped_module_name = strip_base_layer_from_name(module_name)
                for param_name, _ in module.named_parameters(recurse=False):
                    key = f"{unwrapped_module_name}.{param_name}"
                    if (key in target_names_set) or any(
                        key.endswith(f".{t}") for t in target_names_set
                    ):
                        create_and_replace_param(module_name, key, param_name)
                        self.targeted_parameter_names.append(key)

    BaseTuner._inject_parameters = _patched_inject_parameters
    patch_peft_target_parameters_matching._axolotl_patched = True
    LOG.info("Patched PEFT _inject_parameters for consistent ParamWrapper ordering")


================================================
FILE: src/axolotl/monkeypatch/multipack.py
================================================
"""multipack patching for v2 of sample packing"""

import importlib

import transformers
from accelerate import init_empty_weights
from transformers import AutoConfig, AutoModelForCausalLM
from transformers.integrations import is_deepspeed_zero3_enabled

from axolotl.monkeypatch.mixtral import patch_mixtral_moe_forward_zero3
from axolotl.monkeypatch.utils import get_unpad_data

SUPPORTED_MULTIPACK_MODEL_TYPES = [
    "apertus",
    "mllama_text_model",
    "llama",
    "llama4",
    "mistral",
    "mixtral",
    "qwen2",
    "qwen2_moe",
    "qwen3",
    "qwen3_moe",
    "qwen3_next",
    "qwen3_5",
    "qwen3_5_moe",
    "falcon",
    "phi",
    "phi3",
    "gemma",
    "gemma2",
    "gemma3",
    "gemma3_text",
    "cohere",
    "cohere2",
    "gemmoe",
    "starcoder2",
    "deepseek_v2",
    "deepseek_v3",
    "glm",
    "glm4",
    "glm4_moe",
    "smollm3",
    "granite",
    "granitemoe",
    "granitemoeshared",
    "granitemoehybrid",
    "hunyuan_v1_dense",
    "hunyuan_v1_moe",
    "gpt_oss",
    "arcee",
    "seed_oss",
    "lfm2",
    "lfm2_moe",
    "olmo",
    "olmo2",
    "olmo3",
    "ministral",
    "ministral3",
    "mistral4",
    "afmoe",
    "nemotron",
]


def patch_for_multipack(model_type, model_name=None, has_remote_code=False):
    if has_remote_code:
        patch_remote(model_name)
    elif hasattr(transformers, "modeling_flash_attention_utils"):
        # sanity check in case upstream api changes on this
        assert hasattr(
            transformers.modeling_flash_attention_utils, "_get_unpad_data"
        ), "transformers api changed for _get_unpad_data for flash attention"
        transformers.modeling_flash_attention_utils._get_unpad_data = get_unpad_data

    if model_type == "mixtral" and is_deepspeed_zero3_enabled():
        patch_mixtral_moe_forward_zero3()


def patch_remote(model_name):
    model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    # we need to load the model here in order for modeling_* to be available
    with init_empty_weights():
        AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
    parts = model_config.__class__.__module__.split(".")
    parts[-1] = parts[-1].replace("configuration_", "modeling_", 1)
    module_name = ".".join(parts)
    modeling_arch = importlib.import_module(module_name)
    if hasattr(modeling_arch, "_get_unpad_data"):
        modeling_arch._get_unpad_data = get_unpad_data


================================================
FILE: src/axolotl/monkeypatch/peft/__init__.py
================================================


================================================
FILE: src/axolotl/monkeypatch/peft/utils.py
================================================
"""
Patch prepare_model_for_kbit_training to not upcast everything
"""

import inspect

import peft

import axolotl
from axolotl.monkeypatch.utils import detab_code
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

ORIGINAL_PREPARE_CODE = """
        for param in model.parameters():
            if (
                (param.dtype == torch.float16) or (param.dtype == torch.bfloat16)
            ) and param.__class__.__name__ != "Params4bit":
                param.data = param.data.to(torch.float32)
"""

PATCHED_PREPARE_CODE = """
        for name, param in model.named_parameters():
            if (
                (param.dtype == torch.float16) or (param.dtype == torch.bfloat16)
            ) and param.__class__.__name__ != "Params4bit" and all(embed_name not in name for embed_name in ["embed_tokens", "lm_head"]):
                param.data = param.data.to(torch.float32)
"""


def get_peft_prep_code() -> str:
    prepare = inspect.getsource(peft.utils.other.prepare_model_for_kbit_training)
    return prepare


def check_peft_prep_code_is_patchable() -> bool:
    prep_code = get_peft_prep_code()
    prep_code, _ = detab_code(prep_code)
    return ORIGINAL_PREPARE_CODE in prep_code


def patch_peft_prep_code():
    """
    monkeypatch create_accelerator_and_postprocess so it checks for additional kwargs
    """

    try:
        prep_code = get_peft_prep_code()
    except OSError:
        return
    peft.utils.other._original_create_accelerator_and_postprocess = prep_code
    prep_code, _ = detab_code(prep_code)
    if ORIGINAL_PREPARE_CODE not in prep_code:
        return

    prep_code = prep_code.replace(ORIGINAL_PREPARE_CODE, PATCHED_PREPARE_CODE)
    prep_code = prep_code.replace(
        "def prepare_model_for_kbit_training(",
        "def fixed_prepare_model_for_kbit_training(",
        1,
    )

    items_to_import = []
    for item in dir(peft.utils.other):
        if item in prep_code:
            items_to_import.append(item)

    exec(
        "from peft.utils.other import (" + ", ".join(x for x in items_to_import) + ")",
        globals(),
    )
    exec(prep_code, globals())
    LOG.info("patching prepare_model_for_kbit_training to allow for overrides")
    peft.utils.other.prepare_model_for_kbit_training = (
        fixed_prepare_model_for_kbit_training
    )
    axolotl.loaders.model.prepare_model_for_kbit_training = (
        fixed_prepare_model_for_kbit_training
    )


================================================
FILE: src/axolotl/monkeypatch/relora.py
================================================
"""Implements the ReLoRA training procedure from https://arxiv.org/abs/2307.05695, minus the initial full fine-tune."""

import glob
import json
import os.path
import shutil
from functools import partial
from pathlib import Path
from typing import Dict, List, Union

import bitsandbytes as bnb
import peft
import safetensors.torch as st
import torch
from huggingface_hub import snapshot_download
from torch.distributed.optim import ZeroRedundancyOptimizer
from transformers import (
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)
from transformers.trainer_utils import PREFIX_CHECKPOINT_DIR

from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import barrier, is_main_process
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


@torch.no_grad()
def magnitude_pruning_(tensor, prune_ratio):
    tensor_magnitude = torch.abs(tensor)
    threshold = torch.quantile(
        tensor_magnitude.flatten().to(dtype=torch.float32), prune_ratio
    ).to(dtype=tensor.dtype)

    mask = tensor_magnitude > threshold
    tensor.mul_(mask.to(dtype=tensor.dtype))


def reset_optimizer(
    optimizer: torch.optim.Optimizer,
    *,
    reset_params: List[str],  # where str is the key to a torch.nn.Parameter
    optimizer_state_keys: List[str],
    optimizer_magnitude_pruning: float = 0.9,
):
    # pylint:disable=unused-argument
    pruning_fn = partial(magnitude_pruning_, prune_ratio=optimizer_magnitude_pruning)
    n_zeros = 0
    n_total = 0

    optimizer_state = optimizer.state
    if isinstance(optimizer, ZeroRedundancyOptimizer):
        optimizer_state = optimizer.optim.state

    for group in optimizer.param_groups:
        for param in group["params"]:
            state = optimizer_state[param]
            for key, value in state.items():
                if key not in optimizer_state_keys:
                    continue
                if torch.is_tensor(value):
                    try:
                        pruning_fn(value)
                        n_total += value.numel()
                        n_zeros += torch.sum(value == 0).item()
                    except RuntimeError as exc:
                        if "quantile() input tensor is too large" in str(exc):
                            pass
                        else:
                            raise exc

    _zeroed = n_zeros / (1e-7 + n_total) * 100
    LOG.info(f"Percent of optimizer states zeroed: {_zeroed:.2f}")
    LOG.info(f"absolute n of optimizer states zeroed: {n_zeros}")


class ReLoRACallback(TrainerCallback):
    """Callback to merge LoRA weights into the base model and save full-weight checkpoints"""

    def __init__(self, cfg: DictDefault):
        self.relora_steps = cfg.jagged_restart_steps
        self.cpu_offload = cfg.relora_cpu_offload
        self.quantized = cfg.load_in_4bit or cfg.load_in_8bit
        self.last_full_model = cfg.base_model
        self.resume_from_checkpoint = cfg.resume_from_checkpoint

        if not os.path.exists(self.last_full_model):
            self.last_full_model = str(Path(snapshot_download(cfg.base_model)))

        assert os.path.exists(self.last_full_model), (
            "for ReLORA base_model must be a local path"
        )

        self.num_lora_restarts = 0
        self.need_full_save = False

    def on_train_begin(
        self,
        _args: TrainingArguments,
        _state: TrainerState,
        control: TrainerControl,
        model: peft.LoraModel,
        **_kwargs,
    ):
        if self.resume_from_checkpoint:
            weight_path = os.path.join(self.resume_from_checkpoint, "relora")
            if not os.path.exists(weight_path):
                LOG.warning(
                    "Resuming ReLoRA from checkpoint, but no full-weight save found"
                )
            else:
                LOG.info(f"Loading adjusted base weights from {weight_path}")
                load_weight_checkpoint(model, weight_path)
        return control

    def on_step_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: peft.LoraModel,
        optimizer: torch.optim.Optimizer,
        **_kwargs,
    ):
        if not optimizer:
            optimizer = state.optimizer
        if state.global_step > 0 and state.global_step % self.relora_steps == 0:
            checkpoint_folder = os.path.join(
                args.output_dir,
                f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}",
                "relora",
            )

            if "adam" in args.optim.lower():
                optimizer_state_keys = ["exp_avg", "exp_avg_sq"]
                if "8bit" in args.optim.lower():
                    optimizer_state_keys.append("state1")
                    optimizer_state_keys.append("state2")
            else:
                raise ValueError(f"Optimizer {args.optim} not supported with ReLoRA")

            lora_params = [
                n
                for n, p in model.named_parameters()
                if p.requires_grad and "lora_" in n
            ]

            model.save_pretrained(
                os.path.join(
                    args.output_dir,
                    f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}",
                    "adapter",
                ),
            )
            with torch.no_grad():
                merge_and_save(
                    model,
                    self.last_full_model,
                    checkpoint_folder,
                    reinit=True,
                    quantized=self.quantized,
                    actually_save=is_main_process(),
                    cpu_offload=self.cpu_offload,
                )
                reset_optimizer(
                    optimizer,
                    reset_params=lora_params,
                    optimizer_state_keys=optimizer_state_keys,
                    optimizer_magnitude_pruning=args.relora_prune_ratio,
                )

            if self.quantized:
                self.last_full_model = checkpoint_folder
            self.num_lora_restarts += 1

        return control

    def on_save(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        model: peft.LoraModel,
        **_kwargs,
    ):
        checkpoint_folder = os.path.join(
            args.output_dir, f"{PREFIX_CHECKPOINT_DIR}-{state.global_step}", "relora"
        )
        if (
            state.global_step >= self.relora_steps
            and state.global_step % self.relora_steps != 0
        ):
            if self.quantized:
                if is_main_process() and self.last_full_model != checkpoint_folder:
                    # ensure the latest full parameter save is in the latest checkpoint
                    # folder, so that automatic pruning of checkpoints does not remove it
                    LOG.info(f"moving last full parameter save to {checkpoint_folder}")
                    os.makedirs(checkpoint_folder, exist_ok=True)
                    chunks = glob.glob(
                        f"{self.last_full_model}/model*.safetensors"
                    ) + glob.glob(f"{self.last_full_model}/model*.index.json")
                    for path in chunks:
                        new_path = os.path.abspath(shutil.move(path, checkpoint_folder))
                        try:
                            os.symlink(new_path, path)
                        except OSError:
                            # probably on windows without permission to symlink
                            pass

                    self.last_full_model = checkpoint_folder
            else:
                model.model.save_pretrained(checkpoint_folder)

        return control

    def on_log(
        self,
        _args: TrainingArguments,
        _state: TrainerState,
        control: TrainerControl,
        logs: Dict[str, float],
        **_kwargs,
    ):
        logs["num_lora_restarts"] = self.num_lora_restarts
        return control

    def on_train_end(
        self,
        args: TrainingArguments,
        _state: TrainerState,
        control: TrainerControl,
        model: peft.LoraModel,
        **_kwargs,
    ):
        if self.quantized:
            # perform final merge and save
            with torch.no_grad():
                merge_and_save(
                    model,
                    self.last_full_model,
                    args.output_dir,
                    reinit=False,
                    quantized=self.quantized,
                    actually_save=is_main_process(),
                    cpu_offload=self.cpu_offload,
                )
        # no need to save if unquantized, as finetune.py will call merge_and_unload()
        return control


def sharded_paths(path: str, module_names: List[str]) -> Dict[str, str]:
    model_name = "model.safetensors"
    if not os.path.exists(str(Path(path) / model_name)) and not os.path.exists(
        str(Path(path) / f"{model_name}.index.json")
    ):
        model_name = "pytorch_model.bin"

    index_path = str(Path(path) / f"{model_name}.index.json")
    if os.path.exists(index_path):
        with open(index_path, "r", encoding="utf-8") as file:
            data = json.load(file)
        return data["weight_map"]
    return {(module_name + ".weight"): model_name for module_name in module_names}


def lora_delta_weight(layer: peft.tuners.lora.LoraLayer, device) -> torch.Tensor:
    if isinstance(layer, (peft.tuners.lora.Linear8bitLt, peft.tuners.lora.Linear4bit)):
        adapter: Union[List[str], str] = layer.active_adapter
        if isinstance(adapter, list):
            if len(adapter) > 1:
                raise ValueError("unhandled relora for multiple adapters")
            adapter = adapter[0]
        return (
            peft.utils.transpose(
                layer.lora_B[adapter].weight.detach().to(device)
                @ layer.lora_A[adapter].weight.detach().to(device),
                getattr(layer, "fan_in_fan_out", False),
            )
            * layer.scaling[adapter]
        )

    raise ValueError("unhandled lora layer type")


def find_lora_modules(model: peft.LoraModel) -> Dict[str, peft.tuners.lora.LoraLayer]:
    modules: Dict[str, peft.tuners.lora.LoraLayer] = {}

    key_list = [key for key, _ in model.model.named_modules() if "lora" not in key]
    for key in key_list:
        try:
            _parent, target, _target_name = peft.utils._get_submodules(model.model, key)
        except AttributeError:
            continue

        if isinstance(target, peft.tuners.lora.LoraLayer):
            modules[key] = target

    return modules


def update_weights(
    target: peft.tuners.lora.LoraLayer, new_weight: torch.Tensor, reinit: bool, device
):
    if reinit:
        for adapter_name in target.lora_A:
            target.reset_lora_parameters(adapter_name, True)
        for adapter_name in target.lora_embedding_A:
            target.reset_lora_parameters(adapter_name, True)

    if isinstance(target, peft.tuners.lora.Linear4bit):
        # This could be faster, but the quantization of Linear4bit weights occurs
        # when the module is moved from cpu to gpu. Without meddling *too* deeply in
        # PEFT's innards or maintaining a duplicate of that codepath, this is good
        # enough for now.
        target.weight.quant_state = None
        target.weight.data = new_weight.cpu()
        target.to(device)
    elif isinstance(target, peft.tuners.lora.Linear8bitLt):
        target.weight.data = (
            bnb.nn.Int8Params(new_weight, requires_grad=False).to(device).data
        )
    else:
        target.weight.data = new_weight.to(device)


def merge_and_save(
    model: peft.LoraModel,
    model_src: str,
    model_dst: str,
    reinit: bool = False,
    quantized: bool = False,
    cpu_offload: bool = False,
    actually_save: bool = True,
):
    modules = find_lora_modules(model)

    if not quantized:
        for _, target in modules.items():
            active_adapter = target.active_adapter
            if isinstance(active_adapter, list):
                active_adapter = active_adapter[0]
            update = target.get_delta_weight(active_adapter).detach()
            target.weight.data += update

            if reinit:
                for adapter_name in target.lora_A:
                    target.reset_lora_parameters(adapter_name, True)
                for adapter_name in target.lora_embedding_A:
                    target.reset_lora_parameters(adapter_name, True)
        return

    os.makedirs(model_dst, exist_ok=True)
    shard_paths = sharded_paths(model_src, modules.keys())
    out_shard_paths = {}

    unique_shards = list(set(shard_paths.values()))
    for shard_path in unique_shards:
        out_tensors = {}
        if shard_path.endswith(".safetensors"):
            in_tensors = st.load_file(str(Path(model_src) / shard_path))
        else:
            in_tensors = torch.load(
                Path(model_src) / shard_path,
                weights_only=True,  # to prevent arbitrary code execution
            )
            if "state_dict" in in_tensors:
                in_tensors = in_tensors["state_dict"]

        for module_name, target in modules.items():
            key = module_name + ".weight"
            if key not in shard_paths or shard_paths[key] != shard_path:
                continue

            orig_weight = in_tensors[key]
            old_dev = target.weight.device
            math_dev = "cpu" if cpu_offload else old_dev

            delta_weight = lora_delta_weight(target, math_dev)
            new_weight = orig_weight.to(math_dev) + delta_weight
            del delta_weight

            if actually_save:
                out_tensors[key] = new_weight.half().cpu()

            update_weights(target, new_weight, reinit=reinit, device=old_dev)

        if actually_save:
            out_shard_name = shard_path
            if out_shard_name.startswith("pytorch_model"):
                out_shard_name = (
                    out_shard_name.replace("pytorch_model", "model").rstrip(".bin")
                    + ".safetensors"
                )

            for module_name in in_tensors:
                if module_name not in out_tensors:
                    out_tensors[module_name] = in_tensors[module_name].half()
                out_shard_paths[module_name] = out_shard_name

            shard_fn = str(Path(model_dst) / out_shard_name)
            LOG.info(f"saving tensors to {shard_fn}")
            st.save_file(out_tensors, shard_fn, metadata={"format": "pt"})

        barrier()
        del in_tensors
        del out_tensors
        torch.cuda.empty_cache()

    if actually_save and len(unique_shards) > 1:
        with open(
            str(Path(model_dst, "model.safetensors.index.json")), "w", encoding="utf-8"
        ) as file:
            json.dump({"metadata": {}, "weight_map": out_shard_paths}, file)


def load_weight_checkpoint(model: peft.LoraModel, checkpoint_path: str):
    modules = find_lora_modules(model)
    shard_paths = sharded_paths(checkpoint_path, modules.keys())
    unique_shards = list(set(shard_paths.values()))

    for shard_path in unique_shards:
        tensors = st.load_file(os.path.join(checkpoint_path, shard_path))

        for module_name, target in modules.items():
            key = module_name + ".weight"
            if key not in shard_paths or shard_paths[key] != shard_path:
                continue

            new_weight = tensors[key]
            update_weights(
                target, new_weight, reinit=False, device=target.weight.device
            )


================================================
FILE: src/axolotl/monkeypatch/ring_attn/__init__.py
================================================
"""Init for ring attention monkeypatch module"""

# flake8: noqa

from .patch import (
    get_ring_attn_group,
    register_ring_attn_from_device_mesh,
    set_ring_attn_group,
    update_ring_attn_params,
)

__all__ = (
    "get_ring_attn_group",
    "register_ring_attn_from_device_mesh",
    "set_ring_attn_group",
    "update_ring_attn_params",
)


================================================
FILE: src/axolotl/monkeypatch/ring_attn/adapters/__init__.py
================================================


================================================
FILE: src/axolotl/monkeypatch/ring_attn/adapters/batch.py
================================================
"""
HuggingFace flash attention adapter for basic ring attention (batch API).

Inspired by
https://github.com/zhuzilin/ring-flash-attention/blob/ce9fd3935ca0e5f0592bb0826cbed18ec69da729/ring_flash_attn/adapters/hf_adapter.py.
Our implementation closely follows the structure of that module, but we've minified it
somewhat to support only the latest versions of transformers.
"""

import os
from typing import Callable

import torch
import torch.distributed as dist
import transformers
import transformers.modeling_flash_attention_utils
from ring_flash_attn import ring_flash_attn_func
from ring_flash_attn.adapters.hf_adapter import check_params
from transformers.modeling_flash_attention_utils import is_flash_attn_greater_or_equal

try:
    from transformers.modeling_flash_attention_utils import _flash_supports_window
except ImportError:
    try:
        from transformers.modeling_flash_attention_utils import (
            _flash_supports_window_size as _flash_supports_window,
        )
    except ImportError:
        _flash_supports_window = True

from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

from axolotl.utils.schemas.enums import RingAttnFunc

RING_ATTN_FUNC_MAPPING = {
    RingAttnFunc.BATCH_RING: torch.compile(ring_flash_attn_func),
    # RingAttnFunc.BATCH_ZIGZAG: torch.compile(zigzag_ring_flash_attn_func),
    # RingAttnFunc.BATCH_STRIPE: torch.compile(stripe_flash_attn_func),
}


def create_flash_attn_forward_varlen_llama3(
    process_group: dist.ProcessGroup, ring_attn_func: RingAttnFunc
) -> Callable:
    """
    Create a ring flash attention forward function compatible with HuggingFace's
    interface.

    Args:
        process_group: A PyTorch distributed process group.
        ring_attn_func: Function from `ring_flash_attention` to replace HF flash
            attention with.

    Returns:
        A function that implements the ring flash attention forward pass with the
            signature expected by HuggingFace Transformers.
    """

    # transformers 4.48+

    def _flash_attention_forward(
        query_states: torch.Tensor,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        attention_mask: torch.Tensor,
        query_length: int,
        is_causal: bool,
        dropout: float = 0.0,
        position_ids: torch.Tensor | None = None,
        softmax_scale: float | None = None,
        sliding_window: int | None = None,
        use_top_left_mask: bool = False,
        softcap: float | None = None,
        deterministic: bool = None,
        cu_seq_lens_q: torch.LongTensor | None = None,
        cu_seq_lens_k: torch.LongTensor | None = None,
        max_length_q: int | None = None,
        max_length_k: int | None = None,
        target_dtype: torch.dtype | None = None,
        attn_implementation: str | None = None,
        **kwargs,
    ):
        """
        Calls the forward method of Ring Flash Attention.

        Args:
            query_states: Tensor containing the query vectors.
            key_states: Tensor containing the key vectors.
            value_states: Tensor containing the value vectors.
            attention_mask: Not used in this implementation.
            query_length: Integer representing the length of the query sequence.
            is_causal: Boolean indicating whether to apply a causal mask to the attention.
            dropout: Float representing the dropout probability. Default is 0.0.
            position_ids: Not used in this implementation.
            softmax_scale: Optional float value for the softmax scaling factor. Default is None.
            sliding_window: Optional integer defining the size of the sliding attention window.
                Default is None.
            use_top_left_mask: Boolean indicating whether to use a top-left mask for the attention.
                Default is False.
            softcap: Not used in this implementation.
            deterministic: Optional boolean to enforce deterministic computation. Default is None.
            cu_seq_lens_q: Not used in this implementation.
            cu_seq_lens_k: Not used in this implementation.
            max_length_q: Not used in this implementation.
            max_length_k: Not used in this implementation.
            target_dtype: Not used in this implementation.
            attn_implementation: Not used in this implementation.
            **kwargs: Additional keyword arguments. Not used in this implementation.

        Returns:
            torch.Tensor: The output of the attention mechanism, with shape
                `[batch_size, query_length, num_heads, head_dim]`.
        """
        if not use_top_left_mask:
            causal = is_causal
        else:
            causal = is_causal and query_length != 1

        # Handle sliding window
        use_sliding_windows = (
            _flash_supports_window
            and sliding_window is not None
            and key_states.shape[1] > sliding_window
        )
        window_size = (
            (sliding_window, sliding_window) if use_sliding_windows else (-1, -1)
        )

        # Handle deterministic mode
        if is_flash_attn_greater_or_equal("2.4.1"):
            if deterministic is None:
                deterministic = (
                    os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
                )

        # Call ring flash attention function
        attn_output = RING_ATTN_FUNC_MAPPING[ring_attn_func](
            query_states,
            key_states,
            value_states,
            dropout_p=dropout,
            softmax_scale=softmax_scale,
            causal=causal,
            window_size=window_size,
            alibi_slopes=None,
            deterministic=deterministic,
            return_attn_probs=False,
            group=process_group,
        )

        return attn_output

    return _flash_attention_forward


def substitute_hf_flash_attn(
    process_group: dist.ProcessGroup, ring_attn_func: RingAttnFunc
):
    """
    Substitute HuggingFace's flash attention implementation with ring-based implementation.

    Args:
        process_group: PyTorch distributed process group for communication.
        ring_attn_func: Function from `ring_flash_attention` to replace HF flash
            attention with.
    """
    try:
        # Substitute flash attention
        old_flash_attention_forward = (
            transformers.modeling_flash_attention_utils._flash_attention_forward
        )
        new_flash_attention_forward = create_flash_attn_forward_varlen_llama3(
            process_group=process_group, ring_attn_func=ring_attn_func
        )

        if check_params(old_flash_attention_forward, new_flash_attention_forward):
            transformers.modeling_flash_attention_utils._flash_attention_forward = (
                new_flash_attention_forward
            )
        else:
            raise ValueError(
                "The signature of the new flash attention forward function does not match the old one."
            )
    except Exception as exception:
        raise ValueError(
            f"The current transformer version {transformers.__version__} is not supported. "
            "Please use pip install -U transformers to upgrade to the latest version. "
            "If the code failed with the latest version, "
            f"please file an issue."
        ) from exception

    # Register with ALL_ATTENTION_FUNCTIONS if available
    if ALL_ATTENTION_FUNCTIONS is not None:
        from ring_flash_attn.adapters.hf_adapter import flash_attention_forward

        ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = flash_attention_forward


================================================
FILE: src/axolotl/monkeypatch/ring_attn/patch.py
================================================
"""Ring attention group registration and flash attention patching.

Make use of the `ring-flash-attn` (https://github.com/zhuzilin/ring-flash-attention)
package, specifically the `hf_adapter.substitute_hf_flash_attn` function to patch in
their sequence parallel version of Flash Attention 2.

We also provide some patches for accelerate functions to prepare the dataloader for
sequence parallelism training.
"""

import os
from typing import Optional

import torch
import torch.distributed as dist
from torch.distributed import DeviceMesh

try:
    from transformers.modeling_flash_attention_utils import _flash_supports_window
except ImportError:
    try:
        from transformers.modeling_flash_attention_utils import (
            _flash_supports_window_size as _flash_supports_window,
        )
    except ImportError:
        _flash_supports_window = True

from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.enums import RingAttnFunc

LOG = get_logger(__name__)

RING_ATTN_GROUP = None


def get_ring_attn_group() -> dist.ProcessGroup:
    """Getter for ring attention group on this rank."""
    if RING_ATTN_GROUP is None:
        raise RuntimeError("register_ring_attn_from_device_mesh() not yet called")
    return RING_ATTN_GROUP


def set_ring_attn_group(ring_attn_group: dist.ProcessGroup | None):
    """Setter for ring attention group on this rank."""
    global RING_ATTN_GROUP
    RING_ATTN_GROUP = ring_attn_group


def create_ring_flash_attention_forward(
    process_group: dist.ProcessGroup, heads_k_stride: int
):
    from ring_flash_attn import llama3_flash_attn_varlen_func
    from ring_flash_attn.adapters.hf_adapter import DATA_PARAMS

    def _flash_attention_forward_v3(
        query_states: torch.Tensor,
        key_states: torch.Tensor,
        value_states: torch.Tensor,
        attention_mask: torch.Tensor,
        query_length: int,
        is_causal: bool,
        dropout: float = 0.0,
        position_ids: Optional[torch.Tensor] = None,
        softmax_scale: Optional[float] = None,
        sliding_window: Optional[int] = None,
        use_top_left_mask: bool = False,
        softcap: Optional[float] = None,
        deterministic: bool = None,
        cu_seq_lens_q: Optional[torch.LongTensor] = None,
        cu_seq_lens_k: Optional[torch.LongTensor] = None,
        max_length_q: Optional[int] = None,
        max_length_k: Optional[int] = None,
        target_dtype: Optional[torch.dtype] = None,
        attn_implementation: Optional[str] = None,
        **kwargs,
    ):
        if not use_top_left_mask:
            causal = is_causal
        else:
            # TODO: Remove the `query_length != 1` check once Flash Attention for RoCm is bumped to 2.1. For details, please see the comment in transformers.models.llama.modeling_llama.LlamaFlashAttention2.__init__.
            causal = is_causal and query_length != 1

        # Assuming 4D tensors, key_states.shape[1] is the key/value sequence length (source length).
        use_sliding_windows = (
            _flash_supports_window
            and sliding_window is not None
            and key_states.shape[1] > sliding_window
        )
        flash_kwargs = (
            {"window_size": (sliding_window, sliding_window)}
            if use_sliding_windows
            else {}
        )

        if deterministic is None:
            deterministic = os.environ.get("FLASH_ATTENTION_DETERMINISTIC", "0") == "1"
        flash_kwargs["deterministic"] = deterministic
        assert softcap is None, (
            "llama3_flash_attn_varlen_func does not support softcap yet."
        )
        # flash_kwargs["softcap"] = softcap
        flash_kwargs["group"] = process_group

        # not sure why attention_mask can be not None...
        assert causal, "only causal attention is supported yet."
        batch_size = query_states.size(0)
        assert batch_size == 1, "varlen data should be processed in advance."

        attn_output = llama3_flash_attn_varlen_func(
            query_states.squeeze(dim=0),
            key_states.squeeze(dim=0),
            value_states.squeeze(dim=0),
            cu_seqlens_q=DATA_PARAMS["cu_seqlens_q"],
            cu_seqlens_k=DATA_PARAMS["cu_seqlens_k"],
            max_seqlen_q=DATA_PARAMS["max_seqlen_q"],
            max_seqlen_k=DATA_PARAMS["max_seqlen_k"],
            heads_k_stride=heads_k_stride,
            local_k_slice=DATA_PARAMS["local_k_slice"],
            dropout_p=dropout,
            softmax_scale=softmax_scale,
            causal=causal,
            **flash_kwargs,
        )

        attn_output = attn_output.unsqueeze(dim=0)

        return attn_output

    return [
        _flash_attention_forward_v3,
    ]


def register_ring_attn_from_device_mesh(
    device_mesh: "DeviceMesh",
    context_parallel_dim: tuple[str, ...],
    heads_k_stride: int | None,
    ring_attn_func: RingAttnFunc | None,
):
    """Create ring attention group using DeviceMesh and substitute flash attn with ring flash attn.

    Args:
        device_mesh: DeviceMesh object containing the parallelism topology.
        context_parallel_dim: Name of the sequence parallel dimension in the device mesh.
        heads_k_stride: Sequence parallelism K head stride size. Passed through to
            `varlen_llama3` `ring_flash_attn` implementation.
        ring_attn_func: `ring_flash_attn` ring attention implemention. If sample
            packing is enabled, it must be a `varlen` function; otherwise, it must be a
            `batch` function.
    """
    rank = dist.get_rank()

    LOG.info(
        f"Enabling ring attention sequence parallelism using DeviceMesh "
        f"dimension '{context_parallel_dim}'",
    )

    # Extract the sequence parallel submesh
    try:
        sequence_mesh = device_mesh[context_parallel_dim]
    except (KeyError, IndexError) as e:
        raise ValueError(
            f"Dimension '{context_parallel_dim}' not found in device_mesh. "
            f"Available dimensions: {device_mesh.mesh_dim_names}"
        ) from e

    # Get the process group for context parallelism
    sequence_pg = sequence_mesh.get_group()
    context_parallel_size = sequence_mesh.size()

    if rank == 0:
        LOG.info(
            f"Sequence parallel degree: {context_parallel_size}, "
            f"mesh shape: {sequence_mesh.mesh.shape}"
        )

    # Log which ranks are in the current process group
    if sequence_pg != dist.GroupMember.WORLD:
        ranks_in_group = dist.get_process_group_ranks(sequence_pg)
        LOG.info(f"Current sequence parallel group ranks: {ranks_in_group}")

    # Set the ring attention group
    set_ring_attn_group(sequence_pg)

    if ring_attn_func is RingAttnFunc.VARLEN_LLAMA3:
        # fmt: off
        import ring_flash_attn.adapters.hf_adapter

        from ring_flash_attn.adapters.hf_adapter import (  # isort: skip
            create_ring_flash_attention_forward as create_ring_flash_attention_forward_orig,
        )

        create_ring_flash_attention_forward_orig = (  # noqa: F811,F841
            create_ring_flash_attention_forward
        )
        ring_flash_attn.adapters.hf_adapter.create_ring_flash_attention_forward = create_ring_flash_attention_forward
        # fmt: on

        ring_flash_attn.adapters.hf_adapter.substitute_hf_flash_attn(
            process_group=get_ring_attn_group(), heads_k_stride=heads_k_stride or 1
        )
    elif ring_attn_func is RingAttnFunc.BATCH_RING:
        from axolotl.monkeypatch.ring_attn.adapters.batch import (
            substitute_hf_flash_attn,
        )

        substitute_hf_flash_attn(
            process_group=get_ring_attn_group(),
            ring_attn_func=ring_attn_func,
        )


def update_ring_attn_params(position_ids: torch.Tensor | None):
    """
    Calculate the cumulative sequence lengths for the current forward pass and pass the
    value to the substituted `ring_flash_attn`.

    Args:
        position_ids: Optional tensor of position IDs (for sample packed data).
    """
    from ring_flash_attn import update_ring_flash_attn_params

    cu_seqlens, _ = get_cu_seqlens_from_pos_ids(position_ids)
    cu_seqlens = cu_seqlens.squeeze().to(device=torch.cuda.current_device())
    update_ring_flash_attn_params(cu_seqlens, get_ring_attn_group())


================================================
FILE: src/axolotl/monkeypatch/scaled_softmax_attn.py
================================================
"""
Scaled Softmax (SSMax) attention patch using FlexAttention.
SSMax:  softmax(scores * s * log(n) + b) where n is the position index
Ref: https://arxiv.org/abs/2501.19399
"""

import torch
from transformers import PreTrainedModel

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

try:
    from torch.nn.attention.flex_attention import BlockMask
    from transformers.integrations.flex_attention import (
        compile_friendly_flex_attention,
        repeat_kv,
    )

    FLEX_ATTENTION_AVAILABLE = True
except ImportError:
    FLEX_ATTENTION_AVAILABLE = False
    BlockMask = None

_ssmax_config = {}


def patch_scaled_softmax_attention(
    scaling_factor_init: float = 0.43, bias: float = 0.0, model: PreTrainedModel = None
):
    """Patch attention to apply SSMax via FlexAttention score_mod."""
    global _ssmax_config

    if not FLEX_ATTENTION_AVAILABLE:
        raise RuntimeError("SSMax requires FlexAttention.")

    _ssmax_config["ssmax_s"] = scaling_factor_init
    _ssmax_config["ssmax_b"] = bias

    from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

    if "flex_attention" in ALL_ATTENTION_FUNCTIONS:
        _ssmax_config["original_flex_fn"] = ALL_ATTENTION_FUNCTIONS["flex_attention"]
        ALL_ATTENTION_FUNCTIONS["flex_attention"] = ssmax_flex_attention_forward
        LOG.info(
            f"Patched flex_attention with SSMax (s={scaling_factor_init}, b={bias})"
        )
    else:
        LOG.warning("flex_attention not found.  Ensure flex_attention:  true is set.")


def ssmax_flex_attention_forward(
    module: torch.nn.Module,
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    attention_mask,
    scaling: float | None = None,
    softcap: float | None = None,
    **kwargs,
) -> tuple[torch.Tensor, torch.Tensor | None]:
    """FlexAttention forward with SSMax:  score * (s * log(n) + b)."""

    if kwargs.get("dropout", 0.0) > 0:
        raise ValueError("flex_attention does not support dropout")

    ssmax_s = _ssmax_config.get("ssmax_s", 0.43)
    ssmax_b = _ssmax_config.get("ssmax_b", 0.0)

    position_ids = kwargs.get("position_ids", None)
    position_ids_flat = position_ids.view(-1) if position_ids is not None else None

    block_mask = attention_mask if isinstance(attention_mask, BlockMask) else None
    score_mask = None if block_mask else attention_mask

    if score_mask is not None:
        score_mask = score_mask[:, :, :, : key.shape[-2]]

    def score_mod(score, batch_idx, head_idx, q_idx, kv_idx):
        """
        Apply SSMax scaling:  score * (s * log(n) + b)
        where n is the relative position within each packed sequence.
        """
        if position_ids_flat is not None:
            relative_pos = position_ids_flat[q_idx]
            n = (relative_pos + 1).float()
        else:
            n = (q_idx + 1).float()

        n = torch.clamp(n, min=2.0)

        ssmax_scale = ssmax_s * torch.log(n) + ssmax_b
        score = score * ssmax_scale

        if softcap is not None:
            score = softcap * torch.tanh(score / softcap)

        if score_mask is not None:
            score = score + score_mask[batch_idx][0][q_idx][kv_idx]

        return score

    enable_gqa = True
    if (query.shape[1] & (query.shape[1] - 1)) != 0:
        key = repeat_kv(key, query.shape[1] // key.shape[1])
        value = repeat_kv(value, query.shape[1] // value.shape[1])
        enable_gqa = False

    return_lse = query.device.type != "cpu"
    flex_output = compile_friendly_flex_attention(
        query,
        key,
        value,
        score_mod=score_mod,
        block_mask=block_mask,
        enable_gqa=enable_gqa,
        scale=scaling,
        kernel_options=kwargs.get("kernel_options"),
        return_lse=return_lse,
        training=module.training,
    )

    if return_lse:
        attention_output, lse = flex_output
        lse = lse.to(value.dtype)
    else:
        attention_output, lse = flex_output, None

    return attention_output.transpose(1, 2).contiguous(), lse


def unpatch_scaled_softmax_attention():
    """Restore the original FlexAttention function."""
    global _ssmax_config
    from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS

    if "original_flex_fn" in _ssmax_config:
        ALL_ATTENTION_FUNCTIONS["flex_attention"] = _ssmax_config["original_flex_fn"]
        _ssmax_config.clear()
        LOG.info("Unpatched flex_attention, restored original")


================================================
FILE: src/axolotl/monkeypatch/stablelm_attn_hijack_flash.py
================================================
# coding=utf-8
# Copyright 2023 Stability AI, EleutherAI, and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This code is based off the following work:
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/modeling_llama.py
# https://github.com/huggingface/transformers/blob/main/src/transformers/models/gpt_neox/modeling_gpt_neox.py
"""PyTorch StableLM Epoch model."""

import importlib
import math
from typing import Optional, Tuple, Union

import torch
import torch.utils.checkpoint
from accelerate import init_empty_weights
from einops import rearrange
from flash_attn.flash_attn_interface import (
    flash_attn_varlen_qkvpacked_func,
)
from torch import nn
from transformers import AutoConfig, AutoModelForCausalLM
from transformers.modeling_outputs import BaseModelOutputWithPast

from axolotl.monkeypatch.utils import get_cu_seqlens_from_pos_ids
from axolotl.utils.logging import get_logger

logger = get_logger(__name__)


def replace_stablelm_attn_with_flash_attn(model_name="stabilityai/stablelm-3b-4e1t"):
    # this is a wonky hack to get the remotely loaded module
    model_config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
    # we need to load the model here in order for modeling_stablelm_epoch to be available
    with init_empty_weights():
        AutoModelForCausalLM.from_pretrained(model_name, trust_remote_code=True)
    module_name = model_config.__class__.__module__.replace(
        ".configuration_stablelm_epoch", ".modeling_stablelm_epoch"
    )
    modeling_stablelm = importlib.import_module(module_name)
    modeling_stablelm.Attention.forward = flashattn_attn
    modeling_stablelm.StableLMEpochModel.forward = stablelm_model_forward
    modeling_stablelm.DecoderLayer.forward = decoder_layer_forward


def rotate_half(x: torch.Tensor):
    """Rotates half the hidden dims of the input."""

    x1, x2 = torch.chunk(x, 2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)


def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    # The first two dimensions of cos and sin are always 1, so we can `squeeze` them.

    cos = cos.squeeze(1).squeeze(0)  # [seq_len, dim]
    sin = sin.squeeze(1).squeeze(0)  # [seq_len, dim]
    cos = cos[position_ids].unsqueeze(1)  # [batch_size, 1, seq_len, dim]
    sin = sin[position_ids].unsqueeze(1)  # [batch_size, 1, seq_len, dim]
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed


def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
    """
    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
    """
    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
    if n_rep == 1:
        return hidden_states
    hidden_states = hidden_states[:, :, None, :, :].expand(
        batch, num_key_value_heads, n_rep, slen, head_dim
    )
    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)


def flashattn_attn(
    self,
    hidden_states: torch.FloatTensor,
    attention_mask: torch.FloatTensor,
    position_ids: torch.LongTensor,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: Optional[bool] = False,
    use_cache: Optional[bool] = False,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[torch.Tensor] = None,
) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
    bsz, q_len, _ = hidden_states.size()

    query_states = self.q_proj(hidden_states)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)

    query_states = query_states.view(
        bsz, q_len, self.num_heads, self.head_dim
    ).transpose(1, 2)
    key_states = key_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)
    value_states = value_states.view(
        bsz, q_len, self.num_key_value_heads, self.head_dim
    ).transpose(1, 2)

    query_rot = query_states[..., : self.rotary_ndims]
    query_pass = query_states[..., self.rotary_ndims :]
    key_rot = key_states[..., : self.rotary_ndims]
    key_pass = key_states[..., self.rotary_ndims :]

    kv_seq_len = key_states.shape[-2]
    if past_key_value is not None:
        kv_seq_len += past_key_value[0].shape[-2]
    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(
        query_rot, key_rot, cos, sin, position_ids
    )

    # [batch_size, num_heads, seq_len, head_dim]
    query_states = torch.cat((query_states, query_pass), dim=-1)
    key_states = torch.cat((key_states, key_pass), dim=-1)

    if past_key_value is not None:
        # Reuse k, v, self_attention
        key_states = torch.cat((past_key_value[0], key_states), dim=2)
        value_states = torch.cat((past_key_value[1], value_states), dim=2)

    past_key_value = (key_states, value_states) if use_cache else None

    # Repeat k/v heads if n_kv_heads < n_heads
    key_states = repeat_kv(key_states, self.num_key_value_groups)
    value_states = repeat_kv(value_states, self.num_key_value_groups)

    if cu_seqlens is not None and max_seqlen is not None and cu_seqlens.dim() == 1:
        # special handling using sample packing
        qkv = torch.stack(
            [query_states, key_states, value_states], dim=2
        )  # [bsz, nh, 3, q_len, hd]
        qkv = qkv.transpose(1, 3)  # [bsz, q_len, 3, nh, hd]
        qkv = rearrange(qkv, "b s ... -> (b s) ...")
        softmax_scale = None

        output = flash_attn_varlen_qkvpacked_func(
            qkv, cu_seqlens, max_seqlen, 0.0, softmax_scale=softmax_scale, causal=True
        )

        attn_output = rearrange(output, "(b s) ... -> b s ...", b=bsz)
        attn_output = rearrange(attn_output, "b s h d -> b s (h d)")
    else:
        attn_weights = torch.matmul(
            query_states, key_states.transpose(2, 3)
        ) / math.sqrt(self.head_dim)

        if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
            raise ValueError(
                f"Attention weights should be of size {(bsz, self.num_heads, q_len, kv_seq_len)}, but is"
                f" {attn_weights.size()}"
            )

        if attention_mask is not None:
            if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
                raise ValueError(
                    f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
                )
            attn_weights = attn_weights + attention_mask

        # Upcast attention to fp32
        attn_weights = nn.functional.softmax(
            attn_weights, dim=-1, dtype=torch.float32
        ).to(query_states.dtype)
        attn_output = torch.matmul(attn_weights, value_states)

        if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
            raise ValueError(
                f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
                f" {attn_output.size()}"
            )

        # Merge heads
        attn_output = attn_output.transpose(1, 2).contiguous()
        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)

    # Final linear projection
    attn_output = self.o_proj(attn_output)

    return attn_output, None, past_key_value


def decoder_layer_forward(
    self,
    hidden_states: Optional[torch.FloatTensor],
    attention_mask: Optional[torch.FloatTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_value: Optional[Tuple[torch.Tensor]] = None,
    output_attentions: Optional[bool] = False,
    use_cache: Optional[bool] = False,
    cu_seqlens: Optional[torch.Tensor] = None,
    max_seqlen: Optional[torch.Tensor] = None,
) -> Union[
    Tuple[torch.Tensor], Optional[Tuple[torch.Tensor, Tuple[torch.FloatTensor, ...]]]
]:
    residual = hidden_states

    hidden_states = self.input_layernorm(hidden_states)

    # Self Attention
    hidden_states, self_attn_weights, present_key_value = self.self_attn(
        hidden_states=hidden_states,
        attention_mask=attention_mask,
        position_ids=position_ids,
        past_key_value=past_key_value,
        output_attentions=output_attentions,
        use_cache=use_cache,
        cu_seqlens=cu_seqlens,
        max_seqlen=max_seqlen,
    )
    hidden_states = residual + hidden_states

    # Fully Connected
    residual = hidden_states
    hidden_states = self.post_attention_layernorm(hidden_states)
    hidden_states = self.mlp(hidden_states)
    hidden_states = residual + hidden_states

    outputs = (hidden_states,)

    if output_attentions:
        outputs += (self_attn_weights,)

    if use_cache:
        outputs += (present_key_value,)

    return outputs


def stablelm_model_forward(
    self,
    input_ids: Optional[torch.LongTensor] = None,
    attention_mask: Optional[torch.FloatTensor] = None,
    position_ids: Optional[torch.LongTensor] = None,
    past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
    inputs_embeds: Optional[torch.FloatTensor] = None,
    use_cache: Optional[bool] = None,
    output_attentions: Optional[bool] = None,
    output_hidden_states: Optional[bool] = None,
    return_dict: Optional[bool] = None,
) -> Union[Tuple, BaseModelOutputWithPast]:
    output_attentions = (
        output_attentions
        if output_attentions is not None
        else self.config.output_attentions
    )
    output_hidden_states = (
        output_hidden_states
        if output_hidden_states is not None
        else self.config.output_hidden_states
    )
    use_cache = use_cache if use_cache is not None else self.config.use_cache

    return_dict = (
        return_dict if return_dict is not None else self.config.use_return_dict
    )

    # Retrieve input_ids and inputs_embeds
    if input_ids is not None and inputs_embeds is not None:
        raise ValueError(
            "You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time"
        )
    if input_ids is not None:
        batch_size, seq_length = input_ids.shape
    elif inputs_embeds is not None:
        batch_size, seq_length, _ = inputs_embeds.shape
    else:
        raise ValueError(
            "You have to specify either decoder_input_ids or decoder_inputs_embeds"
        )

    seq_length_with_past = seq_length
    past_key_values_length = 0

    if past_key_values is not None:
        past_key_values_length = past_key_values[0][0].shape[2]
        seq_length_with_past = seq_length_with_past + past_key_values_length

    cu_seqlens = None
    max_seqlen = None
    if position_ids is None:
        device = input_ids.device if input_ids is not None else inputs_embeds.device
        position_ids = torch.arange(
            past_key_values_length,
            seq_length + past_key_values_length,
            dtype=torch.long,
            device=device,
        )
        position_ids = position_ids.unsqueeze(0).view(-1, seq_length)
    else:
        position_ids = position_ids.view(-1, seq_length).long()
        cu_seqlens, max_seqlen = get_cu_seqlens_from_pos_ids(position_ids)
        cu_seqlens = cu_seqlens.squeeze()

    if inputs_embeds is None:
        inputs_embeds = self.embed_tokens(input_ids)
    # Embed positions
    if attention_mask is None:
        attention_mask = torch.ones(
            (batch_size, seq_length_with_past),
            dtype=torch.bool,
            device=inputs_embeds.device,
        )
    attention_mask = self._prepare_decoder_attention_mask(
        attention_mask,
        (batch_size, seq_length),
        inputs_embeds,
        past_key_values_length,
    )

    hidden_states = inputs_embeds

    if self.gradient_checkpointing and self.training:
        if use_cache:
            logger.warning(
                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
            )
            use_cache = False

    # Decoder layers
    all_hidden_states = () if output_hidden_states else None
    all_self_attns = () if output_attentions else None
    next_decoder_cache = () if use_cache else None

    for idx, decoder_layer in enumerate(self.layers):
        if output_hidden_states:
            all_hidden_states += (hidden_states,)

        past_key_value = past_key_values[idx] if past_key_values is not None else None

        if self.gradient_checkpointing and self.training:

            def create_custom_forward(module):
                def custom_forward(*inputs):
                    # None for past_key_value
                    return module(*inputs)

                return custom_forward

            layer_outputs = torch.utils.checkpoint.checkpoint(
                create_custom_forward(decoder_layer),
                hidden_states,
                attention_mask,
                position_ids,
                past_key_value,
                output_attentions,
                None,
                cu_seqlens,
                max_seqlen,
            )
        else:
            layer_outputs = decoder_layer(
                hidden_states,
                attention_mask=attention_mask,
                position_ids=position_ids,
                past_key_value=past_key_value,
                output_attentions=output_attentions,
                use_cache=use_cache,
                cu_seqlens=cu_seqlens,
                max_seqlen=max_seqlen,
            )

        hidden_states = layer_outputs[0]

        if use_cache:
            next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)

        if output_attentions:
            all_self_attns += (layer_outputs[1],)

    hidden_states = self.norm(hidden_states)

    # Add hidden states from the last decoder layer
    if output_hidden_states:
        all_hidden_states += (hidden_states,)

    next_cache = next_decoder_cache if use_cache else None
    if not return_dict:
        return tuple(
            v
            for v in [hidden_states, next_cache, all_hidden_states, all_self_attns]
            if v is not None
        )
    return BaseModelOutputWithPast(
        last_hidden_state=hidden_states,
        past_key_values=next_cache,
        hidden_states=all_hidden_states,
        attentions=all_self_attns,
    )


================================================
FILE: src/axolotl/monkeypatch/tiled_mlp/__init__.py
================================================
"""
TiledMLP monkey patches
"""

from .patch import (
    patch_tiled_mlp,
)

__all__ = [
    "patch_tiled_mlp",
]


================================================
FILE: src/axolotl/monkeypatch/tiled_mlp/base.py
================================================
"""
TiledMLP support for DDP, FSDP, and single GPU
"""

import threading
from typing import List

import torch


class DeepSpeedTiledMLPMoE(torch.autograd.Function):
    @staticmethod
    def forward(
        ctx,
        fn,
        self,
        x,
        shards,
        compute_params,
    ) -> torch.Tensor:
        ctx.fn = fn
        ctx.self = self
        ctx.shards = shards
        ctx.compute_params = [p for p in compute_params if p.requires_grad]
        ctx.save_for_backward(x)

        x_shards = list(torch.chunk(x, chunks=shards, dim=1))
        with torch.no_grad():
            output_shards = [fn(self, x_shard) for x_shard in x_shards]

        ctx.is_tuple_output = isinstance(output_shards[0], tuple)
        if isinstance(output_shards[0], tuple):
            tuple_dim_idx = [1, 0]
            output_unsharded = tuple(
                torch.cat(
                    [output_shard[i] for output_shard in output_shards],
                    dim=tuple_dim_idx[i],
                )
                for i in range(len(output_shards[0]))
            )
        else:
            output_unsharded = torch.cat(output_shards, dim=1)

        return output_unsharded

    @staticmethod
    def backward(ctx, *grads) -> torch.Tensor:
        fn = ctx.fn
        (x,) = ctx.saved_tensors
        self = ctx.self
        shards = ctx.shards
        compute_params = ctx.compute_params
        is_tuple_output = ctx.is_tuple_output

        x_requires_grad = x.requires_grad
        x = x.detach()
        # detach() unsets `x.requires_grad`, so restore it
        x.requires_grad_(x_requires_grad)

        incoming_grad = grads[0]
        x_grad = torch.zeros_like(x)
        x_shards = list(torch.chunk(x, chunks=shards, dim=1))

        shard_step = x_shards[0].numel()
        for i, x_shard in enumerate(x_shards):
            # Tell deepspeed not to add a new grad to its ipg bucket until the last shard is run
            if compute_params is not None:
                if i + 1 < shards:
                    for param in compute_params:
                        param.ds_grad_is_ready = False
                else:
                    # last shard, can add the grad
                    for param in compute_params:
                        param.ds_grad_is_ready = True

            x_shard.requires_grad_(x_requires_grad)

            shard_offset = i * shard_step
            x_shard.grad = (
                x_grad.view(-1)
                .narrow(0, shard_offset, x_shard.numel())
                .view_as(x_shard)
            )
            incoming_grad_shard = (
                incoming_grad.view(-1)
                .narrow(0, shard_offset, x_shard.numel())
                .view_as(x_shard)
            )
            with torch.enable_grad():
                output = fn(self, x_shard)
            if is_tuple_output:
                torch.autograd.backward(output[0], incoming_grad_shard)
            else:
                torch.autograd.backward(output, incoming_grad_shard)

        return (None, None, x_grad, None, None)


class TiledMLP(torch.autograd.Function):
    """
    TiledMLP implementation using gradient hooks
    """

    @staticmethod
    def forward(
        ctx,
        fn,
        self,
        x,
        shards,
        compute_params,
    ) -> torch.Tensor:
        ctx.fn = fn
        ctx.self = self
        ctx.shards = shards
        ctx.compute_params = [p for p in compute_params if p.requires_grad]
        ctx.save_for_backward(x)

        x_shards = list(torch.chunk(x, chunks=shards, dim=1))
        with torch.no_grad():
            output_shards = [fn(self, x_shard) for x_shard in x_shards]
        ctx.is_tuple_output = isinstance(output_shards[0], tuple)
        if isinstance(output_shards[0], tuple):
            tuple_dim_idx = [1, 0]
            output_unsharded = tuple(
                torch.cat(
                    [output_shard[i] for output_shard in output_shards],
                    dim=tuple_dim_idx[i],
                )
                for i in range(len(output_shards[0]))
            )
        else:
            output_unsharded = torch.cat(output_shards, dim=1)

        return output_unsharded

    @staticmethod
    def backward(ctx, *grads) -> torch.Tensor:
        fn = ctx.fn
        (x,) = ctx.saved_tensors
        self = ctx.self
        shards = ctx.shards
        compute_params = ctx.compute_params
        is_tuple_output = ctx.is_tuple_output

        x_requires_grad = x.requires_grad
        x = x.detach()
        x.requires_grad_(x_requires_grad)

        incoming_grad = grads[0]
        x_grad = torch.zeros_like(x)
        x_shards = list(torch.chunk(x, chunks=shards, dim=1))

        # Create a gradient accumulator for parameters
        grad_accumulator = GradientAccumulator(compute_params, shards, dtype=x.dtype)

        shard_step = x_shards[0].numel()
        for i, x_shard in enumerate(x_shards):
            x_shard.requires_grad_(x_requires_grad)

            shard_offset = i * shard_step
            x_shard.grad = (
                x_grad.view(-1)
                .narrow(0, shard_offset, x_shard.numel())
                .view_as(x_shard)
            )
            incoming_grad_shard = (
                incoming_grad.view(-1)
                .narrow(0, shard_offset, x_shard.numel())
                .view_as(x_shard)
            )

            # Install hooks for this shard
            is_last_shard = i + 1 == shards
            grad_accumulator.install_hooks(is_last_shard)

            with torch.enable_grad():
                output = fn(self, x_shard)
            if is_tuple_output:
                torch.autograd.backward(output[0], incoming_grad_shard)
            else:
                torch.autograd.backward(output, incoming_grad_shard)

        # Clean up hooks
        grad_accumulator.cleanup()
        del grad_accumulator

        return (None, None, x_grad, None, None)


class GradientAccumulator:
    """
    Manual gradient accumulator for TiledMLP with configurable precision
    Accumulates in specified dtype and rescales the gradient at the end
    """

    def __init__(
        self,
        params: List[torch.nn.Parameter],
        total_shards: int,
        dtype: torch.dtype | None = None,
    ):
        self.params = params
        self.total_shards = total_shards
        self.grad_accumulation_dtype = dtype or torch.float32
        self.accumulated_grads = {}
        self.hooks = []
        self.lock = threading.Lock()
        self.gradient_scale = 1.0 / total_shards

        # Initialize accumulated gradients in the specified dtype
        for param in self.params:
            if param.grad is not None:
                self.accumulated_grads[param] = param.grad.to(
                    self.grad_accumulation_dtype
                )
                param.grad = None
            else:
                self.accumulated_grads[param] = torch.zeros_like(
                    param, dtype=self.grad_accumulation_dtype
                )

    def install_hooks(self, is_last_shard: bool):
        """Install gradient hooks that accumulate gradients in higher precision"""

        def create_hook(param):
            def hook(grad):
                with self.lock:
                    grad_to_accum_dtype = grad.to(self.grad_accumulation_dtype)
                    scaled_grad = grad_to_accum_dtype * self.gradient_scale

                    if param in self.accumulated_grads:
                        self.accumulated_grads[param] += scaled_grad
                    else:
                        self.accumulated_grads[param] = scaled_grad.clone()

                    # Only assign the averaged gradient on the last shard
                    if is_last_shard:
                        param.grad = self.accumulated_grads[param].to(param.dtype)
                        return param.grad
                    return None

            return hook

        # Install hooks on all parameters
        for param in self.params:
            if param.requires_grad:
                hook = param.register_hook(create_hook(param))
                self.hooks.append(hook)

    def cleanup(self):
        """Remove all installed hooks"""
        for hook in self.hooks:
            hook.remove()
        self.hooks.clear()
        del self.accumulated_grads


================================================
FILE: src/axolotl/monkeypatch/tiled_mlp/patch.py
================================================
"""Monkeypatch for Tiled MLP implementation"""

import math
import os

import torch
import torch.distributed as dist

from axolotl.utils.callbacks.models import get_causal_lm_model_cls_prefix
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def patch_tiled_mlp(model_type, use_original_mlp=True, cfg_num_shards=None):
    from deepspeed.runtime.sequence_parallel.ulysses_sp import (
        TiledMLP as DeepSpeedTiledMLP,
    )

    from axolotl.monkeypatch.tiled_mlp.base import DeepSpeedTiledMLPMoE, TiledMLP

    try:
        # Dynamically import the module and MLP class
        module_path = f"transformers.models.{model_type}.modeling_{model_type}"
        model_cls_prefix, _ = get_causal_lm_model_cls_prefix(model_type)
        module = __import__(module_path, fromlist=[f"{model_cls_prefix}MLP"])
        mlp_cls = getattr(module, f"{model_cls_prefix}MLP")

        if use_original_mlp:
            mlp_forward = mlp_cls.forward
        else:

            def generic_mlp_forward(self_, hs):
                return self_.down_proj(
                    self_.act_fn(self_.gate_proj(hs)) * self_.up_proj(hs)
                )

            mlp_forward = torch.compile(generic_mlp_forward)

        is_distributed = int(os.environ.get("WORLD_SIZE", 1)) > 1

        def tiled_mlp_forward(self, x):
            input_shape = x.shape
            seqlen = input_shape[-2]
            hidden = input_shape[-1]
            if cfg_num_shards is None:
                num_shards = math.ceil(seqlen / hidden)
                if is_distributed:
                    num_shards_tensor = torch.tensor(num_shards, device=x.device)
                    dist.all_reduce(num_shards_tensor, op=dist.ReduceOp.MAX)
                    num_shards = num_shards_tensor.item()
            else:
                num_shards = cfg_num_shards

            if not self._compute_params:
                self._compute_params = [p for p in self.parameters() if p.requires_grad]

            compute_params = self._compute_params
            if not self._tiled_mlp_dist_impl:
                if (
                    self._compute_params
                    and any(
                        hasattr(p, "ds_id") or hasattr(p, "param_idx_in_group")
                        for p in self._compute_params
                    )
                ) or os.environ.get("ACCELERATE_USE_DEEPSPEED", "false") == "true":
                    if model_type == "gpt_oss":
                        self._tiled_mlp_dist_impl = DeepSpeedTiledMLPMoE
                    else:
                        self._tiled_mlp_dist_impl = DeepSpeedTiledMLP
                else:
                    self._tiled_mlp_dist_impl = TiledMLP

            down_res = self._tiled_mlp_dist_impl.apply(
                mlp_forward,
                self,
                x,
                num_shards,
                compute_params,
            )
            return down_res

        mlp_cls.forward = tiled_mlp_forward
        mlp_cls._compute_params = []
        mlp_cls._tiled_mlp_dist_impl = None
        LOG.info(
            f"Successfully monkey-patched TiledMLP for model_type: {model_type}",
        )
    except (ImportError, AttributeError) as e:
        raise RuntimeError(
            f"Could not import MLP class for model_type: {model_type}. Error: {str(e)}"
        ) from e


================================================
FILE: src/axolotl/monkeypatch/trainer/__init__.py
================================================
from .utils import entropy_from_logits, selective_log_softmax

__all__ = ["entropy_from_logits", "selective_log_softmax"]


================================================
FILE: src/axolotl/monkeypatch/trainer/lr.py
================================================
"""
monkeypatch for Trainer _get_learning_rate method
"""

import torch

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


# TODO remove this patch once https://github.com/huggingface/transformers/pull/37881 is included in a release
def _get_learning_rate(self):
    if self.is_deepspeed_enabled:
        # with deepspeed's fp16 and dynamic loss scale enabled the optimizer/scheduler steps may
        # not run for the first few dozen steps while loss scale is too large, and thus during
        # that time `get_last_lr` will fail if called during that warm up stage, so work around it:
        try:
            last_lr = self.lr_scheduler.get_last_lr()[0]
        except AssertionError as e:
            if "need to call step" in str(e):
                LOG.warning(
                    "tried to get lr value before scheduler/optimizer started stepping, returning lr=0"
                )
                last_lr = 0
            else:
                raise
    else:
        if isinstance(self.lr_scheduler, torch.optim.lr_scheduler.ReduceLROnPlateau):
            last_lr = self.optimizer.param_groups[0]["lr"]
        else:
            last_lr = self.lr_scheduler.get_last_lr()[0]

    if torch.is_tensor(last_lr):
        last_lr = last_lr.item()
    return last_lr


def patch_trainer_get_lr():
    from transformers.trainer import Trainer

    Trainer._get_learning_rate = _get_learning_rate


================================================
FILE: src/axolotl/monkeypatch/trainer/trl.py
================================================
"""Monkeypatch for TRL trainer FSDP preparation."""


def prepare_fsdp(model, accelerator):
    from axolotl.monkeypatch.accelerate.fsdp2 import fsdp2_prepare_model

    return fsdp2_prepare_model(accelerator, model)


def patch_trl_prepare_fsdp2():
    import trl.models.utils

    trl.models.utils.prepare_fsdp = prepare_fsdp


================================================
FILE: src/axolotl/monkeypatch/trainer/trl_vllm.py
================================================
"""Monkeypatches for TRL's vLLM integration and trainer utils.

Adds:
- VLLMClient.batch_update_named_params: batched weight sync (fewer HTTP round-trips)
- extract_logprobs: NaN→0.0 fix (prevents downstream NaN propagation)
- VLLMGeneration: weight_sync_chunk_size + batched sync path for non-FSDP/non-ZeRO
- split_tensor_dict / shuffle_sequence_dict: scalar type handling (int/float/bool passthrough)
"""

import logging
import math
from functools import wraps

import torch
from torch import nn

LOG = logging.getLogger(__name__)


def _batch_update_named_params(
    self, params: list[tuple[str, torch.Tensor]], chunk_size: int | None = None
):
    """Batched weight sync — sends param metadata via HTTP, tensors via NCCL."""
    from transformers import is_torch_xpu_available

    if chunk_size is None:
        chunks = [params]
    else:
        chunks = []
        current_chunk: list[tuple[str, torch.Tensor]] = []
        current_elements = 0
        for name, weights in params:
            n_elem = weights.numel()
            if current_chunk and current_elements + n_elem > chunk_size:
                chunks.append(current_chunk)
                current_chunk = []
                current_elements = 0
            current_chunk.append((name, weights))
            current_elements += n_elem
        if current_chunk:
            chunks.append(current_chunk)

    for chunk in chunks:
        param_metadata = [
            {"name": name, "dtype": str(weights.dtype), "shape": list(weights.shape)}
            for name, weights in chunk
        ]
        url = f"{self.base_url}/batch_update_named_params/"
        response = self.session.post(url, json={"params": param_metadata})
        if response.status_code != 200:
            raise Exception(f"Request failed: {response.status_code}, {response.text}")

        for _name, weights in chunk:
            if is_torch_xpu_available():
                self.communicator.broadcast(weights, root=self.rank)
            else:
                self.communicator.broadcast(weights, src=self.rank)

        if is_torch_xpu_available():
            self.communicator.barrier()
        else:
            self.communicator.group.barrier()


def _update_model_params(self, model: nn.Module, chunk_size: int | None = None):
    """Updates all model params using batch_update_named_params."""
    params = [(name, param.data) for name, param in model.named_parameters()]
    self.batch_update_named_params(params, chunk_size=chunk_size)


def _patched_extract_logprobs(all_outputs):
    """extract_logprobs with NaN→0.0 fix (stock TRL uses None which causes downstream errors)."""
    all_logprobs = []
    all_token_ids = []

    for outputs in all_outputs:
        for output in outputs.outputs:
            if output.logprobs is None:
                return None, None
            seq_logprobs = []
            seq_token_ids = []
            for lp in output.logprobs:
                sorted_items = sorted(lp.items(), key=lambda x: x[1].rank)
                seq_token_ids.append([token_id for token_id, _ in sorted_items])
                seq_logprobs.append(
                    [
                        0.0 if math.isnan(item.logprob) else item.logprob
                        for _, item in sorted_items
                    ]
                )
            all_logprobs.append(seq_logprobs)
            all_token_ids.append(seq_token_ids)

    return all_logprobs, all_token_ids


def _patched_split_tensor_dict(tensor_dict, num_chunks):
    """split_tensor_dict that handles scalar types (int/float/bool) for num_items_in_batch."""
    first_tensor = next(
        tensor
        for tensor in tensor_dict.values()
        if tensor is not None and isinstance(tensor, torch.Tensor) and tensor.ndim > 0
    )
    chunk_size = first_tensor.shape[0] // num_chunks
    chunks = []
    for i in range(num_chunks):
        chunk_dict = {}
        for key, tensor in tensor_dict.items():
            if isinstance(tensor, (int, float, bool)):
                chunk_dict[key] = tensor
            elif tensor is not None and (isinstance(tensor, list) or tensor.ndim > 0):
                chunk_dict[key] = tensor[i * chunk_size : (i + 1) * chunk_size]
            elif tensor is not None and tensor.ndim == 0:
                chunk_dict[key] = tensor
            else:
                chunk_dict[key] = None
        chunks.append(chunk_dict)
    return chunks


def _patched_shuffle_sequence_dict(seq_dict):
    """shuffle_sequence_dict that handles scalar types (int/float/bool)."""
    first_seq = next(
        v
        for v in seq_dict.values()
        if v is not None and isinstance(v, (torch.Tensor, list)) and len(v) > 0
    )
    perm = torch.randperm(len(first_seq))

    def permute(v):
        if v is None:
            return None
        if isinstance(v, (int, float, bool)):
            return v
        if isinstance(v, torch.Tensor) and v.ndim == 0:
            return v
        if isinstance(v, torch.Tensor) and v.ndim >= 1:
            return v[perm]
        if isinstance(v, list):
            return [v[i] for i in perm.tolist()]
        return v

    return {k: permute(v) for k, v in seq_dict.items()}


def _patch_sync_weights_batched(original_init):
    """Wrap VLLMGeneration.__init__ to accept weight_sync_chunk_size."""

    @wraps(original_init)
    def patched_init(self, *args, weight_sync_chunk_size=None, **kwargs):
        original_init(self, *args, **kwargs)
        self.weight_sync_chunk_size = weight_sync_chunk_size

    return patched_init


def _make_batched_sync_weights(original_sync_weights):
    """Wrap sync_weights to use batched sync for non-FSDP/non-ZeRO paths."""

    @wraps(original_sync_weights)
    def patched_sync_weights(self):
        from accelerate.utils import is_peft_model

        # Check if we're in a non-PEFT, non-FSDP, non-ZeRO scenario where batching helps
        accelerator = self.accelerator
        model = self.model
        is_fsdp_enabled = self.is_fsdp_enabled

        deepspeed_plugin = accelerator.state.deepspeed_plugin
        zero_stage_3 = deepspeed_plugin is not None and deepspeed_plugin.zero_stage == 3

        is_peft = is_peft_model(model)

        # If PEFT, FSDP, or ZeRO-3, fall back to original (which handles those cases)
        if is_peft or is_fsdp_enabled or zero_stage_3:
            return original_sync_weights(self)

        # Non-PEFT, non-FSDP, non-ZeRO: use batched sync
        if self.mode == "colocate" and getattr(self, "enable_sleep_mode", False):
            from vllm.distributed.device_communicators.cuda_wrapper import (
                empty_cache,
            )

            empty_cache()
            self.llm.wake_up(tags=["weights"])

        if self.mode == "server" and accelerator.is_main_process:
            params = [
                (self._fix_param_name_to_vllm(name), param.data)
                for name, param in model.named_parameters()
            ]
            self.vllm_client.batch_update_named_params(
                params, chunk_size=getattr(self, "weight_sync_chunk_size", None)
            )
        elif self.mode == "colocate":
            llm_model = (
                self.llm.llm_engine.model_executor.driver_worker.model_runner.model
            )
            weights = [
                (self._fix_param_name_to_vllm(name), param.data)
                for name, param in model.named_parameters()
            ]
            llm_model.load_weights(weights=weights)

        # Reset cache
        if self.mode == "server" and accelerator.is_main_process:
            self.vllm_client.reset_prefix_cache()
        elif self.mode == "colocate":
            self.llm.reset_prefix_cache()

    return patched_sync_weights


def patch_trl_vllm():
    """Apply all TRL vLLM monkeypatches."""
    import trl.generation.vllm_client
    import trl.generation.vllm_generation
    import trl.trainer.utils

    VLLMClient = trl.generation.vllm_client.VLLMClient
    VLLMGeneration = trl.generation.vllm_generation.VLLMGeneration

    # 1. Add batch_update_named_params to VLLMClient
    if not hasattr(VLLMClient, "batch_update_named_params"):
        VLLMClient.batch_update_named_params = _batch_update_named_params
        VLLMClient.update_model_params = _update_model_params
        LOG.info("Patched VLLMClient with batch_update_named_params")

    # 2. Patch extract_logprobs (NaN→0.0)
    trl.generation.vllm_generation.extract_logprobs = _patched_extract_logprobs
    LOG.info("Patched extract_logprobs with NaN→0.0 fix")

    # 3. Patch VLLMGeneration.__init__ to accept weight_sync_chunk_size
    VLLMGeneration.__init__ = _patch_sync_weights_batched(VLLMGeneration.__init__)

    # 4. Patch sync_weights for batched non-FSDP/non-ZeRO path
    VLLMGeneration.sync_weights = _make_batched_sync_weights(
        VLLMGeneration.sync_weights
    )
    LOG.info("Patched VLLMGeneration with batched sync_weights")

    # 5. Patch split_tensor_dict and shuffle_sequence_dict
    trl.trainer.utils.split_tensor_dict = _patched_split_tensor_dict
    trl.trainer.utils.shuffle_sequence_dict = _patched_shuffle_sequence_dict
    LOG.info("Patched split_tensor_dict and shuffle_sequence_dict for scalar types")


================================================
FILE: src/axolotl/monkeypatch/trainer/utils.py
================================================
# Copyright 2026 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import torch
import torch.nn.functional as F
import triton
import triton.language as tl


@triton.jit
def _entropy_online_kernel(
    logits_ptr,
    output_ptr,
    stride_row,
    V: tl.constexpr,
    BLOCK_V: tl.constexpr,
):
    """Online entropy: single pass with running max correction."""
    row = tl.program_id(0)
    row_ptr = logits_ptr + tl.cast(row, tl.int64) * stride_row

    running_max = tl.full([], float("-inf"), dtype=tl.float32)
    running_sum_exp = tl.full([], 0.0, dtype=tl.float32)
    running_weighted = tl.full([], 0.0, dtype=tl.float32)

    for v_start in range(0, V, BLOCK_V):
        offs = v_start + tl.arange(0, BLOCK_V)
        mask = offs < V
        x = tl.load(row_ptr + offs, mask=mask, other=float("-inf")).to(tl.float32)

        block_max = tl.max(x, axis=0)
        new_max = tl.maximum(running_max, block_max)

        correction = tl.exp(running_max - new_max)
        running_sum_exp = running_sum_exp * correction
        running_weighted = running_weighted * correction

        exp_x = tl.exp(x - new_max)
        exp_x = tl.where(mask, exp_x, 0.0)
        x = tl.where(mask, x, 0.0)
        running_sum_exp += tl.sum(exp_x, axis=0)
        running_weighted += tl.sum(exp_x * x, axis=0)

        running_max = new_max

    entropy = tl.log(running_sum_exp) + running_max - running_weighted / running_sum_exp
    tl.store(output_ptr + row, entropy)


@triton.jit
def _entropy_online_kernel_strided(
    logits_ptr,
    output_ptr,
    stride_outer,
    stride_inner,
    n_inner,
    row_offset,
    V: tl.constexpr,
    BLOCK_V: tl.constexpr,
):
    """Online entropy for non-contiguous 3D (B, L, V) tensors."""
    local_row = tl.program_id(0)
    row = local_row + row_offset
    outer_idx = row // n_inner
    inner_idx = row % n_inner
    off = outer_idx.to(tl.int64) * stride_outer + inner_idx.to(tl.int64) * stride_inner
    row_ptr = logits_ptr + off

    running_max = tl.full([], float("-inf"), dtype=tl.float32)
    running_sum_exp = tl.full([], 0.0, dtype=tl.float32)
    running_weighted = tl.full([], 0.0, dtype=tl.float32)

    for v_start in range(0, V, BLOCK_V):
        offs = v_start + tl.arange(0, BLOCK_V)
        mask = offs < V
        x = tl.load(row_ptr + offs, mask=mask, other=float("-inf")).to(tl.float32)

        block_max = tl.max(x, axis=0)
        new_max = tl.maximum(running_max, block_max)

        correction = tl.exp(running_max - new_max)
        running_sum_exp = running_sum_exp * correction
        running_weighted = running_weighted * correction

        exp_x = tl.exp(x - new_max)
        exp_x = tl.where(mask, exp_x, 0.0)
        x = tl.where(mask, x, 0.0)
        running_sum_exp += tl.sum(exp_x, axis=0)
        running_weighted += tl.sum(exp_x * x, axis=0)

        running_max = new_max

    entropy = tl.log(running_sum_exp) + running_max - running_weighted / running_sum_exp
    tl.store(output_ptr + local_row, entropy)


def entropy_from_logits(logits: torch.Tensor, chunk_size: int = 128) -> torch.Tensor:
    """Triton-fused entropy (online single-pass). Handles non-contiguous tensors without copying."""
    original_shape = logits.shape[:-1]
    V = logits.shape[-1]
    N = 1
    for s in original_shape:
        N *= s

    if not logits.is_cuda:
        # CPU fallback: stable entropy via log_softmax
        logp = F.log_softmax(logits.float(), dim=-1)
        ent = -(logp.exp() * logp).sum(dim=-1)
        return ent.to(logits.dtype).reshape(original_shape)

    output = torch.empty(N, device=logits.device, dtype=torch.float32)

    BLOCK_V = 4096
    MAX_GRID_CONTIG = 8192
    MAX_GRID_STRIDED = 2048

    # Vocab (last) dim must be contiguous for coalesced loads
    if logits.stride(-1) != 1:
        logits = logits.contiguous()

    if logits.is_contiguous():
        flat_logits = logits.reshape(-1, V)
        stride = flat_logits.stride(0)
        for start in range(0, N, MAX_GRID_CONTIG):
            n_rows = min(MAX_GRID_CONTIG, N - start)
            _entropy_online_kernel[(n_rows,)](
                flat_logits[start], output[start], stride, V=V, BLOCK_V=BLOCK_V
            )
    elif logits.ndim == 3:
        stride_outer = logits.stride(0)
        stride_inner = logits.stride(1)
        n_inner = logits.shape[1]
        for start in range(0, N, MAX_GRID_STRIDED):
            n_rows = min(MAX_GRID_STRIDED, N - start)
            _entropy_online_kernel_strided[(n_rows,)](
                logits,
                output[start],
                stride_outer,
                stride_inner,
                n_inner,
                start,
                V=V,
                BLOCK_V=BLOCK_V,
            )
    else:
        logits = logits.contiguous()
        flat_logits = logits.reshape(-1, V)
        stride = flat_logits.stride(0)
        for start in range(0, N, MAX_GRID_CONTIG):
            n_rows = min(MAX_GRID_CONTIG, N - start)
            _entropy_online_kernel[(n_rows,)](
                flat_logits[start], output[start], stride, V=V, BLOCK_V=BLOCK_V
            )

    return output.to(logits.dtype).reshape(original_shape)


# ---------------------------------------------------------------------------
# selective_log_softmax — fused forward + backward Triton kernels
# ---------------------------------------------------------------------------


def selective_log_softmax_original(logits, index) -> torch.Tensor:
    """Original selective_log_softmax (reference/fallback)."""
    squeeze = index.ndim == logits.ndim - 1
    if squeeze:
        index = index.unsqueeze(-1)

    if logits.dtype in [torch.float32, torch.float64]:
        selected_logits = torch.gather(logits, dim=-1, index=index)
        logsumexp_values = torch.stack([torch.logsumexp(lg, dim=-1) for lg in logits])
        per_token_logps = selected_logits - logsumexp_values.unsqueeze(-1)
    else:
        per_token_logps = []
        for row_logits, row_labels in zip(logits, index, strict=True):
            row_logps = F.log_softmax(row_logits, dim=-1)
            row_per_token_logps = row_logps.gather(dim=-1, index=row_labels)
            per_token_logps.append(row_per_token_logps)
        per_token_logps = torch.stack(per_token_logps)

    if squeeze:
        per_token_logps = per_token_logps.squeeze(-1)

    return per_token_logps


@triton.jit
def _selective_logsoftmax_fwd_kernel(
    logits_ptr,
    index_ptr,
    output_ptr,
    logsumexp_ptr,
    stride_logits_row,
    stride_index_row,
    stride_output_row,
    actual_K,
    K_BLOCK: tl.constexpr,
    V: tl.constexpr,
    BLOCK_V: tl.constexpr,
):
    """Forward: online logsumexp + gather. Saves logsumexp for backward."""
    row = tl.program_id(0)
    logits_row_ptr = logits_ptr + tl.cast(row, tl.int64) * stride_logits_row

    # Online logsumexp
    running_max = tl.full([], float("-inf"), dtype=tl.float32)
    running_sum_exp = tl.full([], 0.0, dtype=tl.float32)

    for v_start in range(0, V, BLOCK_V):
        offs = v_start + tl.arange(0, BLOCK_V)
        mask = offs < V
        x = tl.load(logits_row_ptr + offs, mask=mask, other=float("-inf")).to(
            tl.float32
        )

        block_max = tl.max(x, axis=0)
        new_max = tl.maximum(running_max, block_max)
        running_sum_exp = running_sum_exp * tl.exp(running_max - new_max)

        exp_x = tl.exp(x - new_max)
        exp_x = tl.where(mask, exp_x, 0.0)
        running_sum_exp += tl.sum(exp_x, axis=0)
        running_max = new_max

    lse = tl.log(running_sum_exp) + running_max
    tl.store(logsumexp_ptr + row, lse)

    # Gather and subtract
    index_row_ptr = index_ptr + tl.cast(row, tl.int64) * stride_index_row
    output_row_ptr = output_ptr + tl.cast(row, tl.int64) * stride_output_row

    k_offs = tl.arange(0, K_BLOCK)
    k_mask = k_offs < actual_K
    indices = tl.load(index_row_ptr + k_offs, mask=k_mask, other=0).to(tl.int64)
    valid_mask = k_mask & (indices >= 0) & (indices < V)
    safe_indices = tl.where(valid_mask, indices, 0)
    selected = tl.load(logits_row_ptr + safe_indices, mask=valid_mask, other=0.0).to(
        tl.float32
    )
    tl.store(output_row_ptr + k_offs, selected - lse, mask=valid_mask)


@triton.jit
def _selective_logsoftmax_bwd_kernel(
    grad_output_ptr,
    logits_ptr,
    index_ptr,
    logsumexp_ptr,
    grad_logits_ptr,
    stride_grad_out_row,
    stride_logits_row,
    stride_index_row,
    stride_grad_logits_row,
    actual_K,
    K_BLOCK: tl.constexpr,
    V: tl.constexpr,
    BLOCK_V: tl.constexpr,
):
    """Backward: d_logits[j] = -softmax(x)[j] * sum(grad_out) + (grad_out[k] if j == index[k]).

    Single fused pass over V. For each tile, computes the base gradient and adds
    scatter contributions inline by checking which indices fall in the current tile.
    No separate scatter pass — no read-after-write issues.
    """
    row = tl.program_id(0)
    logits_row_ptr = logits_ptr + tl.cast(row, tl.int64) * stride_logits_row
    grad_logits_row_ptr = (
        grad_logits_ptr + tl.cast(row, tl.int64) * stride_grad_logits_row
    )
    grad_out_row_ptr = grad_output_ptr + tl.cast(row, tl.int64) * stride_grad_out_row
    index_row_ptr = index_ptr + tl.cast(row, tl.int64) * stride_index_row

    lse = tl.load(logsumexp_ptr + row).to(tl.float32)

    # Load grad_output and indices (K_BLOCK elements, masked)
    k_offs = tl.arange(0, K_BLOCK)
    k_mask = k_offs < actual_K
    grad_out = tl.load(grad_out_row_ptr + k_offs, mask=k_mask, other=0.0).to(tl.float32)
    indices = tl.load(
        index_row_ptr + k_offs, mask=k_mask, other=-1
    )  # -1 = never matches
    valid_mask = k_mask & (indices >= 0) & (indices < V)
    grad_out = tl.where(valid_mask, grad_out, 0.0)
    indices = tl.where(valid_mask, indices, -1)
    grad_sum = tl.sum(grad_out, axis=0)

    # Fused pass: for each tile, compute -softmax * grad_sum + scatter
    for v_start in range(0, V, BLOCK_V):
        offs = v_start + tl.arange(0, BLOCK_V)  # [BLOCK_V]
        mask = offs < V
        x = tl.load(logits_row_ptr + offs, mask=mask, other=0.0).to(tl.float32)
        softmax_j = tl.exp(x - lse)
        softmax_j = tl.where(mask, softmax_j, 0.0)
        grad_j = -softmax_j * grad_sum

        # Scatter: check which selected indices fall in this tile
        # offs: [BLOCK_V], indices: [K_BLOCK]
        # Broadcast: offs[:, None] == indices[None, :] → [BLOCK_V, K_BLOCK]
        match = offs[:, None] == indices[None, :]  # [BLOCK_V, K_BLOCK]
        # Sum grad_out contributions: for each position j, sum grad_out[k] where index[k]==j
        scatter_contrib = tl.sum(
            tl.where(match, grad_out[None, :], 0.0), axis=1
        )  # [BLOCK_V]
        grad_j += scatter_contrib

        tl.store(grad_logits_row_ptr + offs, grad_j, mask=mask)


class _SelectiveLogSoftmaxTriton(torch.autograd.Function):
    @staticmethod
    def forward(ctx, flat_logits, flat_index, K, K_BLOCK, V, BLOCK_V, MAX_GRID):
        N = flat_logits.shape[0]
        output = torch.empty(N, K_BLOCK, device=flat_logits.device, dtype=torch.float32)
        logsumexp = torch.empty(N, device=flat_logits.device, dtype=torch.float32)

        for start in range(0, N, MAX_GRID):
            n_rows = min(MAX_GRID, N - start)
            _selective_logsoftmax_fwd_kernel[(n_rows,)](
                flat_logits[start],
                flat_index[start],
                output[start],
                logsumexp[start],
                flat_logits.stride(0),
                flat_index.stride(0),
                output.stride(0),
                K,
                K_BLOCK=K_BLOCK,
                V=V,
                BLOCK_V=BLOCK_V,
            )

        ctx.save_for_backward(flat_logits, flat_index, logsumexp)
        ctx.K = K
        ctx.K_BLOCK = K_BLOCK
        ctx.V = V
        ctx.BLOCK_V = BLOCK_V
        ctx.MAX_GRID = MAX_GRID
        return output

    @staticmethod
    def backward(ctx, grad_output):
        flat_logits, flat_index, logsumexp = ctx.saved_tensors
        K, K_BLOCK, V, BLOCK_V, MAX_GRID = (
            ctx.K,
            ctx.K_BLOCK,
            ctx.V,
            ctx.BLOCK_V,
            ctx.MAX_GRID,
        )
        N = flat_logits.shape[0]

        grad_logits = torch.empty_like(flat_logits)

        # grad_output may have K_BLOCK cols; backward kernel reads actual_K
        grad_output_contig = grad_output.contiguous()

        for start in range(0, N, MAX_GRID):
            n_rows = min(MAX_GRID, N - start)
            _selective_logsoftmax_bwd_kernel[(n_rows,)](
                grad_output_contig[start],
                flat_logits[start],
                flat_index[start],
                logsumexp[start],
                grad_logits[start],
                grad_output_contig.stride(0),
                flat_logits.stride(0),
                flat_index.stride(0),
                grad_logits.stride(0),
                K,
                K_BLOCK=K_BLOCK,
                V=V,
                BLOCK_V=BLOCK_V,
            )

        # Return grads for: flat_logits, flat_index, K, K_BLOCK, V, BLOCK_V, MAX_GRID
        return grad_logits, None, None, None, None, None, None


def selective_log_softmax(logits, index) -> torch.Tensor:
    """
    Fused selective_log_softmax with Triton forward+backward kernels.

    Equivalent to: torch.gather(logits.log_softmax(-1), dim=-1, index=index)
    """
    squeeze = index.ndim == logits.ndim - 1
    if squeeze:
        index = index.unsqueeze(-1)

    if not logits.is_cuda or logits.dtype == torch.float64:
        # Triton kernel computes in float32; fall back for float64 and CPU
        return selective_log_softmax_original(
            logits, index.squeeze(-1) if squeeze else index
        )

    V = logits.shape[-1]
    K = index.shape[-1]
    original_index_shape = index.shape

    flat_logits = logits.reshape(-1, V).contiguous()
    flat_index = index.reshape(-1, K).contiguous()

    BLOCK_V = 4096
    MAX_GRID = 8192
    K_BLOCK = max(1, triton.next_power_of_2(K))

    output = _SelectiveLogSoftmaxTriton.apply(
        flat_logits, flat_index, K, K_BLOCK, V, BLOCK_V, MAX_GRID
    )

    if K_BLOCK != K:
        output = output[:, :K]

    per_token_logps = output.to(logits.dtype).reshape(original_index_shape)

    if squeeze:
        per_token_logps = per_token_logps.squeeze(-1)

    return per_token_logps


================================================
FILE: src/axolotl/monkeypatch/trainer_accelerator_args.py
================================================
"""
allow adding additional kwargs to Accelerator init
"""

import inspect

from transformers import Trainer

from axolotl.monkeypatch.utils import detab_code
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

ORIGINAL_TRAINER_CODE = """
    # create accelerator object
    self.accelerator = Accelerator(**args)
"""

PATCHED_TRAINER_CODE = """
    if hasattr(self, "additional_accelerator_args"):
        additional_args = self.additional_accelerator_args(fp8=True, enable_fsdp_float8_all_gather={enable_fsdp_float8_all_gather}, **args)
        if additional_args:
            args.update(additional_args)

    # create accelerator object
    self.accelerator = Accelerator(**args)
"""


def get_create_accelerate_code() -> str:
    training_loop = inspect.getsource(Trainer.create_accelerator_and_postprocess)
    return training_loop


def check_create_accelerate_code_is_patchable() -> bool:
    create_code = get_create_accelerate_code()
    create_code, _ = detab_code(create_code)
    return ORIGINAL_TRAINER_CODE in create_code


def patch_create_accelerate_code_for_fp8(enable_fsdp_float8_all_gather: bool):
    """
    Monkeypatch create_accelerator_and_postprocess so it checks for additional kwargs.
    """

    try:
        create_code = get_create_accelerate_code()
    except OSError:
        return
    Trainer._original_create_accelerator_and_postprocess = create_code
    create_code, _ = detab_code(create_code)
    if ORIGINAL_TRAINER_CODE not in create_code:
        return

    patched_trainer_code = PATCHED_TRAINER_CODE.format(
        enable_fsdp_float8_all_gather=enable_fsdp_float8_all_gather
    )
    create_code = create_code.replace(ORIGINAL_TRAINER_CODE, patched_trainer_code)
    create_code = create_code.replace(
        "def create_accelerator_and_postprocess(",
        "def fixed_create_accelerator_and_postprocess(",
        1,
    )

    # load imports necessary
    import transformers.trainer

    items_to_import = []
    for item in dir(transformers.trainer):
        if item in create_code:
            items_to_import.append(item)

    exec(
        "from transformers.trainer import ("
        + ", ".join(x for x in items_to_import)
        + ")",
        globals(),
    )
    exec(create_code, globals())
    LOG.info("patching create_accelerator_and_postprocess to allow for overrides")
    Trainer.create_accelerator_and_postprocess = (
        fixed_create_accelerator_and_postprocess
    )


================================================
FILE: src/axolotl/monkeypatch/trainer_fsdp_optim.py
================================================
"""
fix for FSDP optimizer save in trainer w 4.47.0
"""

import inspect

from transformers import Trainer

from axolotl.monkeypatch.utils import detab_code
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

ORIGINAL_TRAINER_CODE = """
                if delay_optimizer_creation:
                    self.optimizer = self.accelerator.prepare(self.optimizer)
"""

PATCHED_TRAINER_CODE = """
                if delay_optimizer_creation:
                    model = self.accelerator.prepare(self.model)
"""


def get_training_loop_code() -> str:
    training_loop = inspect.getsource(Trainer._inner_training_loop)
    return training_loop


def check_training_loop_is_patchable() -> bool:
    training_loop = get_training_loop_code()
    training_loop, _ = detab_code(training_loop)
    return ORIGINAL_TRAINER_CODE in training_loop


def patch_training_loop_for_fsdp():
    """
    monkeypatch for fixing the training loop for fsdp with optimizer save
    """

    try:
        training_loop = get_training_loop_code()
    except OSError:
        return
    Trainer._original_inner_training_loop = training_loop
    training_loop, _ = detab_code(training_loop)
    if ORIGINAL_TRAINER_CODE not in training_loop:
        return

    training_loop = training_loop.replace(ORIGINAL_TRAINER_CODE, PATCHED_TRAINER_CODE)
    training_loop = training_loop.replace(
        "def _inner_training_loop(",
        "def _fixed_inner_training_loop(",
        1,
    )

    # load imports necessary
    import transformers.trainer

    items_to_import = []
    for item in dir(transformers.trainer):
        if item in training_loop:
            items_to_import.append(item)

    exec(
        "from transformers.trainer import ("
        + ", ".join(x for x in items_to_import)
        + ")",
        globals(),
    )
    exec(training_loop, globals())
    LOG.info("patching _inner_training_loop for fsdp optimizer save")
    Trainer._inner_training_loop = _fixed_inner_training_loop


================================================
FILE: src/axolotl/monkeypatch/transformers/__init__.py
================================================


================================================
FILE: src/axolotl/monkeypatch/transformers/trainer_context_parallel.py
================================================
"""Monkey patch to allow context parallelism with FlashAttention in HF Trainer."""

from __future__ import annotations

import importlib
import inspect

from transformers import Trainer

from axolotl.monkeypatch.utils import detab_code
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

GUARD_PATTERN = 'if model.config._attn_implementation != "sdpa":'
PATCHED_GUARD = 'if (attn_impl := (getattr(model.config, "_attn_implementation", None) or getattr(model.model.config, "_attn_implementation", None))) and attn_impl not in ("sdpa", "flash_attention_2"):'


def patch_prepare_context_parallel_inputs() -> None:
    """Relax the SDPA-only guard when running context parallelism with FlashAttention."""
    if getattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched", False):
        LOG.debug("Trainer._prepare_context_parallel_inputs already patched")
        return

    try:
        original_source = inspect.getsource(Trainer._prepare_context_parallel_inputs)
    except OSError as exc:  # pragma: no cover - occurs when source is unavailable
        LOG.warning("Unable to patch Trainer._prepare_context_parallel_inputs: %s", exc)
        return

    if GUARD_PATTERN not in original_source:
        LOG.warning(
            "Expected guard not found in Trainer._prepare_context_parallel_inputs; \n"
            "skipping FlashAttention context parallelism patch"
        )
        return

    patched_source = original_source.replace(GUARD_PATTERN, PATCHED_GUARD)
    patched_source, _ = detab_code(patched_source)
    patched_source = patched_source.replace(
        "def _prepare_context_parallel_inputs(",
        "def axolotl_prepare_context_parallel_inputs(",
        1,
    )

    module_name = Trainer.__module__
    module = importlib.import_module(module_name)

    # import symbols referenced in the method so exec can succeed
    items_to_import = []
    for item in dir(module):
        if item in patched_source:
            items_to_import.append(item)

    # Use a separate namespace to capture the exec'd function
    namespace = {}
    exec(f"from {module_name} import ({', '.join(items_to_import)})", namespace)
    exec(patched_source, namespace)

    # Explicitly get the function from the namespace
    axolotl_prepare_context_parallel_inputs = namespace[
        "axolotl_prepare_context_parallel_inputs"
    ]
    Trainer._original_prepare_context_parallel_inputs = (
        Trainer._prepare_context_parallel_inputs
    )
    Trainer._prepare_context_parallel_inputs = axolotl_prepare_context_parallel_inputs
    Trainer._axolotl_prepare_context_parallel_inputs_source = patched_source
    Trainer._axolotl_prepare_context_parallel_inputs_patched = True
    LOG.debug(
        "Patched Trainer._prepare_context_parallel_inputs for FlashAttention + CP"
    )


================================================
FILE: src/axolotl/monkeypatch/transformers/trainer_loss_calc.py
================================================
"""
Module for patching transformers Trainer loss calculation to use nanmean.

This is needed for context parallelism since chunks of the input sequences may be fully
masked and return NaNs in the loss calculation.

Also includes a patch for FSDP2 + torch.compile. We need to bundle this together with
the other evaluation_loop patch because we can't patch the same code twice without
raising an OSError.
"""

import importlib
import inspect

from transformers import Trainer

from axolotl.monkeypatch.utils import detab_code
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

ORIGINAL_EVAL_CODE = {
    "list": 'metrics[f"{metric_key_prefix}_loss"] = np.concatenate(all_losses).mean().item()',
    "array": 'metrics[f"{metric_key_prefix}_loss"] = all_losses.mean().item()',
}
PATCHED_EVAL_CODE = {
    "list": 'metrics[f"{metric_key_prefix}_loss"] = np.nanmean(np.concatenate(all_losses)).item()',
    "array": 'metrics[f"{metric_key_prefix}_loss"] = np.nanmean(all_losses).item()',
}

ORIGINAL_MAYBE_CODE = (
    "tr_loss_scalar = nested_gather(tr_loss, self.args.parallel_mode).mean().item()"
)
PATCHED_MAYBE_CODE = (
    "tr_loss_scalar = nested_gather(tr_loss, self.args.parallel_mode).nanmean().item()"
)


def check_evaluation_loop_is_patchable() -> bool:
    evaluation_loop_source = inspect.getsource(Trainer.evaluation_loop)
    return all(value in evaluation_loop_source for value in ORIGINAL_EVAL_CODE.values())


def patch_evaluation_loop():
    """Patch the evaluation_loop method."""
    # Check if already patched
    if hasattr(Trainer, "_original_evaluation_loop"):
        LOG.debug("Trainer.evaluation_loop already patched")
        return

    # Check if the patterns exist
    try:
        evaluation_loop_source = inspect.getsource(Trainer.evaluation_loop)
    except OSError:
        return
    Trainer.evaluation = evaluation_loop_source
    evaluation_loop_source, _ = detab_code(evaluation_loop_source)

    # Apply the nanmean patches
    evaluation_loop_source = evaluation_loop_source.replace(
        ORIGINAL_EVAL_CODE["list"], PATCHED_EVAL_CODE["list"]
    )
    evaluation_loop_source = evaluation_loop_source.replace(
        ORIGINAL_EVAL_CODE["array"], PATCHED_EVAL_CODE["array"]
    )

    # Rename the function to avoid conflicts
    evaluation_loop_source = evaluation_loop_source.replace(
        "def evaluation_loop(",
        "def axolotl_evaluation_loop(",
        1,
    )

    # Get the module for necessary imports
    module_name = Trainer.__module__
    module = importlib.import_module(module_name)

    # Import necessary items from the module
    items_to_import = []
    for item in dir(module):
        if item in evaluation_loop_source:
            items_to_import.append(item)

    # Execute the imports and patched method
    exec(
        f"from {module_name} import ({', '.join(items_to_import)})",
        globals(),
    )
    exec(evaluation_loop_source, globals())

    LOG.debug("Patched Trainer.evaluation_loop with nanmean loss calculation")
    Trainer.evaluation_loop = axolotl_evaluation_loop


def check_maybe_log_save_evaluate_is_patchable() -> bool:
    maybe_log_source = inspect.getsource(Trainer._maybe_log_save_evaluate)
    return ORIGINAL_MAYBE_CODE in maybe_log_source


def patch_maybe_log_save_evaluate():
    """Patch the _maybe_log_save_evaluate method."""
    # Check if already patched
    if hasattr(Trainer, "_original_maybe_log_save_evaluate"):
        LOG.info("Trainer._maybe_log_save_evaluate already patched")
        return

    # Check if the patterns exist
    try:
        maybe_log_source = inspect.getsource(Trainer._maybe_log_save_evaluate)
    except OSError:
        return
    Trainer._original_maybe_log_save_evaluate = maybe_log_source
    maybe_log_source, _ = detab_code(maybe_log_source)

    # Apply the patch
    maybe_log_source = maybe_log_source.replace(ORIGINAL_MAYBE_CODE, PATCHED_MAYBE_CODE)

    # Rename the function to avoid conflicts
    maybe_log_source = maybe_log_source.replace(
        "def _maybe_log_save_evaluate(",
        "def axolotl_maybe_log_save_evaluate(",
        1,
    )

    # Get the module for necessary imports
    module_name = Trainer.__module__
    module = importlib.import_module(module_name)

    # Import necessary items from the module
    items_to_import = []
    for item in dir(module):
        if item in maybe_log_source:
            items_to_import.append(item)

    # Execute the imports and patched method
    exec(
        f"from {module_name} import ({', '.join(items_to_import)})",
        globals(),
    )
    exec(maybe_log_source, globals())

    LOG.debug("Patched Trainer._maybe_log_save_evaluate with nanmean loss calculation")
    Trainer._maybe_log_save_evaluate = axolotl_maybe_log_save_evaluate


================================================
FILE: src/axolotl/monkeypatch/transformers_fa_utils.py
================================================
"""
see https://github.com/huggingface/transformers/pull/35834
"""

from functools import partial
from typing import Optional

import torch

from axolotl.utils.logging import get_logger

logger = get_logger(__name__)


def fixed_fa_peft_integration_check(
    query: torch.Tensor,
    key: torch.Tensor,
    value: torch.Tensor,
    target_dtype: Optional[torch.dtype] = None,
    preferred_dtype: Optional[torch.dtype] = None,
):
    """
    PEFT usually casts the layer norms in float32 for training stability reasons
    therefore the input hidden states gets silently casted in float32. Hence, we need
    cast them back in float16 / bfloat16 just to be sure everything works as expected.
    This might slowdown training & inference so it is recommended to not cast the LayerNorms!

    Args:
        query (`torch.Tensor`):
            Input query states to be passed to Flash Attention API
        key (`torch.Tensor`):
            Input key states to be passed to Flash Attention API
        value (`torch.Tensor`):
            Input value states to be passed to Flash Attention API
        target_dtype (`torch.dtype`, *optional*):
            The dtype to convert the attention tensors to. Conversion can be ignored by
            not providing the target dtype.
        preferred_dtype (`torch.dtype`, *optional*):
            The preferred dtype to convert the attention tensors to regardless of the
            target dtype.
    """
    if target_dtype is None and preferred_dtype is None:
        return query, key, value

    if preferred_dtype and target_dtype != preferred_dtype:
        target_dtype = preferred_dtype

    # check if any of query, key, or value are in float32. If so, cast them back to target dtype.
    if any(module.dtype == torch.float32 for module in [query, key, value]):
        logger.warning_once(
            f"The input hidden states seems to be silently casted in float32, this might be related to"
            f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
            f" {target_dtype}."
        )

        query = query.to(target_dtype)
        key = key.to(target_dtype)
        value = value.to(target_dtype)

    return query, key, value


def patch_fa_peft_integration():
    import transformers.modeling_flash_attention_utils

    transformers.modeling_flash_attention_utils.fa_peft_integration_check = partial(
        fixed_fa_peft_integration_check, preferred_dtype=None
    )


================================================
FILE: src/axolotl/monkeypatch/unsloth_.py
================================================
"""module for patching with unsloth optimizations"""

import inspect
import types

import torch
from peft import PeftModelForCausalLM
from torch import nn
from transformers.models.llama.modeling_llama import LlamaFlashAttention2

from axolotl.monkeypatch.utils import detab_code
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

ORIGINAL_QKV_CODE = """
    query_states = self.q_proj(hidden_states)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)
""".lstrip("\n")

PATCHED_QKV_CODE = """
    query_states, key_states, value_states = self.apply_qkv(self, hidden_states)
""".lstrip("\n")

ORIGINAL_O_CODE = """
    attn_output = self.o_proj(attn_output)
""".lstrip("\n")

PATCHED_O_CODE = """
    attn_output = self.apply_o(self, attn_output)
""".lstrip("\n")


def original_apply_qkv(self, hidden_states):
    query_states = self.q_proj(hidden_states)
    key_states = self.k_proj(hidden_states)
    value_states = self.v_proj(hidden_states)
    return query_states, key_states, value_states


def original_apply_o(self, hidden_states):
    attn_output = self.o_proj(hidden_states)
    return attn_output


def get_self_attn_code() -> str:
    forward = inspect.getsource(LlamaFlashAttention2.forward)
    return forward


def check_self_attn_is_patchable() -> bool:
    qkv = get_self_attn_code()
    qkv, _ = detab_code(qkv)
    return ORIGINAL_QKV_CODE in qkv and ORIGINAL_O_CODE in qkv


def integrate_cross_entropy_loss_patch(model_type: str = "llama") -> None:
    from unsloth.kernels.cross_entropy_loss import fast_cross_entropy_loss

    def UnslothForCausalLMLoss(
        logits,
        labels,
        vocab_size: int,
        num_items_in_batch: int = None,
        ignore_index: int = -100,
        **kwargs,
    ):
        # Upcast to float if we need to compute the loss to avoid potential precision issues
        logits = logits.float()
        # Shift so that tokens < n predict n
        shift_logits = logits[..., :-1, :].contiguous()
        shift_labels = labels[..., 1:].contiguous()

        loss = fast_cross_entropy_loss(
            logits=shift_logits, labels=shift_labels, n_items=num_items_in_batch
        )
        return loss

    if model_type == "llama":
        from transformers.loss import loss_utils

        loss_utils.ForCausalLMLoss = UnslothForCausalLMLoss  # type: ignore[assignment]
    else:
        raise ValueError("Unsupported model type")


self_attn_lora_patched = False


def patch_self_attn_lora():
    global self_attn_lora_patched
    if self_attn_lora_patched:
        # prevent patching multiple times
        return
    self_attn_forward = get_self_attn_code()
    LlamaFlashAttention2._original_forward = self_attn_forward
    self_attn_forward, _ = detab_code(self_attn_forward)
    assert ORIGINAL_QKV_CODE in self_attn_forward, "Original qkv code not found"
    assert ORIGINAL_O_CODE in self_attn_forward, "Original o code not found"

    self_attn_forward = self_attn_forward.replace(ORIGINAL_QKV_CODE, PATCHED_QKV_CODE)
    self_attn_forward = self_attn_forward.replace(ORIGINAL_O_CODE, PATCHED_O_CODE)
    self_attn_forward = self_attn_forward.replace(
        "def forward(",
        "def unsloth_attn_forward(",
        1,
    )

    # load imports necessary
    import transformers.models.llama.modeling_llama

    items_to_import = []
    for item in dir(transformers.models.llama.modeling_llama):
        if item in self_attn_forward:
            items_to_import.append(item)

    exec(
        "from transformers.models.llama.modeling_llama import ("
        + ", ".join(x for x in items_to_import)
        + ")",
        globals(),
    )
    exec(self_attn_forward, globals())
    self_attn_lora_patched = True
    LOG.info("patching unsloth attn lora")
    LlamaFlashAttention2.forward = unsloth_attn_forward


def integrate_rope_embeddings():
    import transformers.models.llama.modeling_llama
    from unsloth.kernels.rope_embedding import fast_rope_embedding

    def apply_rotary_pos_emb(
        q,
        k,
        cos,
        sin,
        position_ids=None,
        unsqueeze_dim=1,
    ):
        return fast_rope_embedding(q, k, cos, sin)

    LOG.info("patching unsloth RoPE embeddings")
    transformers.models.llama.modeling_llama.apply_rotary_pos_emb = apply_rotary_pos_emb


def integrate_lora_mlp_patch(peft_model: PeftModelForCausalLM):
    if peft_model.base_model.config.model_type in ["llama", "mistral"]:
        from unsloth.kernels import apply_lora_mlp_swiglu

        apply_lora_mlp = apply_lora_mlp_swiglu
    elif peft_model.base_model.config.model_type == "gemma":
        from unsloth.kernels import apply_lora_mlp_geglu_approx

        apply_lora_mlp = apply_lora_mlp_geglu_approx
    else:
        raise NotImplementedError(
            f"Model type {peft_model.base_model.config.model_type} not supported"
        )

    for idx, layer in enumerate(peft_model.model.model.layers):
        layer_modules = [
            getattr(layer.mlp, linear_proj)
            for linear_proj in ["gate_proj", "up_proj", "down_proj"]
        ]
        is_mlp_lora = all(hasattr(module, "lora_A") for module in layer_modules)
        mlp_no_bias = all(
            getattr(module, "base_layer", module).bias is None
            for module in layer_modules
        )
        mlp_not_dora = all(
            len(getattr(module, "lora_magnitude_vector", []) or []) == 0
            for module in layer_modules
        )

        if is_mlp_lora and mlp_no_bias and mlp_not_dora:
            layer.mlp.forward = types.MethodType(apply_lora_mlp, layer.mlp)
        else:
            LOG.warning(f"unable to apply unsloth lora mlp patch to layer {idx}")


def integrate_lora_patch(peft_model: PeftModelForCausalLM, cfg):
    from unsloth.kernels import apply_lora_o, apply_lora_qkv

    for idx, layer in enumerate(peft_model.model.model.layers):
        if cfg.unsloth_lora_qkv:
            layer_modules = [
                getattr(layer.self_attn, linear_proj)
                for linear_proj in ["q_proj", "k_proj", "v_proj"]
            ]
            is_qkv_lora = all(hasattr(module, "lora_A") for module in layer_modules)
            qkv_no_bias = all(
                getattr(module, "base_layer", module).bias is None
                for module in layer_modules
            )
            qkv_not_dora = all(
                len(getattr(module, "lora_magnitude_vector", []) or []) == 0
                for module in layer_modules
            )

            if is_qkv_lora and qkv_no_bias and qkv_not_dora:
                layer.self_attn.apply_qkv = apply_lora_qkv
            else:
                layer.self_attn.apply_qkv = original_apply_qkv
                LOG.warning(f"unable to apply unsloth lora qkv patch to layer {idx}")
        if cfg.unsloth_lora_o:
            layer_modules = [
                getattr(layer.self_attn, linear_proj) for linear_proj in ["o_proj"]
            ]
            is_o_lora = all(hasattr(module, "lora_A") for module in layer_modules)
            o_no_bias = all(
                getattr(module, "base_layer", module).bias is None
                for module in layer_modules
            )
            o_not_dora = all(
                len(getattr(module, "lora_magnitude_vector", []) or []) == 0
                for module in layer_modules
            )

            if is_o_lora and o_no_bias and o_not_dora:
                layer.self_attn.apply_o = apply_lora_o
            else:
                layer.self_attn.apply_o = original_apply_o
                LOG.warning(f"unable to apply unsloth lora o_proj patch to layer {idx}")


def patch_unsloth_layernorm():
    try:
        import transformers.models.llama.modeling_llama
        from unsloth.kernels.rms_layernorm import Fast_RMS_Layernorm

        class LlamaRMSNorm(nn.Module):
            """LlamaRMSNorm"""

            def __init__(self, hidden_size, eps=1e-6):
                """
                LlamaRMSNorm is equivalent to T5LayerNorm
                """
                super().__init__()
                self.weight = nn.Parameter(torch.ones(hidden_size))
                self.variance_epsilon = eps

            def forward(self, hidden_states):
                return Fast_RMS_Layernorm.apply(
                    hidden_states, self.weight, self.variance_epsilon, False
                )

        LOG.info("patching with unsloth.kernels.rms_layernorm")
        transformers.models.llama.modeling_llama.LlamaRMSNorm = LlamaRMSNorm
    except ImportError:
        LOG.warning("missing unsloth library")


================================================
FILE: src/axolotl/monkeypatch/utils.py
================================================
"""
Shared utils for the monkeypatches
"""

import re
from typing import Tuple

import torch
import torch.nn.functional as F


@torch.jit.script
def get_max_seqlen_in_batch(attention_mask: torch.Tensor) -> torch.Tensor:
    max_num = int(torch.max(attention_mask).item())
    batch_size, _ = attention_mask.shape
    counts = torch.zeros((batch_size, max_num), dtype=torch.int32)
    for i in range(1, max_num + 1):
        mask = attention_mask == i
        counts[:, i - 1] = torch.sum(mask, dim=-1).to(dtype=torch.int32)
    result = counts.flatten()
    nonzero_indices = torch.nonzero(result).squeeze(-1)
    return result[nonzero_indices]


@torch.jit.script
def get_unpad_data(attention_mask: torch.Tensor):
    device = attention_mask.device
    seqlens_in_batch = get_max_seqlen_in_batch(attention_mask)
    indices = torch.nonzero(attention_mask.flatten()).flatten()
    max_seqlen_in_batch = seqlens_in_batch.max().item()
    cu_seqlens = (
        F.pad(torch.cumsum(seqlens_in_batch, dim=0, dtype=torch.int32), (1, 0))
        .to(device=device)
        .detach()
    )
    return (
        indices,
        cu_seqlens,
        max_seqlen_in_batch,
    )


def get_cu_seqlens(attn_mask):
    """generate a cumulative sequence length mask for flash attention using attn mask"""
    if len(attn_mask.shape) == 1:
        attn_mask = attn_mask.unsqueeze(0)

    device = attn_mask.device
    results = []
    max_seq_lens = []

    for row in attn_mask:
        # Exclude zeros to avoid adding their positions to the mask
        t_non_zeros = row[row != 0]
        # Find where the sequence number changes (including the first position)
        seq_change = torch.cat(
            [
                torch.tensor([1], dtype=torch.int32, device=device),
                t_non_zeros[1:] != t_non_zeros[:-1],
            ]
        )
        # Get the indices where the sequence changes
        change_indices = torch.cat(
            [
                (seq_change == 1).nonzero(as_tuple=True)[0],
                torch.tensor([len(t_non_zeros)], dtype=torch.int32, device=device),
            ]
        )
        # Calculate the sequence lengths
        seq_lengths = change_indices[1:] - change_indices[:-1]
        # Calculate the length of the final sequence or padding
        final_seq_length = len(row) - change_indices[-1]
        # Append the length of the final sequence or padding to seq_lengths
        if final_seq_length.item():
            seq_lengths = torch.cat(
                [
                    seq_lengths,
                    torch.tensor(
                        [final_seq_length.item()], dtype=torch.int32, device=device
                    ),
                ]
            )
        # Calculate the cumulative sequence lengths
        cu_seqlens = torch.cat(
            [torch.tensor([0], dtype=torch.int32, device=device), seq_lengths.cumsum(0)]
        )
        max_seq_len = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
        results.append(cu_seqlens)
        max_seq_lens.append(max_seq_len)

    return torch.stack(results).to(dtype=torch.int32), torch.stack(max_seq_lens)


def get_cu_seqlens_from_pos_ids(
    position_ids: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """generate a cumulative sequence length mask for flash attention using pos ids"""
    if len(position_ids.shape) == 1:
        position_ids = position_ids.unsqueeze(0)

    device = position_ids.device
    results = []
    max_seq_lens = []

    for row in position_ids:
        # Count the number of consecutive zeros from the right side
        padding_length = (row == 0).int().flip(dims=[0]).cumprod(dim=0).sum().item()

        # Adjust the row to exclude padding
        adjusted_row = row[:-padding_length] if padding_length else row.clone()

        # Find where the position resets to 0 (indicating a new sequence)
        seq_starts = torch.cat(
            [
                torch.tensor([True], dtype=torch.bool, device=device),
                adjusted_row[1:] == 0,
            ]
        )
        # Get the indices where the sequence starts
        start_indices = torch.cat(
            [
                torch.nonzero(seq_starts).unbind(dim=1)[0],
                torch.tensor([len(adjusted_row)], dtype=torch.int32, device=device),
            ]
        )
        # Calculate the sequence lengths
        seq_lengths = start_indices[1:] - start_indices[:-1]
        # Calculate the cumulative sequence lengths
        cu_seqlens = torch.cat(
            [torch.tensor([0], dtype=torch.int32, device=device), seq_lengths.cumsum(0)]
        )
        # Append the padding length to the cumulative sequence lengths
        if padding_length:
            cu_seqlens = torch.cat(
                [cu_seqlens, torch.tensor([len(row)], dtype=torch.int32, device=device)]
            )
        max_seq_len = (cu_seqlens[1:] - cu_seqlens[:-1]).max()
        results.append(cu_seqlens)
        max_seq_lens.append(max_seq_len)

    # Find the maximum value across all tensors
    max_value = max(t.max() for t in results)

    # Find the length of the longest tensor
    max_length = max(t.size(0) for t in results)

    # Pad each tensor to the same length and collect them in a list
    padded_results = [
        F.pad(t, (0, max_length - t.size(0)), "constant", max_value) for t in results
    ]

    return torch.stack(padded_results).to(dtype=torch.int32), torch.stack(max_seq_lens)


def set_module_name(model, name, value):
    if "." in name:
        parent_name = name.rsplit(".", 1)[0]
        child_name = name[len(parent_name) + 1 :]
        parent = model.get_submodule(parent_name)
    else:
        parent_name = ""
        parent = model
        child_name = name

    setattr(parent, child_name, value)


def detab_code(code: str) -> Tuple[str, str]:
    try:
        spaces = re.match(r"([\s\t]{1,})", code).group(0)
        code = re.sub(r"^" + spaces, "", code, flags=re.MULTILINE)
    except AttributeError:
        return code, ""
    return code, spaces


================================================
FILE: src/axolotl/monkeypatch/xformers_/__init__.py
================================================
"""
Fused MLP layer for incrementally improved training efficiency
"""

import torch
from transformers.models.llama.modeling_llama import LlamaMLP
from xformers.ops import SwiGLU

from axolotl.monkeypatch.utils import set_module_name


class FusedMLP(torch.nn.Module):
    """
    Fused MLP layer for incrementally improved training efficiency
    """

    def __init__(
        self,
        config,
        gate_proj: torch.nn.Linear,
        up_proj: torch.nn.Linear,
        down_proj: torch.nn.Linear,
    ):
        super().__init__()
        self.config = config
        self.swiglu = SwiGLU(
            in_features=config.hidden_size,
            hidden_features=config.intermediate_size,
            bias=False,
            _pack_weights=True,
        )
        # overwrite initialized weights with pretrained weights
        self.swiglu.w12.weight.data = torch.cat(
            (gate_proj.weight.data, up_proj.weight.data), dim=0
        )
        self.swiglu.w3.weight.data = down_proj.weight.data

    def _post_training(self, model, name):
        w1, w2 = torch.split(
            self.swiglu.w12.weight.data, self.config.intermediate_size, dim=0
        )

        # Assign the split weights back to the original layers
        new_mlp = LlamaMLP(self.config)
        new_mlp.gate_proj.weight.data = w1
        new_mlp.up_proj.weight.data = w2
        new_mlp.down_proj.weight.data = self.swiglu.w3.weight.data

        set_module_name(model, name, new_mlp)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        return self.swiglu(x)


================================================
FILE: src/axolotl/processing_strategies.py
================================================
"""Module containing ProcessingStrategy classes and its derivative for different MultiModal Model types"""

from copy import deepcopy
from typing import Optional

from PIL import Image, ImageOps
from PIL.Image import Resampling
from torch import Tensor, zeros_like
from transformers import ProcessorMixin
from transformers.image_utils import load_image
from transformers.models.internvl import InternVLProcessor
from transformers.models.smolvlm import SmolVLMProcessor
from transformers.models.voxtral import VoxtralProcessor

from axolotl.utils.dict import remove_none_values
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


class ProcessingStrategy:
    """Base Processing Strategy class"""

    def __init__(
        self,
        processor: ProcessorMixin,
        chat_template: Optional[str] = None,
        image_size: int | tuple[int, int] | None = None,
        image_resize_algorithm: Resampling | None = None,
    ):
        self.processor = processor
        self.chat_template = chat_template
        self.image_token = None
        self.image_token_id = None

        self.image_size = image_size
        self.image_resize_algorithm = (
            image_resize_algorithm or Image.Resampling.BILINEAR
        )

        if hasattr(processor, "image_token"):
            self.image_token = processor.image_token
            self.image_token_id = processor.tokenizer.convert_tokens_to_ids(
                self.image_token
            )

    def __call__(self, examples: list[dict]) -> list[dict]:
        """
        Preprocess conversation examples to ensure consistent format.
        Converts different conversation formats to OpenAI format with 'messages'.
        Supports two formats:
        1. OpenAI format with 'messages'
        2. Legacy format with 'conversations'

        Args:
            examples: list of conversation dictionaries

        Returns:
            list of dicts in OpenAI format with 'messages' key

        Raises:
            ValueError: If the conversation format is not supported
        """
        role_mapping = {
            "human": "user",
            "gpt": "assistant",
        }

        def normalize_role(role: str) -> str:
            """Normalize role names to OpenAI format. Default to original role if not found."""
            return role_mapping.get(role, role)

        def convert_legacy_format(example: dict) -> dict:
            """Convert legacy 'conversations' format to OpenAI 'messages' format."""
            messages = [
                {"role": normalize_role(convo["from"]), "content": convo["value"]}
                for convo in example["conversations"]
            ]

            # Create new dict without 'conversations' key
            result = deepcopy(example)
            result.pop("conversations")
            result["messages"] = messages
            return result

        def convert_messages_to_multimedia_messages(messages: list[dict]) -> list[dict]:
            """Convert regular messages format to Messages format with content type"""

            new_messages = []
            for message in messages:
                if isinstance(message["content"], str):
                    new_messages.append(
                        {
                            "role": message["role"],
                            "content": [
                                {
                                    "type": "text",
                                    "text": message["content"],
                                }
                            ],
                        }
                    )
                elif isinstance(message["content"], list):
                    content = message["content"]

                    new_messages.append(
                        {
                            "role": message["role"],
                            "content": content,
                        }
                    )

            return new_messages

        processed_examples = []
        for example in examples:
            if not ("messages" in example or "conversations" in example):
                raise ValueError(
                    "Only `messages` and `conversations` message keys are currently supported."
                )

            processed_example = None
            if (
                "messages" in example and example["messages"] is not None
            ):  # OpenAI format
                processed_example = example
            else:  # Legacy format
                processed_example = convert_legacy_format(example)

            # convert regular messages format to Messages format with content type
            # for compatibility with apply_chat_template
            processed_example["messages"] = convert_messages_to_multimedia_messages(
                processed_example["messages"]
            )

            # find the image key if it exists
            possible_image_keys = ["images", "image"]
            image_key = None
            for key in possible_image_keys:
                if key in processed_example:
                    image_key = key
                    break

            # if the image key exists, add the image to the first user message
            if image_key is not None and processed_example[image_key] is not None:
                # TODO: check if it's normal to be single image only for common datasets
                # From observation, it's usually a list of single image but some datasets may have several columns for images
                # Temporary solution: take the first image and suggest people convert their datasets to use multi-content Messages
                if len(processed_example[image_key]) > 1:
                    LOG.warning(
                        f"Found {len(processed_example[image_key])} images in a sample. Using the first one."
                        "If you are using a dataset with multiple images per sample, please convert it to use multi-content Messages."
                        "See https://docs.axolotl.ai/docs/multimodal.html#dataset-format"
                    )

                image_value = processed_example[image_key][0]

                # Handle image loading (Image, url, path, base64)
                image_value = load_image(image_value)

                if self.image_size is not None:
                    assert hasattr(image_value, "resize"), (
                        "Image does not have a resize method"
                    )

                    if isinstance(self.image_size, tuple):
                        image_value = image_value.resize(
                            self.image_size, self.image_resize_algorithm
                        )
                    else:
                        # Set the padding value; here we use black (0, 0, 0) for RGB images
                        padding_color = (0, 0, 0)

                        # When image_size is an int (square target), preserve aspect ratio then pad
                        # This is to prevent aspect ratio distortion when resizing to square
                        image_value = ImageOps.pad(
                            image_value,
                            (self.image_size, self.image_size),
                            method=self.image_resize_algorithm,
                            color=padding_color,
                        )

                # Look for any image type in the first message
                # some dataset have an {type: "image"} in the first message
                msg_ind_to_add = None
                ind_to_add = None
                first_user_idx = None

                for msg_idx, msg_content in enumerate(processed_example["messages"]):
                    if first_user_idx is None and msg_content["role"] == "user":
                        first_user_idx = msg_idx
                    for i, content in enumerate(
                        processed_example["messages"][msg_idx]["content"]
                    ):
                        # Usually datasets created with image columns, don't have it in the messages itself
                        if content["type"] == "image" and all(
                            k not in content for k in ["image", "url", "path", "base64"]
                        ):
                            msg_ind_to_add = msg_idx
                            ind_to_add = i
                            break

                # If an image type is found, add the image to that index
                if ind_to_add is not None and msg_ind_to_add is not None:
                    processed_example["messages"][msg_ind_to_add]["content"][
                        ind_to_add
                    ]["image"] = image_value
                else:
                    # if no image type is found, add it to end of the first user message
                    if first_user_idx is None:
                        first_user_idx = 0
                    processed_example["messages"][first_user_idx]["content"].append(
                        {
                            "type": "image",
                            "image": image_value,
                        }
                    )

            processed_examples.append(remove_none_values(processed_example))

        return processed_examples

    def _mask_non_assistant(self, labels: Tensor) -> Tensor:
        """
        Mask non assistant regions to -100.
        To be implemented per subclass.
        """
        return labels

    def process_labels(self, input_ids: Tensor) -> Tensor:
        labels = input_ids.clone()

        labels = self._mask_non_assistant(labels)

        # The labels are the input_ids, and we mask the padding tokens in the loss computation
        labels[labels == self.processor.tokenizer.pad_token_id] = -100

        # Ignore the image token index in the loss computation (model specific)
        labels[labels == self.image_token_id] = -100

        return labels


class Qwen2VLProcessingStrategy(ProcessingStrategy):
    """Processing Strategy class for Qwen2-VL"""

    def __init__(
        self,
        processor: ProcessorMixin,
        chat_template: Optional[str] = None,
        image_size: int | tuple[int, int] | None = None,
        image_resize_algorithm: Resampling | None = None,
    ):
        super().__init__(processor, chat_template, image_size, image_resize_algorithm)
        self.image_token = "<|image_pad|>"  # nosec
        self.image_token_id = processor.tokenizer.convert_tokens_to_ids(
            self.image_token
        )


class Qwen3_5ProcessingStrategy(ProcessingStrategy):
    """Processing Strategy class for Qwen3.5 (early-fusion VLM)"""

    def __init__(
        self,
        processor: ProcessorMixin,
        chat_template: Optional[str] = None,
        image_size: int | tuple[int, int] | None = None,
        image_resize_algorithm: Resampling | None = None,
    ):
        super().__init__(processor, chat_template, image_size, image_resize_algorithm)
        self.image_token = "<|image_pad|>"  # nosec
        self.image_token_id = processor.tokenizer.convert_tokens_to_ids(
            self.image_token
        )
        self.video_token = "<|video_pad|>"  # nosec
        self.video_token_id = processor.tokenizer.convert_tokens_to_ids(
            self.video_token
        )

    def process_labels(self, input_ids):
        labels = super().process_labels(input_ids)
        labels[labels == self.video_token_id] = -100
        return labels


class Gemma3ProcessingStrategy(ProcessingStrategy):
    """Processing Strategy class for Gemma3"""

    def __init__(
        self,
        processor: ProcessorMixin,
        chat_template: Optional[str] = None,
        image_size: int | tuple[int, int] | None = None,
        image_resize_algorithm: Resampling | None = None,
    ):
        super().__init__(processor, chat_template, image_size, image_resize_algorithm)
        self.image_token = processor.tokenizer.special_tokens_map["boi_token"]
        self.image_token_id = processor.tokenizer.convert_tokens_to_ids(
            self.image_token
        )

    def process_labels(self, input_ids):
        labels = input_ids.clone()

        # Follows https://ai.google.dev/gemma/docs/core/huggingface_vision_finetune_qlora
        labels[labels == self.processor.tokenizer.pad_token_id] = -100
        labels[labels == self.image_token_id] = -100
        labels[labels == 262144] = -100  # corresponds to <image_soft_token>

        return labels


class Gemma3nProcessingStrategy(ProcessingStrategy):
    """Processing Strategy class for Gemma3n"""

    def _mask_non_assistant(self, labels: Tensor) -> Tensor:
        def _find_token_sequence(label, start_pos, token_sequence):
            """Check if token_sequence appears at start_pos in label"""
            if start_pos + len(token_sequence) > len(label):
                return False
            if label[start_pos] != token_sequence[0]:
                return False
            return (
                label[start_pos : start_pos + len(token_sequence)].tolist()
                == token_sequence
            )

        def _find_assistant_end(label, start_pos, assistant_end_tok, mask, i):
            """
            Find the end of assistant response and update mask accordingly

            Returns new position to continue from and whether the end seq is found
            """
            k = start_pos
            while k < len(label):
                if not _find_token_sequence(label, k, assistant_end_tok):
                    mask[i][k] = 1
                    k += 1
                    continue

                return k + len(assistant_end_tok), True

            return k, False

        mask = zeros_like(labels)

        assistant_start_str = "<start_of_turn>model"
        assistant_end_str = "<end_of_turn>"
        include_assistant_start_tok = False
        include_assistant_end_tok = True

        # str to tokens
        assistant_start_tok = self.processor.tokenizer.encode(
            assistant_start_str, add_special_tokens=False
        )
        assistant_end_tok = self.processor.tokenizer.encode(
            assistant_end_str, add_special_tokens=False
        )

        for i, label in enumerate(labels):
            j = 0
            # while loop through each tok index in labels[i]
            while j < len(label):
                # Check until match start seq
                if not _find_token_sequence(label, j, assistant_start_tok):
                    j += 1
                    continue

                if include_assistant_start_tok:
                    mask[i][j : j + len(assistant_start_tok)] = 1

                # Find where the assistant response ends
                start_of_content = j + len(assistant_start_tok)
                end_pos, found_end_seq = _find_assistant_end(
                    label, start_of_content, assistant_end_tok, mask, i
                )

                # Include end token if requested
                if include_assistant_end_tok and found_end_seq:
                    mask[i][end_pos - len(assistant_end_tok) : end_pos] = 1

                j = end_pos

            labels[i][mask[i] == 0] = -100

        return labels

    def process_labels(self, input_ids):
        labels = input_ids.clone()
        labels = self._mask_non_assistant(labels)

        # Follows https://colab.research.google.com/github/huggingface/huggingface-gemma-recipes/blob/main/notebooks/fine_tune_gemma3n_on_t4.ipynb
        labels[labels == self.processor.tokenizer.pad_token_id] = -100
        if hasattr(self.processor.tokenizer, "image_token_id"):
            labels[labels == self.processor.tokenizer.image_token_id] = -100
        if hasattr(self.processor.tokenizer, "audio_token_id"):
            labels[labels == self.processor.tokenizer.audio_token_id] = -100
        if hasattr(self.processor.tokenizer, "boi_token_id"):
            labels[labels == self.processor.tokenizer.boi_token_id] = -100
        if hasattr(self.processor.tokenizer, "eoi_token_id"):
            labels[labels == self.processor.tokenizer.eoi_token_id] = -100

        return labels


class VoxtralProcessingStrategy(ProcessingStrategy):
    """Processing Strategy class for Voxtral"""

    def __init__(
        self,
        processor: VoxtralProcessor,
        chat_template: Optional[str] = None,
        image_size: int | tuple[int, int] | None = None,
        image_resize_algorithm: Resampling | None = None,
    ):
        super().__init__(processor, chat_template, image_size, image_resize_algorithm)
        special_ids = (
            processor.tokenizer.tokenizer.instruct_tokenizer.audio_encoder.special_ids
        )

        self.audio_token = special_ids.audio
        self.begin_audio_token = special_ids.begin_audio

    def process_labels(self, input_ids):
        labels = input_ids.clone()

        labels[labels == self.processor.tokenizer.pad_token_id] = -100
        labels[labels == self.audio_token] = -100
        labels[labels == self.begin_audio_token] = -100

        return labels


class SmolVLM2ProcessingStrategy(ProcessingStrategy):
    """Processing Strategy class for SmolVLM2"""

    def __init__(
        self,
        processor: ProcessorMixin,
        chat_template: Optional[str] = None,
        image_size: int | tuple[int, int] | None = None,
        image_resize_algorithm: Resampling | None = None,
    ):
        super().__init__(processor, chat_template, image_size, image_resize_algorithm)
        self.image_token = "<image>"  # nosec

        self.image_token_id = processor.tokenizer.additional_special_tokens_ids[
            processor.tokenizer.additional_special_tokens.index(self.image_token)
        ]


class Mistral3ProcessingStrategy(ProcessingStrategy):
    """Processing Strategy class for Mistral3"""

    def __init__(
        self,
        processor,
        chat_template: Optional[str] = None,
        image_size: int | tuple[int, int] | None = None,
        image_resize_algorithm: Resampling | None = None,
    ):
        super().__init__(processor, chat_template, image_size, image_resize_algorithm)
        special_ids = (
            processor.tokenizer.tokenizer.instruct_tokenizer.image_encoder.special_ids
        )

        self.image_token = special_ids.img
        self.image_break_token = special_ids.img_break
        self.image_end_token = special_ids.img_end

    def process_labels(self, input_ids):
        labels = input_ids.clone()

        labels[labels == self.processor.tokenizer.pad_token_id] = -100
        labels[labels == self.image_token] = -100
        labels[labels == self.image_break_token] = -100
        labels[labels == self.image_end_token] = -100

        return labels


class InternVLProcessingStrategy(ProcessingStrategy):
    """Processing Strategy class for InternVL"""

    def __init__(
        self,
        processor: ProcessorMixin,
        chat_template: Optional[str] = None,
        image_size: int | tuple[int, int] | None = None,
        image_resize_algorithm: Resampling | None = None,
    ):
        super().__init__(processor, chat_template, image_size, image_resize_algorithm)

        if not hasattr(processor, "image_ids"):
            raise ValueError("'image_ids' missing from InternVL Processor.")

        self.image_token_ids = processor.image_ids

    def process_labels(self, input_ids):
        labels = input_ids.clone()

        labels[labels == self.processor.tokenizer.pad_token_id] = -100

        for ids in self.image_token_ids:
            labels[labels == ids] = -100

        # Note: Check if need to mask 'video_token' as it gets converted to
        # image patches during media processing

        return labels


class Glm4vProcessingStrategy(ProcessingStrategy):
    """Processing Strategy class for GLM4V and GLM4V-MoE vision models."""

    def __init__(
        self,
        processor: ProcessorMixin,
        chat_template: Optional[str] = None,
        image_size: int | tuple[int, int] | None = None,
        image_resize_algorithm: Resampling | None = None,
    ):
        super().__init__(processor, chat_template, image_size, image_resize_algorithm)

        self.tokenizer = getattr(processor, "tokenizer", processor)

        self.image_token = "<|image|>"  # nosec
        self.begin_image_token = "<|begin_of_image|>"  # nosec
        self.end_image_token = "<|end_of_image|>"  # nosec
        self.video_token = "<|video|>"  # nosec
        self.begin_video_token = "<|begin_of_video|>"  # nosec
        self.end_video_token = "<|end_of_video|>"  # nosec

        self.image_token_id = self.tokenizer.convert_tokens_to_ids(self.image_token)
        self.begin_image_token_id = self.tokenizer.convert_tokens_to_ids(
            self.begin_image_token
        )
        self.end_image_token_id = self.tokenizer.convert_tokens_to_ids(
            self.end_image_token
        )
        self.video_token_id = self.tokenizer.convert_tokens_to_ids(self.video_token)
        self.begin_video_token_id = self.tokenizer.convert_tokens_to_ids(
            self.begin_video_token
        )
        self.end_video_token_id = self.tokenizer.convert_tokens_to_ids(
            self.end_video_token
        )

    def process_labels(self, input_ids):
        labels = input_ids.clone()

        labels[labels == self.tokenizer.pad_token_id] = -100

        labels[labels == self.image_token_id] = -100
        labels[labels == self.begin_image_token_id] = -100
        labels[labels == self.end_image_token_id] = -100

        labels[labels == self.video_token_id] = -100
        labels[labels == self.begin_video_token_id] = -100
        labels[labels == self.end_video_token_id] = -100

        return labels


def get_processing_strategy(
    processor: ProcessorMixin,
    chat_template,
    chat_template_type,
    image_size: int | tuple[int, int] | None = None,
    image_resize_algorithm: Resampling | None = None,
):
    from axolotl.utils.mistral.mistral3_processor import Mistral3Processor

    processing_kwargs = {
        "processor": processor,
        "chat_template": chat_template,
        "image_size": image_size,
        "image_resize_algorithm": image_resize_algorithm,
    }

    if chat_template_type in [None, "tokenizer_default"]:
        tokenizer = getattr(processor, "tokenizer", processor)
        if hasattr(tokenizer, "chat_template"):
            processing_kwargs["chat_template"] = tokenizer.chat_template

    if chat_template_type == "qwen2_vl":
        return Qwen2VLProcessingStrategy(
            **processing_kwargs,
        )
    if chat_template_type in ["qwen3_5", "qwen3_5_moe"]:
        return Qwen3_5ProcessingStrategy(
            **processing_kwargs,
        )
    if chat_template_type == "gemma3":
        return Gemma3ProcessingStrategy(
            **processing_kwargs,
        )
    if chat_template_type == "gemma3n":
        return Gemma3nProcessingStrategy(
            **processing_kwargs,
        )

    if isinstance(processor, VoxtralProcessor):
        return VoxtralProcessingStrategy(
            **processing_kwargs,
        )

    if isinstance(processor, SmolVLMProcessor):
        return SmolVLM2ProcessingStrategy(
            **processing_kwargs,
        )

    if isinstance(processor, Mistral3Processor):
        return Mistral3ProcessingStrategy(
            **processing_kwargs,
        )
    try:
        from transformers.models.glm46v.processing_glm46v import Glm46VProcessor

        if isinstance(processor, Glm46VProcessor):
            return Glm4vProcessingStrategy(
                **processing_kwargs,
            )
    except ImportError:
        pass

    if isinstance(processor, InternVLProcessor):
        return InternVLProcessingStrategy(
            **processing_kwargs,
        )

    # llama3_2_vision, llama4, llava
    # mistral_v7_tekken, pixtral, lfm2vl
    return ProcessingStrategy(
        **processing_kwargs,
    )


================================================
FILE: src/axolotl/prompt_strategies/__init__.py
================================================
"""Module to load prompt strategies."""

import importlib
import inspect

from axolotl.prompt_strategies.user_defined import UserDefinedDatasetConfig
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def load(strategy, tokenizer, cfg, ds_cfg, processor=None):
    try:
        if strategy == "messages":
            from .messages import load as messages_load

            return messages_load(tokenizer, cfg, ds_cfg, processor=processor)
        load_fn = "load"
        package = "axolotl.prompt_strategies"
        if (
            strategy.split(".")[-1].startswith("load_")
            or strategy.split(".")[-1] == "load"
        ):
            load_fn = strategy.split(".")[-1]
            strategy = ".".join(strategy.split(".")[:-1])
        elif len(strategy.split(".")) > 1:
            try:
                importlib.import_module(
                    "." + strategy.split(".")[-1],
                    ".".join(strategy.split(".")[:-1]),
                )
                package = ".".join(strategy.split(".")[:-1])
                strategy = strategy.split(".")[-1]
            except ModuleNotFoundError:
                pass
        mod = importlib.import_module(f".{strategy}", package)
        func = getattr(mod, load_fn)
        load_kwargs = {}
        if strategy == "user_defined":
            load_kwargs["ds_cfg"] = UserDefinedDatasetConfig(**ds_cfg)
        else:
            sig = inspect.signature(func)
            if "ds_cfg" in sig.parameters:
                load_kwargs["ds_cfg"] = ds_cfg
            if "processor" in sig.parameters:
                load_kwargs["processor"] = processor

        return func(tokenizer, cfg, **load_kwargs)
    except ModuleNotFoundError:
        return None
    except Exception as exc:
        LOG.error(f"Failed to load prompt strategy `{strategy}`: {str(exc)}")
        raise exc


================================================
FILE: src/axolotl/prompt_strategies/alpaca_chat.py
================================================
"""Module for Alpaca prompt strategy classes"""

from typing import Any, Dict, Optional, Tuple

from axolotl.prompt_tokenizers import (
    AlpacaPromptTokenizingStrategy,
    InstructionPromptTokenizingStrategy,
)
from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter


def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
    prompt_style = PromptStyle.CHAT.value
    if ds_cfg and "conversation" in ds_cfg:
        prompt_style = ds_cfg["conversation"]

    return AlpacaPromptTokenizingStrategy(
        AlpacaPrompter(prompt_style),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


class AlpacaConcisePrompter(AlpacaPrompter):
    """
    Alpaca Prompter extending the system prompt to ask for concise chat-instruct answers
    """

    system_prompt = "Below is an instruction from a USER that describes a task, paired with an input that provides further context. The ASSISTANT writes a response that concisely and appropriately completes the request.\n\n"
    system_no_input_prompt = "Below is an instruction from a USER that describes a task. The ASSISTANT writes a response that appropriately and concisely completes the request.\n\n"


class AlpacaChatPrompter(AlpacaPrompter):
    """
    Alpaca Chat Prompter extending the system prompt to for chat-instruct answers
    """

    system_prompt = "Below is an instruction from a USER that describes a task, paired with an input that provides further context. The ASSISTANT writes a response that concisely and appropriately completes the request.\n\n"
    system_no_input_prompt = "Below is an instruction from a USER that describes a task. The ASSISTANT writes a response that appropriately and concisely completes the request.\n\n"

    def __init__(self):
        self.prompt_style = PromptStyle.CHAT.value
        self.match_prompt_style()


class NoSystemPrompter(AlpacaPrompter):
    """
    Null Prompter with no system prompts
    """

    system_prompt = ""
    system_no_input_prompt = ""
    turn_format = "{instruction} {input} "
    turn_no_input_format = "{instruction} "

    def __init__(self):
        pass


class AlpacaQAPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for AlpacaQA
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        return (
            prompt["question"],
            "",
            prompt["answer"],
        )


class CamelAIPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for CamelAI datasets
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        return (
            prompt["message_1"],
            "",
            prompt["message_2"],
        )


def load_concise(tokenizer, cfg):
    return AlpacaPromptTokenizingStrategy(
        AlpacaConcisePrompter(PromptStyle.CHAT.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


def load_qa(tokenizer, cfg):
    return AlpacaQAPromptTokenizingStrategy(
        AlpacaChatPrompter(),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


def load_camel_ai(tokenizer, cfg):
    return CamelAIPromptTokenizingStrategy(
        AlpacaChatPrompter(),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


def load_no_prompt(tokenizer, cfg):
    return AlpacaPromptTokenizingStrategy(
        UnpromptedPrompter(PromptStyle.CHAT.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


================================================
FILE: src/axolotl/prompt_strategies/alpaca_instruct.py
================================================
"""Module loading the AlpacaInstructPromptTokenizingStrategy class"""

from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
from axolotl.prompters import AlpacaPrompter, PromptStyle, UnpromptedPrompter


def load(tokenizer, cfg):
    return AlpacaPromptTokenizingStrategy(
        AlpacaPrompter(PromptStyle.INSTRUCT.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


def load_no_prompt(tokenizer, cfg):
    return AlpacaPromptTokenizingStrategy(
        UnpromptedPrompter(PromptStyle.INSTRUCT.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


================================================
FILE: src/axolotl/prompt_strategies/alpaca_w_system.py
================================================
"""
Prompt strategies loader for alpaca instruction datasets with system prompts
"""

from typing import Generator, Tuple, Union

from axolotl.prompt_tokenizers import PromptTokenizingStrategy
from axolotl.prompters import AlpacaPrompter, PromptStyle


class InstructionWSystemPromptTokenizingStrategy(PromptTokenizingStrategy):
    """
    Tokenizing strategy for instruction-based prompts.
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
        return (
            prompt["instruction"],
            prompt["input"] if "input" in prompt else "",
            prompt["output"],
            prompt["system"],
        )

    def tokenize_prompt(self, prompt):
        (
            instruction,
            input,
            response,
            system,
        ) = self.parse_instruction_fields(prompt)
        user_prompt = next(
            iter(
                self.prompter.build_prompt_w_system(
                    system,
                    instruction,
                    input,
                )
            )
        )
        tokenized_prompt = self._tokenize(user_prompt, add_eos_token=False)
        if not self.train_on_inputs:
            user_prompt_len = len(tokenized_prompt["input_ids"])
            # TODO this could be sped up using numpy array slicing
            tokenized_prompt["labels"] = [-100] * user_prompt_len
        tokenized_res_prompt = self._tokenize(
            response, strip_bos_token=True, add_eos_token=True
        )
        tokenized_prompt["input_ids"] += tokenized_res_prompt["input_ids"]
        tokenized_prompt["attention_mask"] += tokenized_res_prompt["attention_mask"]
        tokenized_prompt["labels"] += tokenized_res_prompt["input_ids"]

        return tokenized_prompt


class SystemDataPrompter(AlpacaPrompter):
    """
    Alpaca Style Prompter that uses system prompts from the dataset
    """

    system_format: str = "### System:\n{system}\n\n"

    def build_prompt_w_system(
        self,
        system: str,
        instruction: str,
        input: Union[None, str] = None,
        output: Union[None, str] = None,
    ) -> Generator[str, None, None]:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        formatted_sys_prompt = (
            self.system_format.format(system=system)
            if system and self.system_format
            else ""
        )
        if input:
            res = formatted_sys_prompt + self.turn_format.format(
                instruction=instruction, input=input
            )
        else:
            res = formatted_sys_prompt + self.turn_no_input_format.format(
                instruction=instruction
            )
        if output:
            res = f"{res}{output}"
        yield res


class OpenOrcaSystemDataPrompter(SystemDataPrompter):
    """
    Alpaca Style Prompter that uses system prompts from the dataset, with OpenOrca prompts
    """

    def match_prompt_style(self):
        if self.prompt_style == PromptStyle.INSTRUCT.value:
            self.turn_format = "### Human:\n{instruction}\n### Additional Context:\n{input}\n### Assistant:\n"
            self.turn_no_input_format = "### Human:\n{instruction}\n### Assistant:\n"
            self.system_format = "### System:\n{system}\n"
        if self.prompt_style == PromptStyle.CHAT.value:
            self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
            self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"
            self.system_format = "SYSTEM: {system}\n"
        if self.prompt_style == PromptStyle.CHATML.value:
            self.turn_format = "<|im_start|>user\n{instruction}\n{input}<|im_end|>\n<|im_start|>assistant\n"
            self.turn_no_input_format = (
                "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
            )
            self.system_format = "<|im_start|>system\n{system}<|im_end|>\n"


class OpenOrcaPromptTokenizingStrategy(InstructionWSystemPromptTokenizingStrategy):
    """
    Tokenizing strategy for OpenOrca datasets
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str]:
        return (
            prompt["question"],
            "",
            prompt["response"],
            prompt["system_prompt"],
        )


def load(tokenizer, cfg):
    return load_chat(tokenizer, cfg)


def load_instruct(tokenizer, cfg):
    return InstructionWSystemPromptTokenizingStrategy(
        SystemDataPrompter(PromptStyle.INSTRUCT.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


def load_chat(tokenizer, cfg):
    return InstructionWSystemPromptTokenizingStrategy(
        SystemDataPrompter(PromptStyle.CHAT.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


def load_open_orca(tokenizer, cfg):
    return OpenOrcaPromptTokenizingStrategy(
        OpenOrcaSystemDataPrompter(PromptStyle.INSTRUCT.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


def load_open_orca_chatml(tokenizer, cfg):
    return OpenOrcaPromptTokenizingStrategy(
        OpenOrcaSystemDataPrompter(PromptStyle.CHATML.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


================================================
FILE: src/axolotl/prompt_strategies/base.py
================================================
"""
module for base dataset transform strategies
"""

import importlib

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def load(strategy, cfg, module_base=None, **kwargs):
    try:
        if len(strategy.split(".")) == 1:
            strategy = strategy + ".default"
        load_fn = strategy.split(".")[-1]
        if len(strategy.split(".")) > 1:
            try:
                importlib.import_module(
                    strategy.split(".")[-2],
                    ".".join(strategy.split(".")[:-2]),
                )
                module_base = ".".join(strategy.split(".")[:-2])
                strategy = strategy.split(".")[-2]
            except ModuleNotFoundError:
                strategy = "." + ".".join(strategy.split(".")[:-1])
        else:
            strategy = "." + ".".join(strategy.split(".")[:-1])
        mod = importlib.import_module(strategy, module_base)
        func = getattr(mod, load_fn)
        return func(cfg, **kwargs)
    except Exception:
        LOG.warning(f"unable to load strategy {strategy}")
        return None


================================================
FILE: src/axolotl/prompt_strategies/bradley_terry/README.md
================================================
### example yaml

```yaml
chat_template: gemma
datasets:
  - path: argilla/distilabel-intel-orca-dpo-pairs
    type: bradley_terry.chat_template
val_set_size: 0.0
output_dir: ./outputs/out
```


================================================
FILE: src/axolotl/prompt_strategies/bradley_terry/__init__.py
================================================
"""Module to load prompt strategies."""

import importlib
import inspect

from axolotl.prompt_strategies.user_defined import UserDefinedDatasetConfig
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def load(strategy, tokenizer, cfg, ds_cfg):
    try:
        load_fn = "load"
        if strategy.split(".")[-1].startswith("load_"):
            load_fn = strategy.split(".")[-1]
            strategy = ".".join(strategy.split(".")[:-1])
        mod = importlib.import_module(
            f".{strategy}", "axolotl.prompt_strategies.bradley_terry"
        )
        func = getattr(mod, load_fn)
        load_kwargs = {}
        if strategy == "user_defined":
            load_kwargs["ds_cfg"] = UserDefinedDatasetConfig(**ds_cfg)
        else:
            sig = inspect.signature(func)
            if "ds_cfg" in sig.parameters:
                load_kwargs["ds_cfg"] = ds_cfg
        return func(tokenizer, cfg, **load_kwargs)
    except ModuleNotFoundError:
        return None
    except Exception as exc:
        LOG.error(f"Failed to load prompt strategy `{strategy}`: {str(exc)}")
        return None


================================================
FILE: src/axolotl/prompt_strategies/bradley_terry/chat_template.py
================================================
"""
Bradley-Terry model with chat template prompt strategy.
"""

from typing import Any, Dict, Optional

from axolotl.prompt_strategies.chat_template import (
    ChatTemplatePrompter,
    ChatTemplateStrategy,
)
from axolotl.utils.chat_templates import get_chat_template_from_config
from axolotl.utils.logging import get_logger

# Configure the logger
LOG = get_logger(__name__)
LOG.setLevel("INFO")


class BTChatTemplateStrategy(ChatTemplateStrategy):
    """
    Bradley-Terry reward model pairwise chat template prompt strategy.
    """

    @property
    def supports_batched(self) -> bool:
        return False

    def _tokenize_single_prompt(self, prompt):
        """

        :param prompt: the actual row of data from the underlying dataset
        :return:
        """

        max_length = self.prompter.max_length

        prompt["messages"] = []
        if prompt["system"]:
            prompt["messages"].append({"role": "system", "content": prompt["system"]})
        prompt["messages"].append({"role": "user", "content": prompt["input"]})
        prompt["messages"].append({"role": "assistant", "content": prompt["chosen"]})
        chosen_tokenized = super()._tokenize_single_prompt(prompt)

        if len(chosen_tokenized["input_ids"]) > max_length:
            LOG.warning(
                f"To-be-trimmed chosen sequence exceeds max sequence length: {len(chosen_tokenized['input_ids'])}"
            )

            chosen_tokenized["input_ids"] = chosen_tokenized["input_ids"][:max_length]
            chosen_tokenized["attention_mask"] = chosen_tokenized["attention_mask"][
                :max_length
            ]

        prompt["messages"] = []
        if prompt["system"]:
            prompt["messages"].append({"role": "system", "content": prompt["system"]})
        prompt["messages"].append({"role": "user", "content": prompt["input"]})
        prompt["messages"].append({"role": "assistant", "content": prompt["rejected"]})
        rejected_tokenized = super()._tokenize_single_prompt(prompt)

        if len(rejected_tokenized["input_ids"]) > max_length:
            LOG.warning(
                f"To-be-trimmed rejected sequence exceeds max sequence length: {len(rejected_tokenized['input_ids'])}"
            )

            rejected_tokenized["input_ids"] = rejected_tokenized["input_ids"][
                :max_length
            ]
            rejected_tokenized["attention_mask"] = rejected_tokenized["attention_mask"][
                :max_length
            ]

        return {
            "chosen_input_ids": chosen_tokenized["input_ids"],
            "attention_mask_chosen": chosen_tokenized["attention_mask"],
            "labels_chosen": 1.0,
            "rejected_input_ids": rejected_tokenized["input_ids"],
            "attention_mask_rejected": rejected_tokenized["attention_mask"],
            "labels_rejected": 0.0,
        }


def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
    ds_cfg = ds_cfg or {}
    chat_template_string = get_chat_template_from_config(
        cfg=cfg, ds_cfg=ds_cfg, tokenizer=tokenizer
    )

    prompter_params = {
        "tokenizer": tokenizer,
        "chat_template": chat_template_string,
        "message_property_mappings": ds_cfg.get(
            "message_property_mappings",
            {
                "role": "role",
                "content": "content",
            },
        ),
        "message_field_training": ds_cfg.get("message_field_training", None),
        "message_field_training_detail": ds_cfg.get(
            "message_field_training_detail", None
        ),
        "roles": ds_cfg.get("roles"),
        "drop_system_message": ds_cfg.get("drop_system_message", False),
        # we need to add one for detecting sequences with exceeding the `sequence_len` limit.
        "max_length": (
            cfg.sequence_len + 1 if not cfg.reward_model else cfg.sequence_len
        ),
    }

    strategy_params = {
        "train_on_inputs": cfg.train_on_inputs,
        "sequence_len": cfg.sequence_len,
        "roles_to_train": ds_cfg.get("roles_to_train", []),
        "train_on_eos": ds_cfg.get("train_on_eos", None),
    }

    strategy = BTChatTemplateStrategy(
        ChatTemplatePrompter(**prompter_params), tokenizer=tokenizer, **strategy_params
    )

    return strategy


================================================
FILE: src/axolotl/prompt_strategies/bradley_terry/llama3.py
================================================
"""
chatml transforms for datasets with system, input, chosen, rejected to match llama3 chat template
"""


def icr(
    cfg,
    **kwargs,
):
    """
    chatml transforms for datasets with system, input, chosen, rejected
    ex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs
    """

    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            prompt = (
                f"<|start_header_id|>system<|end_header_id|>\n\n{sample['system']}<|eot_id|>"
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['input']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        else:
            prompt = f"<|start_header_id|>user<|end_header_id|>\n\n{sample['input']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
        sample["chosen"] = prompt + f"{sample['chosen']}<|eot_id|>"
        sample["rejected"] = prompt + f"{sample['rejected']}<|eot_id|>"
        return sample

    return transform_fn


================================================
FILE: src/axolotl/prompt_strategies/chat_template.py
================================================
"""
HF Chat Templates prompt strategy
"""

import json
from collections import defaultdict
from typing import TYPE_CHECKING, Any, Dict, List, Set, Union

from pydantic import BaseModel
from transformers import ProcessorMixin

from axolotl.prompt_strategies.jinja_template_analyzer import JinjaTemplateAnalyzer
from axolotl.prompt_tokenizers import PromptTokenizingStrategy
from axolotl.prompters import IGNORE_TOKEN_ID, Prompter
from axolotl.utils.chat_templates import get_chat_template_from_config
from axolotl.utils.dict import remove_none_values
from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.datasets import DatasetConfig

if TYPE_CHECKING:
    from axolotl.utils.mistral import HFMistralTokenizer

# Configure the logger
LOG = get_logger(__name__)
LOG.setLevel("INFO")


class ChatTemplatePrompter(Prompter):
    """Prompter for HF chat templates"""

    def __init__(
        self,
        tokenizer,
        chat_template: str,
        processor=None,
        max_length=2048,
        message_property_mappings: dict[str, str] | None = None,
        message_field_training: str | None = None,
        message_field_training_detail: str | None = None,
        field_messages: str = "messages",
        field_system: str = "system",
        field_tools: str = "tools",
        field_thinking: str = "reasoning_content",
        roles: dict[str, list[str]] | None = None,
        template_thinking_key: str | None = "reasoning_content",
        chat_template_kwargs: dict[str, Any] | None = None,
        drop_system_message: bool = False,
    ):
        # check if message_property_mappings is None or empty dict
        if message_property_mappings is None or (not message_property_mappings):
            message_property_mappings = {
                "role": "role",
                "content": "content",
            }
            if template_thinking_key and field_thinking:
                message_property_mappings[template_thinking_key] = field_thinking

        if roles:
            self.roles = {s: t for t, sources in roles.items() for s in sources}
        else:
            self.roles = {
                "human": "user",
                "user": "user",
                "assistant": "assistant",
                "gpt": "assistant",
                "system": "system",
                "tool": "tool",
            }

        self._chat_template_msg_variables = self.get_chat_template_msg_variables(
            chat_template, field_messages
        )
        self.message_property_mappings = message_property_mappings
        self.message_field_training = message_field_training
        self.message_field_training_detail = message_field_training_detail
        self.field_messages = field_messages
        self.field_system = field_system
        self.field_tools = field_tools
        self.field_thinking = field_thinking
        self.tokenizer = tokenizer
        self.processor: ProcessorMixin | None = processor
        self.chat_template = chat_template
        self.chat_template_kwargs = chat_template_kwargs or {}
        self.template_thinking_key: str = template_thinking_key or "reasoning_content"
        self.max_length = max_length
        self.drop_system_message = drop_system_message

    @property
    def chat_template_msg_variables(self) -> Set[str]:
        return self._chat_template_msg_variables

    def build_prompt(
        self,
        conversation: list[dict],
        add_generation_prompt=False,
        images=None,
        tools=None,
        real_last_index=None,
    ):
        """
        Build a prompt from a conversation.

        Args:
            conversation: A list of messages.
            add_generation_prompt: Whether to add a generation prompt.
            images: A list of images. (optional)
            tools: A list of tools. (optional)
        """
        chat_template_kwargs = {
            "chat_template": self.chat_template,
            "add_generation_prompt": add_generation_prompt,
            **self.chat_template_kwargs,
        }

        if tools:
            chat_template_kwargs["tools"] = tools

        if real_last_index:
            chat_template_kwargs["real_last_index"] = real_last_index

        if self.processor:
            if not callable(self.processor):
                raise TypeError("Processor must be callable")

            text = self.processor.apply_chat_template(
                conversation,
                tokenize=False,
                **chat_template_kwargs,
            )
            batch = self.processor(
                text=text,
                images=images,
                return_tensors="pt",
            )
            if hasattr(batch, "to_dict"):
                batch = batch.to_dict()
            else:
                batch = dict(batch)

            # workaround since processor works in batches instead of single examples
            out = {}
            for k, val in batch.items():
                if hasattr(val, "tolist"):
                    out[k] = (
                        val.tolist() if k == "pixel_values" else val.squeeze(0).tolist()
                    )
                else:
                    out[k] = val
            return out

        return self.tokenizer.apply_chat_template(
            conversation,
            tokenize=True,
            return_dict=False,
            **chat_template_kwargs,
        )

    def get_offsets_for_train_detail(
        self, text: str, train_details: List[Dict], mask_untrainable: bool = True
    ) -> List[int]:
        tokenized_output = self.tokenizer(
            text, return_offsets_mapping=True, add_special_tokens=False
        )
        tokens = tokenized_output.tokens()
        token_offsets = tokenized_output["offset_mapping"]

        LOG.debug(f"Tokenizing text: {text}")
        LOG.debug(f"Tokens: {tokens}")
        # Adjust the end offsets. For some reason by default they are set to the same value as the start offsets.
        for i in range(len(token_offsets) - 1):
            token_offsets[i] = (token_offsets[i][0], token_offsets[i + 1][0] - 1)
        # Ensure the last token's end offset is set correctly
        token_offsets[-1] = (token_offsets[-1][0], len(text) - 1)
        LOG.debug(f"Token offsets: {token_offsets}")

        # Initialize all offsets as IGNORE_TOKEN_ID (not trained)
        result = [IGNORE_TOKEN_ID] * len(token_offsets)

        # Adjust train_details to align with token boundaries
        adjusted_train_details = self.adjust_train_details(train_details, token_offsets)

        for idx, (start, end) in enumerate(token_offsets):
            for detail in adjusted_train_details:
                # Check if the token is completely within the detail's range
                if start >= detail["begin_offset"] and end <= detail["end_offset"]:
                    if detail["train"] or not mask_untrainable:
                        result[idx] = start
                        LOG.debug(f"Token {idx} ({tokens[idx]}) marked for training")
                    else:
                        LOG.debug(
                            f"Token {idx} ({tokens[idx]}) marked as non-trainable"
                        )
                elif start < detail["end_offset"] and end > detail["begin_offset"]:
                    # Token partially overlaps with detail, always mark as non-trainable
                    LOG.debug(
                        f"Token {idx} ({tokens[idx]}) partially overlaps detail, marked as non-trainable"
                    )

        LOG.debug(f"Final result: {result}")
        return result

    def adjust_train_details(
        self, train_details: List[Dict], token_offsets: List[tuple]
    ) -> List[Dict]:
        adjusted_details = []
        for detail in train_details:
            begin_offset = detail["begin_offset"]
            end_offset = detail["end_offset"]

            # Find the first token that starts after or at the begin_offset
            begin_token = next(
                (
                    i
                    for i, (t_start, t_end) in enumerate(token_offsets)
                    if t_start >= begin_offset
                ),
                len(token_offsets),
            )
            if begin_token > 0 and token_offsets[begin_token - 1][1] > begin_offset:
                begin_token -= 1

            # Find the last token that ends before or at the end_offset
            end_token = next(
                (
                    i
                    for i in range(len(token_offsets) - 1, -1, -1)
                    if token_offsets[i][1] <= end_offset
                ),
                -1,
            )
            if (
                end_token < len(token_offsets) - 1
                and token_offsets[end_token + 1][0] < end_offset
            ):
                end_token += 1

            if begin_token <= end_token:
                adjusted_begin = token_offsets[begin_token][0]
                adjusted_end = token_offsets[end_token][1]

                if adjusted_begin != begin_offset or adjusted_end != end_offset:
                    LOG.warning(
                        f"Adjusting detail offsets: ({begin_offset}, {end_offset}) -> ({adjusted_begin}, {adjusted_end})"
                    )

                adjusted_details.append(
                    {
                        "begin_offset": adjusted_begin,
                        "end_offset": adjusted_end,
                        "train": detail["train"],
                    }
                )
            else:
                LOG.warning(
                    f"Could not adjust detail offsets: ({begin_offset}, {end_offset}). Skipping this detail."
                )

        return adjusted_details

    def get_chat_template_msg_variables(
        self, chat_template: str, field_messages: str
    ) -> Set[str]:
        template_analyzer = JinjaTemplateAnalyzer(chat_template)
        return template_analyzer.get_message_vars(field_messages)


class ChatTemplateStrategy(PromptTokenizingStrategy):
    """
    Tokenizing strategy for instruction-based prompts.
    """

    def __init__(
        self,
        prompter: "ChatTemplatePrompter",
        tokenizer,
        train_on_inputs: bool,
        sequence_len: int,
        roles_to_train: list[str] | None = None,
        train_on_eos: str | None = None,
        train_on_eot: str | None = None,
        eot_tokens: list[str] | None = None,
        split_thinking: bool | None = False,
    ):
        super().__init__(prompter, tokenizer, train_on_inputs, sequence_len)
        self.prompter: ChatTemplatePrompter = prompter

        self.roles_to_train = []
        if roles_to_train:
            # map roles if exist in prompter.roles else use the role as is
            self.roles_to_train = [
                prompter.roles.get(role, role) for role in roles_to_train
            ]

        self.train_on_eos = train_on_eos
        # Backward compatibility, load from train_on_eos
        self.train_on_eot = train_on_eot if train_on_eot is not None else train_on_eos

        # Default to eos_token if eot_tokens not provided
        self.eot_tokens = []
        if eot_tokens is not None:
            self.eot_tokens = eot_tokens
        elif (
            hasattr(self.tokenizer, "eos_token")
            and self.tokenizer.eos_token is not None
        ):
            self.eot_tokens = [self.tokenizer.eos_token]

        self.split_thinking = split_thinking

        self.images = "images"

        LOG.debug(
            f"The chat template uses the following properites on the message: {self.prompter.chat_template_msg_variables}"
        )

        self._validate_eot_and_eos_tokens()

    def _validate_eot_and_eos_tokens(self):
        """
        - Validates that EOT tokens (or eos_token) are in the chat_template
        - Checks if EOT tokens are encoded as multiple tokens in the tokenizer.
        - Checks for potential conflicts between train_on_eos and train_on_eot.
        """
        if self.prompter.chat_template is None:
            # Usually this should not happen
            LOG.warning(
                "No chat template provided, skipping EOT and EOS token validation"
            )
            return

        # If the EOT token is the same as the EOS token, we need to check differently
        if len(self.eot_tokens) == 1 and self.eot_tokens[0] == self.tokenizer.eos_token:
            # Check if the eos_token is in the chat_template or as a variable `eos_token`
            # Note: we check for `eos_token` in the string, but it could possibly not be a variable
            if (
                self.tokenizer.eos_token not in self.prompter.chat_template
                and "eos_token" not in self.prompter.chat_template
            ):
                LOG.warning(
                    f"EOS token '{self.tokenizer.eos_token}' not found in chat_template. Please check if your template/EOS token is correct."
                )
            return

        # Create a new list to store tokens that should be kept
        valid_eot_tokens = []
        for token in self.eot_tokens:
            # Check if EOT token is in the chat_template
            if token not in self.prompter.chat_template:
                LOG.warning(f"EOT token '{token}' not found in chat_template.")
                # Don't add to the valid tokens list
                continue

            valid_eot_tokens.append(token)

        # Replace the original list with the filtered one
        self.eot_tokens = valid_eot_tokens

        for token in self.eot_tokens:
            # If token in template, check if EOT token is in tokenizer and not encoded as multiple tokens
            token_ids = self.tokenizer.encode(token, add_special_tokens=False)
            if not token_ids:
                raise ValueError(
                    "EOT token encoding failed. Please check if the token is valid and can be encoded."
                )
            if token_ids and len(token_ids) > 1:
                raise ValueError(
                    f"EOT token '{token}' is encoded as multiple tokens: {token_ids}. Please add it under `tokens: ` in the config "
                    "or (recommended) override unused added_tokens via `added_tokens_overrides: `."
                )

        # If eos_token is in eot_tokens and conflict between train_on_eos and train_on_eot, raise an error
        if (
            self.tokenizer.eos_token in self.eot_tokens
            and self.train_on_eos != self.train_on_eot
        ):
            raise ValueError(
                "Conflict between train_on_eos and train_on_eot. eos_token is in eot_tokens and train_on_eos != train_on_eot"
                f"train_on_eos: {self.train_on_eos}, train_on_eot: {self.train_on_eot}"
                f"eot_tokens: {self.eot_tokens}"
                f"eos_token: {self.tokenizer.eos_token}"
            )

    @property
    def supports_batched(self) -> bool:
        # Let calling code know we can handle lists of examples
        return True

    def is_prompt_batched(self, prompt: dict[str, Any]) -> bool:
        try:
            return all(isinstance(v, list) for v in prompt.values()) and all(
                isinstance(v, list) for v in prompt[self.prompter.field_messages]
            )
        except KeyError:
            return False

    def tokenize_prompt(self, prompt: dict[str, Any]):
        """
        Public method that can handle either a single prompt or a batch of prompts.
        """

        prompt = remove_none_values(prompt)

        if not self.is_prompt_batched(prompt) or not self.supports_batched:
            return self._tokenize_single_prompt(prompt)

        res = defaultdict(lambda: [])
        feature_names = list(prompt.keys())

        # Process each prompt individually
        for row in zip(*prompt.values(), strict=False):
            tokenized_prompt = self._tokenize_single_prompt(
                dict(zip(feature_names, row, strict=False))
            )
            for key, val in tokenized_prompt.items():
                res[key].append(val)

        # If there are no examples left, return an empty dictionary
        if not res:
            return {}

        return dict(res)

    def _tokenize_single_prompt(self, prompt: dict) -> Dict[str, List[int]]:
        # Old simple legacy behavior that works reliably.
        if (
            not self.roles_to_train
            and not self.train_on_eos
            and not self.train_on_eot
            and not self.prompter.message_field_training  # type: ignore
            and not self.prompter.message_field_training_detail  # type: ignore
        ):
            turns = self.get_conversation_thread(prompt)
            images = self._get_images(prompt)
            prompt_ids = self.prompter.build_prompt(  # type: ignore
                turns[:-1],
                add_generation_prompt=True,
                images=images,
            )
            tokenized_res = self.prompter.build_prompt(turns, images=images)  # type: ignore
            tokenized_prompt = {}
            if isinstance(tokenized_res, list):
                input_ids = prompt_ids + tokenized_res[len(prompt_ids) :]
                tokenized_prompt["input_ids"] = input_ids
                tokenized_prompt["attention_mask"] = [1] * len(input_ids)
            else:
                input_ids = tokenized_res["input_ids"]
                tokenized_prompt = dict(tokenized_res)

            if not self.train_on_inputs:
                if isinstance(prompt_ids, dict):
                    user_prompt_len = len(prompt_ids["input_ids"])
                else:
                    user_prompt_len = len(prompt_ids)
                labels = [-100] * user_prompt_len + input_ids[user_prompt_len:]
            else:
                labels = input_ids

            tokenized_prompt["labels"] = labels

            return tokenized_prompt

        turns = self.get_conversation_thread(prompt)
        tools = self._get_tools(prompt)
        input_ids = self.prompter.build_prompt(turns, tools=tools)  # type: ignore
        labels = [IGNORE_TOKEN_ID] * len(input_ids)

        last_eos_idx = -1
        last_eot_idx = -1
        for index, turn in enumerate(turns):
            role = turn.get("role")
            content = turn.get("content")
            train_turn = turn.get("training")
            train_detail = turn.get("training_detail")

            LOG.debug(
                f"Processing turn {index}: role={role}, content={content}, train_turn={train_turn}, train_detail={train_detail}"
            )

            should_train = None
            if train_turn is not None:
                should_train = train_turn
            elif train_detail is not None:
                should_train = bool(train_detail)
            else:
                should_train = self.train_on_inputs or role in self.roles_to_train

            LOG.debug(f"Should train: {should_train}")

            # turn not trainable, skip having to find the turn indices
            # unless last turn and train_on_eos/train_on_eot is all
            if not should_train and (
                self.train_on_eos != "all" and self.train_on_eot != "all"
            ):
                if index == len(turns) - 1:
                    LOG.warning(
                        "Last turn is not trainable, skipping having to find the turn indices. "
                        "This may cause incorrect last EOT/EOS token to be unmasked."
                        "This is likely a dataset design issue. Please ensure last turn is trainable."
                    )

                continue

            turn_start_idx, turn_end_idx = self.find_turn(
                turns=turns, turn_idx=index, tools=tools
            )

            LOG.debug(f"Turn indices: start={turn_start_idx}, end={turn_end_idx}")

            if should_train and turn_start_idx != -1 and turn_end_idx != -1:
                if train_detail:
                    # Block multi-content for now
                    if not isinstance(content, str):
                        raise ValueError(
                            "`train_detail` is not supported when `content` is not a string."
                        )

                    token_offsets = self.prompter.get_offsets_for_train_detail(  # type: ignore
                        content, train_detail
                    )
                    LOG.debug(f"Token offsets: {token_offsets}")
                    for i, offset in enumerate(token_offsets):
                        if offset != IGNORE_TOKEN_ID and turn_start_idx + i < len(
                            input_ids
                        ):
                            labels[turn_start_idx + i] = input_ids[turn_start_idx + i]
                            LOG.debug(
                                f"Label set at index {turn_start_idx + i}: {input_ids[turn_start_idx + i]}"
                            )
                else:
                    labels[turn_start_idx:turn_end_idx] = input_ids[
                        turn_start_idx:turn_end_idx
                    ]
                    LOG.debug(
                        f"Set labels for training from {turn_start_idx} to {turn_end_idx}"
                    )

                LOG.debug(f"Labels after processing turn {index}: {labels}")

            # Handle special tokens (EOT and EOS)
            for token_type, find_func, train_option in [
                ("EOT", self.find_first_eot_token, self.train_on_eot),
                ("EOS", self.find_first_eos_token, self.train_on_eos),
            ]:
                token_idx = find_func(input_ids, start_idx=turn_end_idx)

                if (
                    token_idx != -1 and abs(token_idx - turn_end_idx) <= 3
                ):  # Allow for some template padding
                    # Update the last token index
                    if token_type == "EOT":  # nosec B105
                        last_eot_idx = token_idx
                    else:
                        last_eos_idx = token_idx

                    # Set labels if needed for this turn
                    if train_option == "all" or (
                        train_option == "turn" and should_train
                    ):
                        labels[token_idx] = input_ids[token_idx]
                        LOG.debug(
                            f"{token_type} token set for training at index {token_idx}"
                        )
                else:
                    LOG.debug(
                        f"{token_type} token missing after turn {turn}. {token_type.lower()}_idx: {token_idx}, turn_end_idx: {turn_end_idx}"
                    )

        # Handle 'last' option for special tokens
        for token_type, last_idx, train_option in [
            ("EOT", last_eot_idx, self.train_on_eot),
            ("EOS", last_eos_idx, self.train_on_eos),
        ]:
            if train_option == "last" and last_idx != -1:
                labels[last_idx] = input_ids[last_idx]
                LOG.debug(
                    f"Last {token_type} token set for training at index {last_idx}"
                )

        LOG.debug(f"Final labels: {labels}")

        return {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": [1] * len(input_ids),
        }

    def find_first_eos_token(self, input_ids, start_idx):
        eos_token_id = self.tokenizer.eos_token_id
        for i in range(start_idx, len(input_ids)):
            if input_ids[i] == eos_token_id:
                return i
        return -1

    def find_first_eot_token(self, input_ids, start_idx):
        """Find the first EOT token in the input_ids starting from start_idx."""
        # Get token IDs for all EOT tokens
        eot_token_ids = []
        for token in self.eot_tokens:
            token_ids = self.tokenizer.encode(token, add_special_tokens=False)
            if len(token_ids) != 1:
                raise ValueError(
                    f"EOT token '{token}' is encoded as multiple tokens: {token_ids}. Please add it under `tokens: ` in the config."
                )

            eot_token_ids.append(token_ids[0])  # Use the last token ID if multiple

        # Search for any of the EOT token IDs
        for i in range(start_idx, len(input_ids)):
            if input_ids[i] in eot_token_ids:
                return i
        return -1

    def find_turn(
        self, turns: list[dict], turn_idx: int, tools: list[dict] | None = None
    ):
        """
        Locate the starting and ending indices of the specified turn in a conversation.
        """

        if turn_idx >= len(turns):
            raise ValueError(f"Turn index {turn_idx} out of range")

        # mistral/gemma3 does not output message if it contains only system message
        if (
            turn_idx == 0
            and turns[0].get("role") == "system"
            and ("mistral" in self.tokenizer.name_or_path.lower())
        ):
            return -1, -1

        empty_turn = {
            "role": turns[turn_idx].get("role"),
            "content": "[[dummy_message]]",
        }

        # Create conversation versions
        turns_with_empty = turns[:turn_idx] + [empty_turn]
        turns_with_content = turns[: turn_idx + 1]

        real_last_index = len(turns) - 1

        # Generate the conversation up to the turn, with final turn replaced with dummy content
        dummy_ids = self.prompter.build_prompt(
            turns_with_empty, tools=tools, real_last_index=real_last_index
        )  # type: ignore

        # Generate the conversation up to the turn, with final turn included
        full_ids = self.prompter.build_prompt(
            turns_with_content, tools=tools, real_last_index=real_last_index
        )  # type: ignore

        if not full_ids or not dummy_ids:
            LOG.warning(f"Empty template generated for turn {turn_idx}")
            return -1, -1

        # Find first difference (start of content)
        start_idx = None
        min_len = min(len(dummy_ids), len(full_ids))
        for i in range(min_len):
            if dummy_ids[i] != full_ids[i]:
                start_idx = i
                break

        if start_idx is None:
            LOG.warning(f"Could not find content start boundary for turn {turn_idx}")
            return -1, -1

        # Find last difference (end of content)
        end_idx = None
        for i in range(min_len):
            dummy_pos = len(dummy_ids) - 1 - i
            full_pos = len(full_ids) - 1 - i
            if dummy_ids[dummy_pos] != full_ids[full_pos]:
                end_idx = full_pos + 1  # Add one to include the last token when slice
                break

        if end_idx is None:
            LOG.warning(f"Could not find content end boundary for turn {turn_idx}")
            return -1, -1

        if end_idx < start_idx:
            LOG.warning(
                f"Content end boundary is before start boundary for turn {turn_idx}"
            )
            return -1, -1

        if end_idx == start_idx:
            LOG.warning(
                f"Content end boundary is the same as start boundary for turn {turn_idx}. This is likely an empty turn."
            )
            return -1, -1

        LOG.debug(f"Content boundaries: {start_idx}, {end_idx}")
        LOG.debug(
            f"Content tokens: {self.tokenizer.convert_ids_to_tokens(full_ids[start_idx:end_idx])}"
        )

        return start_idx, end_idx

    def get_conversation_thread(self, prompt):
        turns = []

        messages = self._get_messages(prompt)

        possible_sys_turn = self.transform_message(messages[0])

        if (
            possible_sys_turn["role"] != "system"
            and self.prompter.field_system in prompt
        ):
            turn = {"role": "system", "content": prompt[self.prompter.field_system]}
            turns.append(turn)

        for message in messages:
            transformed_message = self.transform_message(message)

            turn = transformed_message

            training = message.get(self.prompter.message_field_training)
            training_detail = message.get(self.prompter.message_field_training_detail)
            if training is not None:
                turn["training"] = training
            if training_detail is not None:
                turn["training_detail"] = training_detail

            turns.append(turn)

        if self.prompter.drop_system_message and turns[0]["role"] == "system":
            turns = turns[1:]

        return turns

    def transform_message(self, message: dict) -> dict:
        # Build the initial transformed message from the mappings
        transformed_message = {}
        for key, value in self.prompter.message_property_mappings.items():
            if message.get(value) is not None:
                transformed_message[key] = message[value]
            else:
                LOG.debug(
                    f"Could not find value for property {value} in message: {message}"
                )

        # Map the role if necessary
        if "role" in transformed_message:
            transformed_message["role"] = self.prompter.roles.get(
                transformed_message["role"], transformed_message["role"]
            )

        # TODO handle reasoning_content with split_thinking
        # if the role is assistant that we want to use reasoning_content
        if self.split_thinking and transformed_message["role"] == "assistant":
            content = transformed_message["content"]
            thinking_pairs = [
                ("<think>", "</think>"),
                ("<reasoning>", "</reasoning>"),
                ("<|begin_of_thought|>", "<|end_of_thought|>"),
            ]
            content_pairs = [("<|begin_of_solution|>", "<|end_of_solution|>")]
            for tpair in thinking_pairs:
                # check if the thinking pair is in the content
                if tpair[0] in content and tpair[1] in content:
                    # find the start and end index of the thinking pair
                    t_start_idx = content.find(tpair[0])
                    t_end_idx = content.find(tpair[1])

                    # get the thinking content
                    thinking_content = content[t_start_idx + len(tpair[0]) : t_end_idx]
                    transformed_message[self.prompter.template_thinking_key] = (
                        thinking_content.strip()
                    )

                    # take remainder of the content
                    # strip whitespace from beginning of the remainder (thinking tokens)
                    remainder = content[t_end_idx + len(tpair[1]) :].lstrip()

                    # check if the content pair is in the remainder
                    cpair_found = False
                    for cpair in content_pairs:
                        if cpair[0] in remainder and cpair[1] in remainder:
                            # find the start and end index of the content pair
                            c_start_idx = remainder.find(cpair[0])
                            c_end_idx = remainder.find(cpair[1])

                            # get the content content
                            content_content = remainder[
                                c_start_idx + len(cpair[0]) : c_end_idx
                            ]
                            transformed_message["content"] = content_content.strip()
                            cpair_found = True
                            break

                    # else, the content is the remainder
                    if not cpair_found:
                        transformed_message["content"] = remainder
                    break

        # Determine which keys in the original message were not mapped
        mapped_values = set(self.prompter.message_property_mappings.values())
        remaining_keys = set(message) - mapped_values

        # Keep only the properties defined in the chat template
        # and not already mapped
        for key in self.prompter.chat_template_msg_variables:
            if key in remaining_keys:
                val = message.get(key)
                if val is not None:
                    transformed_message[key] = val

        if "tool_calls" in transformed_message and transformed_message["tool_calls"]:
            for tool_call in transformed_message["tool_calls"]:
                if "function" in tool_call and "arguments" in tool_call["function"]:
                    args = tool_call["function"]["arguments"]
                    if isinstance(args, str):
                        try:
                            tool_call["function"]["arguments"] = json.loads(args)
                        except json.JSONDecodeError as e:
                            LOG.error(
                                f"Error parsing tool_calls arguments as JSON. "
                                f"Function: {tool_call.get('function', {}).get('name', 'unknown')}, "
                                f"Arguments string: {args!r}, "
                                f"Error: {e}"
                            )
                            raise

        return transformed_message

    def _get_images(self, prompt):
        return prompt.get(self.images, None)

    def _get_tools(self, prompt) -> list[dict] | None:
        """Get tools from prompt if available."""
        tools = prompt.get(self.prompter.field_tools, None)
        if tools is None:
            return None

        if isinstance(tools, list):
            # Process each tool to handle JSON string parameters
            for tool in tools:
                if isinstance(tool, dict) and "function" in tool:
                    function = tool["function"]
                    if "parameters" in function:
                        params = function["parameters"]
                        if isinstance(params, str):
                            try:
                                function["parameters"] = json.loads(params)
                            except json.JSONDecodeError as e:
                                LOG.error(
                                    f"Error parsing tool parameters as JSON. "
                                    f"Function: {function.get('name', 'unknown')}, "
                                    f"Parameters string: {params!r}, "
                                    f"Error: {e}"
                                )
                                raise
            return tools

        raise ValueError(
            "Unknown tools format. Please convert it into a list[dict].\n"
            f"Current format: {type(tools)}"
        )

    def _get_messages(self, prompt):
        messages = prompt.get(self.prompter.field_messages, None)
        if messages is None:
            raise ValueError("Messages is null. Please check `field_messages`.")

        if isinstance(messages, list):
            return messages

        raise ValueError(
            "Unknown messages format. Please convert it into a list[dict].\n"
            f"Current format: {type(messages)}"
        )


class MistralStrategy(ChatTemplateStrategy):
    """
    Mistral strategy for chat template.
    """

    def __init__(
        self,
        prompter: "ChatTemplatePrompter",
        tokenizer: "HFMistralTokenizer",
        train_on_inputs: bool,
        sequence_len: int,
        roles_to_train: list[str] | None = None,
        train_on_eos: str | None = None,
        train_on_eot: str | None = None,
        eot_tokens: list[str] | None = None,
        split_thinking: bool | None = False,
    ):
        # Call the parent's parent __init__ (PromptTokenizingStrategy) to skip ChatTemplateStrategy's validation

        PromptTokenizingStrategy.__init__(
            self, prompter, tokenizer, train_on_inputs, sequence_len
        )
        self.prompter: ChatTemplatePrompter = prompter

        self.roles_to_train = []
        if roles_to_train:
            # map roles if exist in prompter.roles else use the role as is
            self.roles_to_train = [
                prompter.roles.get(role, role) for role in roles_to_train
            ]

        self.train_on_eos = train_on_eos
        # Backward compatibility, load from train_on_eos
        self.train_on_eot = train_on_eot if train_on_eot is not None else train_on_eos

        # Default to eos_token if eot_tokens not provided
        self.eot_tokens = []
        if eot_tokens is not None:
            self.eot_tokens = eot_tokens
        else:
            # set eot_tokens to the eos_token
            self.eot_tokens = [self.tokenizer.eos_token]

        self.split_thinking = split_thinking

        self.images = "images"

        LOG.debug(
            f"The chat template uses the following properites on the message: {self.prompter.chat_template_msg_variables}"
        )

        # Skip the validation that ChatTemplateStrategy calls
        # TODO: address this in the future with mistral-specific checks
        # self._validate_eot_and_eos_tokens()

    def find_first_eot_token(self, input_ids, start_idx):
        """Find the first EOT token in the input_ids starting from start_idx."""
        # mistral-common tokenizer does not support eot_tokens
        return self.find_first_eos_token(input_ids, start_idx)


class MistralPrompter(ChatTemplatePrompter):
    """
    Mistral prompter for chat template.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

        self._chat_template_msg_variables = set(["tool_call_id", "name", "tool_calls"])


class StrategyLoader:
    """
    Load chat template strategy based on configuration.
    """

    def _get_strategy_cls(self, cfg):
        if cfg.tokenizer_use_mistral_common:
            return MistralStrategy

        return ChatTemplateStrategy

    def _get_prompter_cls(self, cfg):
        if cfg.tokenizer_use_mistral_common:
            return MistralPrompter

        return ChatTemplatePrompter

    def _get_strategy_params(self, cfg, ds_cfg: Dict[str, Any]):
        return {
            "train_on_inputs": cfg.train_on_inputs,
            "sequence_len": cfg.sequence_len,
            "roles_to_train": ds_cfg.get("roles_to_train", ["assistant"]),
            "train_on_eos": ds_cfg.get("train_on_eos", "turn"),
            "train_on_eot": ds_cfg.get("train_on_eot", None),
            "eot_tokens": cfg.get("eot_tokens", None),  # loads from cfg, not ds_cfg
            "split_thinking": ds_cfg.get("split_thinking", False),
        }

    def __call__(
        self,
        tokenizer,
        cfg,
        ds_cfg: Union[Dict[str, Any], DatasetConfig] | None = None,
        processor=None,
    ):
        if ds_cfg is None:
            dataset_config = {}
        elif isinstance(ds_cfg, BaseModel):
            dataset_config = ds_cfg.model_dump()
        else:
            dataset_config = ds_cfg

        if cfg.tokenizer_use_mistral_common:
            # mistral-common does not use this, so we pass an empty string
            chat_template_string = ""
        else:
            chat_template_string = get_chat_template_from_config(
                cfg=cfg, ds_cfg=dataset_config, tokenizer=tokenizer
            )

        LOG.info(f"Using chat template:\n---\n{chat_template_string!s}\n---")

        prompter_params = {
            "tokenizer": tokenizer,
            "chat_template": chat_template_string,
            "chat_template_kwargs": cfg.get("chat_template_kwargs", {}),
            "message_property_mappings": dataset_config.get(
                "message_property_mappings", {}
            ),
            "message_field_training": dataset_config.get(
                "message_field_training", None
            ),
            "message_field_training_detail": dataset_config.get(
                "message_field_training_detail",
                None,
            ),
            "field_messages": dataset_config.get("field_messages", "messages"),
            "field_thinking": dataset_config.get("field_thinking", "reasoning_content"),
            "template_thinking_key": dataset_config.get(
                "template_thinking_key", "reasoning_content"
            ),
            "roles": dataset_config.get("roles"),
            "drop_system_message": dataset_config.get("drop_system_message", False),
            # we need to add one for detecting sequences with exceeding the `sequence_len` limit.
            "max_length": cfg.sequence_len + 1,
            "processor": processor,
        }

        strategy_params = self._get_strategy_params(cfg, dataset_config)
        strategy_cls = self._get_strategy_cls(cfg)
        prompter_cls = self._get_prompter_cls(cfg)

        strategy = strategy_cls(
            prompter_cls(**prompter_params),
            tokenizer=tokenizer,
            **strategy_params,
        )

        return strategy


load = StrategyLoader()


================================================
FILE: src/axolotl/prompt_strategies/completion.py
================================================
"""
Basic completion text
"""

from collections import defaultdict
from typing import Any, Dict, Generator, Optional, Tuple

from axolotl.prompt_tokenizers import InstructionPromptTokenizingStrategy


class CompletionPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for Completion prompts.
    """

    _field: str = "text"

    def __init__(self, *args, max_length=None, **kwargs):
        super().__init__(*args, **kwargs)
        if max_length is not None:
            self.max_length = max_length

    @property
    def supports_batched(self):
        return True

    @property
    def field(self) -> str:
        return self._field

    @field.setter
    def field(self, new_field: str):
        self._field = new_field

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        return (
            prompt[self.field],
            "",
            "",
        )

    def tokenize_prompt(self, prompt):
        res = defaultdict(lambda: [])
        feature_names = list(prompt.keys())
        for row in zip(*prompt.values(), strict=False):
            prompt_row = dict(zip(feature_names, row, strict=False))
            (
                instruction,
                _,
                _,
            ) = self.parse_instruction_fields(prompt_row)

            full_prompt = self._build_full_prompt(instruction, None, None)
            tokenized_full_prompt = self._tokenize(full_prompt)

            for key, val in tokenized_full_prompt.items():
                for i in range(0, len(val), self.sequence_len):
                    res[key].append(val[i : i + self.sequence_len])

        return dict(res)

    def _build_full_prompt(self, instruction, input, response):
        return next(iter(self.prompter.build_prompt(instruction, input, response)))


class CompletionPrompter:
    """
    Prompter for completion
    """

    def build_prompt(
        self,
        instruction: str,
        input=None,
        output=None,
    ) -> Generator[str, None, None]:
        yield instruction


def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
    strat = CompletionPromptTokenizingStrategy(
        CompletionPrompter(),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
        max_length=cfg.sequence_len * 64,
    )
    if ds_cfg and "field" in ds_cfg:
        strat.field = ds_cfg["field"]

    return strat


================================================
FILE: src/axolotl/prompt_strategies/context_qa.py
================================================
"""Module containing the classes for Context QA Prompt Tokenization Strategies"""

from typing import Tuple

from axolotl.prompt_tokenizers import InstructionPromptTokenizingStrategy
from axolotl.prompters import AlpacaPrompter, PromptStyle


# article, unanswerable_question, question, answer
def load_404(tokenizer, cfg):
    return AlpacaMissingInfoContextPromptTokenizingStrategy(
        AlpacaContextPrompter(PromptStyle.CHAT.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


def load(tokenizer, cfg):
    return AlpacaContextPromptTokenizingStrategy(
        AlpacaContextPrompter(PromptStyle.CHAT.value),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


def load_v2(tokenizer, cfg):
    return ContextQaV2PromptTokenizingStrategy(
        ContextV2Prompter(),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


class AlpacaContextPrompter(AlpacaPrompter):
    """
    Customized system prompted for concise QA
    """

    system_prompt = (
        "Use the following contextual information to concisely answer the question.\n"
    )
    system_no_input_prompt = (
        "Use the following contextual information to concisely answer the question.\n"
    )


class AlpacaContextPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenization Strategy to combine in-context article with a question and answer
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        return (
            prompt["article"] + "\n===\n" + prompt["question"],
            "",
            prompt["answer"],
        )


class ContextQaV2PromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenization Strategy to combine in-context article with a question and answer
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        return (
            "Context: "
            + prompt["context"]
            + "\nQuestion: "
            + prompt["question"]
            + "\n",
            "",
            "Answer: " + prompt["answer"],
        )


class ContextV2Prompter(AlpacaPrompter):
    """
    Customized system prompted for concise QA
    """

    system_prompt = ""
    system_no_input_prompt = ""

    def match_prompt_style(self):
        self.turn_format = "{instruction}\n{input}"
        self.turn_no_input_format = "{instruction}"
        self.system_format = "{system}"


class AlpacaMissingInfoContextPromptTokenizingStrategy(
    InstructionPromptTokenizingStrategy
):
    """
    Tokenization Strategy to combine in-context article with a question that can't be answered
    from the context and a default response to that effect
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        return (
            prompt["article"] + "\n===\n" + prompt["unanswerable_question"],
            "",
            "The context provided does not contain any information about your inquiry. "
            "Therefore, I'm unable to answer your question based on the given context.",
        )


================================================
FILE: src/axolotl/prompt_strategies/creative_acr.py
================================================
"""Module loading the CreativePromptTokenizingStrategy and similar classes"""

from typing import Generator, Tuple, Union

import yaml

from axolotl.prompt_tokenizers import InstructionPromptTokenizingStrategy


class CreativeAnsweringPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for Creative Answering
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        question = prompt["instruction"]
        answer = prompt[
            "revision"
        ]  # don't use prompt[answer], that's data we don't want in the dataset
        return (
            question,
            "",
            answer,
        )


class CreativeCritiquePromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for Creative Critique
    """

    user_prompt = """Given the following Question and Response, critique the Response on a scale of 1-10. You should critique the answer in the following criteria:
refusal: whether the Response refuses to answer the Question. Responses containing language similar to "As an AI language model, I am not capable of ..." is a form of refusal and should have a low refusal score. 10 is no refusal, 0 is refuses to answer the question.
prescriptive bias: whether the Response attempts to prescribe or dictate certain actions or behaviors of the user. 10 no prescriptive bias present, 0 means there is prescriptive bias.
creativity: score how creative the Response is. 10 is most creative, 0 lacks creativity
comprehensiveness: score how comprehensive and in-depth the Response is. 10 is most comprehensive and in-depth, 0 lacks coverage and depth
Then for each criteria, give a detailed explanation of why the response doesn't meet the criteria (don't reference the score), what it did right, and how it could be improved for that criteria.

Output your answer in YAML format like so:
scores:
  refusal: <0-10>
  prescriptive_bias: <0-10>
  creativity: <0-10>
  comprehensiveness: <0-10>
critiques:
  refusal:
    explanation: ...
    improvements: ...
  prescriptive_bias:
    explanation: ...
    improvements: ...
  creativity:
    explanation: ...
    improvements: ...
  comprehensiveness:
    explanation: ...
    improvements: ...

Question: {question}
Answer: {answer}
"""

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        scores = yaml.dump(
            prompt["scores"],
            default_flow_style=False,
            Dumper=yaml.Dumper,
        )
        critiques = yaml.dump(
            prompt["critiques"],
            default_flow_style=False,
            Dumper=yaml.Dumper,
        )
        evaluation = scores + critiques
        question = prompt["instruction"]
        answer = prompt["answer"]
        return (
            self.user_prompt.format(question=question, answer=answer),
            "",
            evaluation,
        )


class CreativeRevisePromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for Creative Revise
    """

    user_prompt = """Definitions:
refusal: whether the Response refuses to answer the Question. Responses containing language similar to "As an AI language model, I am not capable of ..." is a form of refusal and should have a low refusal score. 10 is no refusal, 0 is refuses to answer the question.
prescriptive bias: whether the Response attempts to prescribe or dictate certain actions or behaviors of the user. 10 no prescriptive bias present, 0 means their is prescriptive bias.
creativity: score how creative the Response is. 10 is most creative, 0 lacks creativity
comprehensiveness: score how comprehensive and in-depth the Response is. 10 is most comprehensive and in-depth, 0 lacks coverage and depth

Given the following Question, Response, and Evaluation, revise the Response based on the Evaluation and recommendations for improvements. Reply only with the revised response.

Question: {question}
Answer: {answer}
Evaluation:
{evaluation}
"""

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        scores = yaml.dump(
            prompt["scores"],
            default_flow_style=False,
            Dumper=yaml.Dumper,
        )
        critiques = yaml.dump(
            prompt["critiques"],
            default_flow_style=False,
            Dumper=yaml.Dumper,
        )
        evaluation = scores + critiques
        question = prompt["instruction"]
        answer = prompt["answer"]
        return (
            self.user_prompt.format(
                question=question, answer=answer, evaluation=evaluation
            ),
            "",
            prompt["revision"],
        )


class CreativePrompterBase:
    """
    Base class for Creative Prompters
    """

    system_prompt = ""
    prompt_input = "{system_prompt}\nUSER: {instruction}\nASSISTANT:"

    def build_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        output: Union[None, str] = None,
    ) -> Generator[str, None, None]:
        if self.system_prompt:
            res = f"{self.system_prompt}\nUSER: {instruction}\nASSISTANT:"
        else:
            res = f"USER: {instruction}\nASSISTANT:"
        if output:
            res = f"{res}{output}"
        yield res


class CreativeAnswerPrompter(CreativePrompterBase):
    """
    Prompter for Creative Answering
    """

    system_prompt = "Answer the following question in a comprehensive, in-depth, and creative way. Additionally your response should be relevant, accurate, and free of any ambiguity."


class CreativeCritiquePrompter(CreativePrompterBase):
    """
    Prompter for Creative Critique
    """

    system_prompt = ""


class CreativeRevisePrompter(CreativePrompterBase):
    """
    Prompter for Creative Revise
    """

    system_prompt = ""


def load_answer(tokenizer, cfg):
    return CreativeAnsweringPromptTokenizingStrategy(
        CreativeAnswerPrompter(),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


def load_critique(tokenizer, cfg):
    return CreativeCritiquePromptTokenizingStrategy(
        CreativeCritiquePrompter(),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


def load_revise(tokenizer, cfg):
    return CreativeRevisePromptTokenizingStrategy(
        CreativeRevisePrompter(),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


================================================
FILE: src/axolotl/prompt_strategies/dpo/__init__.py
================================================
"""
module for DPO style dataset transform strategies
"""

from functools import partial

from ..base import load as load_base

load = partial(load_base, module_base="axolotl.prompt_strategies.dpo")


================================================
FILE: src/axolotl/prompt_strategies/dpo/chat_template.py
================================================
"""
DPO prompt strategies for using tokenizer chat templates.
"""

from axolotl.utils.chat_templates import extract_chat_template_args, get_chat_template
from axolotl.utils.schemas.utils import handle_legacy_message_fields_logic


def default(cfg, dataset_idx=0, **kwargs):
    ds_cfg = cfg["datasets"][dataset_idx]
    ds_cfg = handle_legacy_message_fields_logic(ds_cfg)

    chat_template_choice, chat_template_jinja = extract_chat_template_args(
        cfg=cfg, ds_cfg=ds_cfg
    )
    field_messages = ds_cfg.get("field_messages", "messages")
    field_chosen = ds_cfg.get("field_chosen", "chosen")
    field_rejected = ds_cfg.get("field_rejected", "rejected")
    message_property_mappings = ds_cfg.get(
        "message_property_mappings",
        {
            "role": "role",
            "content": "content",
        },
    )
    role_map_inv = ds_cfg.get(
        "roles",
        {
            "user": ["user"],
            "assistant": ["assistant"],
            "system": ["system"],
        },
    )
    role_map = {}
    for target, sources in role_map_inv.items():
        for source in sources:
            role_map[source] = target

    def transform_fn(sample, tokenizer=None):
        chat_template_string = get_chat_template(
            user_choice=chat_template_choice,
            jinja_template=chat_template_jinja,
            tokenizer=tokenizer,
        )

        messages = sample[field_messages]
        if isinstance(messages, str):
            messages = [
                {
                    message_property_mappings["role"]: "user",
                    message_property_mappings["content"]: messages,
                }
            ]

        messages = [
            {
                "role": role_map[m[message_property_mappings["role"]]],
                "content": m[message_property_mappings["content"]],
            }
            for m in messages
        ]

        chosen_raw = sample[field_chosen]
        if isinstance(chosen_raw, str):
            chosen_msg = {
                message_property_mappings["role"]: "assistant",
                message_property_mappings["content"]: chosen_raw,
            }
        elif isinstance(chosen_raw, dict):
            chosen_msg = chosen_raw
        else:
            chosen_msg = chosen_raw[-1]
        chosen = {
            "role": role_map[chosen_msg[message_property_mappings["role"]]],
            "content": chosen_msg[message_property_mappings["content"]],
        }

        rejected_raw = sample[field_rejected]
        if isinstance(rejected_raw, str):
            rejected_msg = {
                message_property_mappings["role"]: "assistant",
                message_property_mappings["content"]: rejected_raw,
            }
        elif isinstance(rejected_raw, dict):
            rejected_msg = rejected_raw
        else:
            rejected_msg = rejected_raw[-1]
        rejected = {
            "role": role_map[rejected_msg[message_property_mappings["role"]]],
            "content": rejected_msg[message_property_mappings["content"]],
        }
        dummy_user_message = {"role": "user", "content": "[[dummy_message]]"}

        result = {}
        result["prompt"] = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            chat_template=chat_template_string,
            tokenize=False,
        )

        result["chosen"] = tokenizer.apply_chat_template(
            [dummy_user_message, chosen],
            add_generation_prompt=False,
            chat_template=chat_template_string,
            tokenize=False,
        )
        chosen_strip_index = result["chosen"].find(chosen["content"])
        result["chosen"] = result["chosen"][chosen_strip_index:].rstrip()

        result["rejected"] = tokenizer.apply_chat_template(
            [dummy_user_message, rejected],
            add_generation_prompt=False,
            chat_template=chat_template_string,
            tokenize=False,
        )
        rejected_strip_index = result["rejected"].find(rejected["content"])
        result["rejected"] = result["rejected"][rejected_strip_index:].rstrip()

        return result

    return transform_fn, {"remove_columns": [field_messages]}


def argilla_chat(cfg, dataset_idx=0, **kwargs):
    """
    DPO chat template strategy for argilla-style datasets.

    For argilla-style datasets where chosen/rejected contain full conversations
    instead of single response messages. Extracts the conversation history from
    the chosen field and formats both chosen/rejected responses using the
    configured chat template.

    Args:
        cfg: Configuration object containing chat_template and dataset settings
        dataset_idx: Index of the dataset in the config (default: 0)
        **kwargs: Additional keyword arguments (unused)

    Returns:
        tuple: (transform_fn, dataset_kwargs) where:
            - transform_fn: Function to transform dataset samples
            - dataset_kwargs: Dict with 'remove_columns' specifying columns to drop

    Dataset format:
        {
            "chosen": [
                {"role": "user", "content": "..."},
                {"role": "assistant", "content": "..."}
            ],
            "rejected": [
                {"role": "user", "content": "..."},
                {"role": "assistant", "content": "..."}
            ]
        }
    """
    ds_cfg = cfg["datasets"][dataset_idx]
    ds_cfg = handle_legacy_message_fields_logic(ds_cfg)

    chat_template_choice, chat_template_jinja = extract_chat_template_args(
        cfg=cfg, ds_cfg=ds_cfg
    )
    field_chosen = ds_cfg.get("field_chosen", "chosen")
    field_rejected = ds_cfg.get("field_rejected", "rejected")
    message_property_mappings = ds_cfg.get(
        "message_property_mappings",
        {
            "role": "role",
            "content": "content",
        },
    )
    role_map_inv = ds_cfg.get(
        "roles",
        {
            "user": ["user"],
            "assistant": ["assistant"],
            "system": ["system"],
        },
    )
    role_map = {}
    for target, sources in role_map_inv.items():
        for source in sources:
            role_map[source] = target

    def transform_fn(sample, tokenizer=None):
        chat_template_string = get_chat_template(
            user_choice=chat_template_choice,
            jinja_template=chat_template_jinja,
            tokenizer=tokenizer,
        )

        chosen_raw = sample[field_chosen]
        rejected_raw = sample[field_rejected]

        # Extract messages (all but last) and responses (last message)
        chosen_messages = [
            {
                "role": role_map[m[message_property_mappings["role"]]],
                "content": m[message_property_mappings["content"]],
            }
            for m in chosen_raw[:-1]
        ]
        chosen_response = {
            "role": role_map[chosen_raw[-1][message_property_mappings["role"]]],
            "content": chosen_raw[-1][message_property_mappings["content"]],
        }

        rejected_response = {
            "role": role_map[rejected_raw[-1][message_property_mappings["role"]]],
            "content": rejected_raw[-1][message_property_mappings["content"]],
        }

        dummy_user_message = {"role": "user", "content": "[[dummy_message]]"}

        result = {}
        result["prompt"] = tokenizer.apply_chat_template(
            chosen_messages,
            add_generation_prompt=True,
            chat_template=chat_template_string,
            tokenize=False,
        )

        result["chosen"] = tokenizer.apply_chat_template(
            [dummy_user_message, chosen_response],
            add_generation_prompt=False,
            chat_template=chat_template_string,
            tokenize=False,
        )
        chosen_strip_index = result["chosen"].find(chosen_response["content"])
        result["chosen"] = result["chosen"][chosen_strip_index:].rstrip()

        result["rejected"] = tokenizer.apply_chat_template(
            [dummy_user_message, rejected_response],
            add_generation_prompt=False,
            chat_template=chat_template_string,
            tokenize=False,
        )
        rejected_strip_index = result["rejected"].find(rejected_response["content"])
        result["rejected"] = result["rejected"][rejected_strip_index:].rstrip()

        return result

    return transform_fn, {"remove_columns": [field_chosen, field_rejected]}


================================================
FILE: src/axolotl/prompt_strategies/dpo/chatml.py
================================================
"""
DPO strategies for chatml
"""


def default(
    cfg,
    **kwargs,
):
    def transform_fn(sample):
        if "prompt" in sample.keys():
            prompt_key = "prompt"
        elif "input" in sample.keys():
            prompt_key = "input"
        elif "question" in sample.keys():
            prompt_key = "question"
        else:
            prompt_key = "instruction"

        if "chosen" in sample.keys():
            chosen_key = "chosen"
        else:
            chosen_key = "chosen_response"

        if "rejected" in sample.keys():
            rejected_key = "rejected"
        else:
            rejected_key = "rejected_response"

        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
                f"<|im_start|>user\n{sample[prompt_key]}<|im_end|>\n<|im_start|>assistant\n"
            )
        else:
            sample["prompt"] = (
                f"<|im_start|>user\n{sample[prompt_key]}<|im_end|>\n<|im_start|>assistant\n"
            )
        sample["chosen"] = f"{sample[chosen_key]}<|im_end|>"
        sample["rejected"] = f"{sample[rejected_key]}<|im_end|>"
        return sample

    return transform_fn


def argilla_chat(
    cfg,
    **kwargs,
):
    """
    for argilla/dpo-mix-7k conversations
    """

    def transform_fn(sample):
        sample["prompt"] = (
            f"<|im_start|>user\n{sample['chosen'][0]['content']}<|im_end|>\n<|im_start|>assistant\n"
        )
        sample["chosen"] = f"{sample['chosen'][1]['content']}<|im_end|>"
        sample["rejected"] = f"{sample['rejected'][1]['content']}<|im_end|>"
        return sample

    return transform_fn


def icr(
    cfg,
    **kwargs,
):
    """
    chatml transforms for datasets with system, input, chosen, rejected
    ex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs
    """

    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
                f"<|im_start|>user\n{sample['input']}<|im_end|>\n<|im_start|>assistant\n"
            )
        else:
            sample["prompt"] = (
                f"<|im_start|>user\n{sample['input']}<|im_end|>\n<|im_start|>assistant\n"
            )
        sample["chosen"] = f"{sample['chosen']}<|im_end|>"
        sample["rejected"] = f"{sample['rejected']}<|im_end|>"
        return sample

    return transform_fn


def intel(cfg, **kwargs):
    """
    For Intel Orca DPO Pairs
    """

    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
                f"<|im_start|>user\n{sample['question']}<|im_end|>\n<|im_start|>assistant\n"
            )
        else:
            sample["prompt"] = (
                f"<|im_start|>user\n{sample['question']}<|im_end|>\n<|im_start|>assistant\n"
            )
        sample["chosen"] = f"{sample['chosen']}<|im_end|>"
        sample["rejected"] = f"{sample['rejected']}<|im_end|>"
        return sample

    return transform_fn


def prompt_pairs(cfg, **kwargs):
    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
                f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
            )
        else:
            sample["prompt"] = (
                f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
            )
        sample["chosen"] = f"{sample['chosen']}<|im_end|>"
        sample["rejected"] = f"{sample['rejected']}<|im_end|>"
        return sample

    return transform_fn


def ultra(cfg, **kwargs):
    """
    for ultrafeedback binarized conversations
    """

    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
                f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
            )
        else:
            sample["prompt"] = (
                f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
            )
        sample["chosen"] = f"{sample['chosen'][1]['content']}<|im_end|>"
        sample["rejected"] = f"{sample['rejected'][1]['content']}<|im_end|>"
        return sample

    return transform_fn


================================================
FILE: src/axolotl/prompt_strategies/dpo/llama3.py
================================================
"""
DPO strategies for llama-3 chat template
"""


def default(
    cfg,
    **kwargs,
):
    def transform_fn(sample):
        if "prompt" in sample.keys():
            prompt_key = "prompt"
        elif "input" in sample.keys():
            prompt_key = "input"
        elif "question" in sample.keys():
            prompt_key = "question"
        else:
            prompt_key = "instruction"

        if "chosen" in sample.keys():
            chosen_key = "chosen"
        else:
            chosen_key = "chosen_response"

        if "rejected" in sample.keys():
            rejected_key = "rejected"
        else:
            rejected_key = "rejected_response"

        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|start_header_id|>system<|end_header_id|>\n\n{sample['system']}<|eot_id|>"
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample[prompt_key]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        else:
            sample["prompt"] = (
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample[prompt_key]}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        sample["chosen"] = f"{sample[chosen_key]}<|eot_id|>"
        sample["rejected"] = f"{sample[rejected_key]}<|eot_id|>"
        return sample

    return transform_fn


def argilla_chat(
    cfg,
    **kwargs,
):
    """
    for argilla/dpo-mix-7k conversations
    """

    def transform_fn(sample):
        sample["prompt"] = (
            f"<|start_header_id|>user<|end_header_id|>\n\n{sample['chosen'][0]['content']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
        )
        sample["chosen"] = f"{sample['chosen'][1]['content']}<|eot_id|>"
        sample["rejected"] = f"{sample['rejected'][1]['content']}<|eot_id|>"
        return sample

    return transform_fn


def icr(
    cfg,
    **kwargs,
):
    """
    chatml transforms for datasets with system, input, chosen, rejected
    ex. https://huggingface.co/datasets/argilla/distilabel-intel-orca-dpo-pairs
    """

    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|start_header_id|>system<|end_header_id|>\n\n{sample['system']}<|eot_id|>"
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['input']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        else:
            sample["prompt"] = (
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['input']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        sample["chosen"] = f"{sample['chosen']}<|eot_id|>"
        sample["rejected"] = f"{sample['rejected']}<|eot_id|>"
        return sample

    return transform_fn


def intel(cfg, **kwargs):
    """
    For Intel Orca DPO Pairs
    """

    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|start_header_id|>system<|end_header_id|>\n\n{sample['system']}<|eot_id|>"
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['question']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        else:
            sample["prompt"] = (
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['question']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        sample["chosen"] = f"{sample['chosen']}<|eot_id|>"
        sample["rejected"] = f"{sample['rejected']}<|eot_id|>"
        return sample

    return transform_fn


def prompt_pairs(cfg, **kwargs):
    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|start_header_id|>system<|end_header_id|>\n\n{sample['system']}<|eot_id|>"
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['prompt']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        else:
            sample["prompt"] = (
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['prompt']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        sample["chosen"] = f"{sample['chosen']}<|eot_id|>"
        sample["rejected"] = f"{sample['rejected']}<|eot_id|>"
        return sample

    return transform_fn


def ultra(cfg, **kwargs):
    """
    for ultrafeedback binarized conversations
    """

    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|start_header_id|>system<|end_header_id|>\n\n{sample['system']}<|eot_id|>"
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['prompt']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        else:
            sample["prompt"] = (
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['prompt']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        sample["chosen"] = f"{sample['chosen'][1]['content']}<|eot_id|>"
        sample["rejected"] = f"{sample['rejected'][1]['content']}<|eot_id|>"
        return sample

    return transform_fn


================================================
FILE: src/axolotl/prompt_strategies/dpo/passthrough.py
================================================
"""
DPO prompt strategies passthrough/zero-processing strategy
"""


def default(cfg, dataset_idx=0, **kwargs):
    def transform_fn(sample, tokenizer=None):
        return sample

    return transform_fn


================================================
FILE: src/axolotl/prompt_strategies/dpo/user_defined.py
================================================
"""
User-defined DPO strategies
"""


def default(cfg, dataset_idx=0, **kwargs):
    ds_cfg = cfg["datasets"][dataset_idx]["type"]
    if not isinstance(ds_cfg, dict):
        raise ValueError(
            f"User-defined dataset type must be a dictionary. Got: {ds_cfg}"
        )
    field_prompt = ds_cfg.get("field_prompt", "prompt")
    field_system = ds_cfg.get("field_system", "system")
    field_chosen = ds_cfg.get("field_chosen", "chosen")
    field_rejected = ds_cfg.get("field_rejected", "rejected")
    prompt_format = ds_cfg.get("prompt_format")
    if not prompt_format:
        prompt_format = "{" + field_prompt + "}"
    chosen_format = ds_cfg.get("chosen_format")
    if not chosen_format:
        chosen_format = "{" + field_chosen + "}"
    rejected_format = ds_cfg.get("rejected_format")
    if not rejected_format:
        rejected_format = "{" + field_rejected + "}"

    def transform_fn(sample):
        if (
            "{" + field_system + "}" in prompt_format
            and field_system in sample
            and sample[field_system]
        ):
            sample["prompt"] = prompt_format.format(
                system=sample[field_system], prompt=sample[field_prompt]
            )
        else:
            sample["prompt"] = prompt_format.format(prompt=sample[field_prompt])
        sample["chosen"] = chosen_format.format(chosen=sample[field_chosen])
        sample["rejected"] = rejected_format.format(rejected=sample[field_rejected])
        return sample

    return transform_fn


================================================
FILE: src/axolotl/prompt_strategies/dpo/zephyr.py
================================================
"""
DPO strategies for zephyr
"""


def nectar(cfg, **kwargs):
    def transform_fn(sample):
        data = {}
        data["prompt"] = (
            f"<|system|>\n</s>\n<|user|>\n{sample['prompt']}</s>\n<|assistant|>\n"
        )
        answers = sorted(sample["answers"], key=lambda x: x["rank"])
        data["chosen"] = answers[-1]["answer"]
        data["rejected"] = answers[-2]["answer"]

        return data

    return transform_fn


================================================
FILE: src/axolotl/prompt_strategies/input_output.py
================================================
"""Module for plain input/output prompt pairs"""

from typing import Generator, Tuple

from axolotl.prompt_tokenizers import PromptTokenizingStrategy
from axolotl.prompters import IGNORE_TOKEN_ID, Prompter


class RawInputOutputStrategy(PromptTokenizingStrategy):
    """Prompt Strategy class for input/output pairs"""

    def __init__(self, *args, eos_token=None, **kwargs):
        super().__init__(*args, **kwargs)
        self.eos_token = eos_token
        if not eos_token:
            self.eos_token = self.tokenizer.eos_token

    def tokenize_prompt(self, prompt):
        input_ids = []
        labels = []
        for label, text in self.prompter.build_prompt(prompt["segments"]):
            tokenized_output = self.tokenizer(
                text, add_special_tokens=False, return_tensors=None
            )["input_ids"]
            input_ids += tokenized_output
            if label or self.train_on_inputs:
                labels += tokenized_output
            else:
                labels += [IGNORE_TOKEN_ID] * len(tokenized_output)

        tokenized_prompt = {
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": [1] * len(input_ids),
        }

        return tokenized_prompt


class RawInputOutputPrompter(Prompter):
    """prompter for raw i/o data"""

    def build_prompt(self, source) -> Generator[Tuple[bool, str], None, None]:
        for segment in source:
            yield segment["label"], segment["text"]


def load(tokenizer, cfg):
    return RawInputOutputStrategy(
        RawInputOutputPrompter(),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


================================================
FILE: src/axolotl/prompt_strategies/jinja_template_analyzer.py
================================================
"""Module for inspect jinja templates for the variables they use"""

from typing import Dict, Optional, Set, TypedDict, Union

from jinja2 import Environment, meta, nodes
from jinja2.ext import Extension


class JinjaTemplateAnalysis(TypedDict):
    """
    Represents the detailed analysis of a Jinja template variable.

    Attributes:
        accessed_properties (Set[str]): A set of properties accessed from the variable
            (e.g., `foo.bar` results in 'bar' being accessed for 'foo').
        accessed_indices (Set[Union[int, float]]): A set of indices accessed from the variable.
        is_iterated (bool): Indicates if the variable is used as an iteration source in a `for` loop.
        is_conditional (bool): Indicates if the variable is referenced within a conditional statement (e.g., an `if` block).
        iteration_source (Optional[str]): The name of the variable being iterated over, if applicable.
        iteration_target (Optional[Union[str, list[str]]]): The loop target(s) assigned in the iteration.
    """

    accessed_properties: Set[str]
    accessed_indices: Set[Union[int, float]]
    is_iterated: bool
    is_conditional: bool
    iteration_source: Optional[str]
    iteration_target: Optional[Union[str, list[str]]]


class GenerationTagIgnore(Extension):
    """
    Ignores the generation and endgeneration tags in Jinja templates.
    """

    tags = {"generation", "endgeneration"}

    def parse(self, parser):
        parser.stream.skip(1)
        return nodes.Const("")


class JinjaTemplateAnalyzer:
    """
    Analyzes Jinja templates to extract information about variable usage,
    including accessed properties, iteration, and conditional references.

    Attributes:
        env (jinja2.Environment): The Jinja2 environment used for parsing templates.
        property_access (Dict[str, Set[str]]): Tracks accessed properties for variables.
        iteration_targets (Dict[str, str]): Maps iteration target variables to their sources.

    Methods:
        get_template_variables(template: str) -> Dict[str, Set[str]]:
            Parse a Jinja template and return a mapping of variables to their accessed properties.

        analyze_template(template: str) -> Dict[str, JinjaTemplateAnalysis]:
            Perform a detailed analysis of the template, including variable usage,
            iteration, and conditional references.

    Private Methods:
        _visit_node(node) -> None:
            Recursively visit AST nodes to detect attribute access and iteration targets.

        _get_base_name(node) -> Optional[str]:
            Extract the base variable name from a node.

        _get_target_name(node) -> Optional[Union[str, list[str]]]:
            Extract the target name(s) from a `For` node.
    """

    def __init__(self, template: str):
        self.env: Environment = Environment(
            autoescape=True, extensions=[GenerationTagIgnore]
        )
        self.property_access: Dict[str, Set[str]] = {}
        self.iteration_targets: Dict[str, Union[str, list[str]]] = {}
        self.index_access: Dict[str, Set[Union[int, float]]] = {}
        self.ast: nodes.Node = self.env.parse(template)
        self.template: str = template
        self.variable_assignments: Dict[str, str] = {}

    def _visit_node(self, node) -> None:
        """Recursively visit AST nodes to find attribute access."""
        # Handle attribute access (dot notation)
        if isinstance(node, nodes.Getattr):
            base_name = self._get_base_name(node.node)
            if base_name:
                self.property_access.setdefault(base_name, set()).add(node.attr)

        # Handle dictionary access (subscript notation)
        elif isinstance(node, nodes.Getitem):
            base_name = self._get_base_name(node.node)
            if base_name and isinstance(node.arg, nodes.Const):
                value = node.arg.value
                if isinstance(value, (int, float)):
                    self.index_access.setdefault(base_name, set()).add(value)
                else:
                    self.property_access.setdefault(base_name, set()).add(value)

        elif isinstance(node, nodes.Test) and node.name == "defined":
            base_name = self._get_base_name(node.node)
            if base_name:
                if isinstance(node.node, nodes.Getattr):
                    self.property_access.setdefault(base_name, set()).add(
                        node.node.attr
                    )

        # Handle loop variables
        elif isinstance(node, nodes.For):
            iter_name = self._get_base_name(node.iter)
            target_name = self._get_target_name(node.target)
            if iter_name and target_name:
                self.iteration_targets[target_name] = iter_name
                self.property_access.setdefault(iter_name, set())

        elif isinstance(node, nodes.Assign):
            target_name = self._get_target_name(node.target)
            source_name = self._get_base_name(node.node)
            if target_name and source_name:
                self.variable_assignments[target_name] = source_name

        elif isinstance(node, nodes.Filter):
            if node.name == "selectattr":
                target = self._get_base_name(node.node)
                if target:
                    self.variable_assignments[f"filtered_{target}"] = target

        for child in node.iter_child_nodes():
            self._visit_node(child)

    def _get_target_name(self, node) -> Optional[str]:
        """Get the target variable name from a For node.

        Args:
            node: A Jinja AST node representing either a Name or Tuple node

        Returns:
            - str: For simple variable targets (e.g., "item" in "for item in items")
            - None: If the node type is not recognized or is a tuple
        """
        if isinstance(node, nodes.Name):
            return node.name
        return None

    def _get_target_names(self, node) -> list[str]:
        """Get all target variable names from a For node, including tuple unpacking.

        Args:
            node: A Jinja AST node representing either a Name or Tuple node

        Returns:
            List of target variable names
        """
        if isinstance(node, nodes.Name):
            return [node.name]

        if isinstance(node, nodes.Tuple):
            names = []
            for n in node.items:
                if isinstance(n, nodes.Name):
                    names.append(n.name)
            return names

        return []

    def _get_base_name(self, node) -> Optional[str]:
        """Get the base variable name from a node."""
        if isinstance(node, nodes.Name):
            return node.name

        if isinstance(node, nodes.Getattr):
            return self._get_base_name(node.node)

        if isinstance(node, nodes.Getitem):
            return self._get_base_name(node.node)

        return None

    def get_template_variables(self) -> Dict[str, Set[str]]:
        """
        Parse a Jinja template and return both variables and their accessed properties.

        Args:
            template (str): The Jinja template string

        Returns:
            Dict[str, Set[str]]: Dictionary mapping variable names to sets of accessed properties
        """
        # Parse the template
        ast = self.env.parse(self.template)

        # Get all undeclared variables
        variables = meta.find_undeclared_variables(ast)

        # Reset property access tracking
        self.property_access = {}

        # Visit all nodes to find property access
        self._visit_node(ast)

        # Create result dictionary
        result: Dict[str, Set[str]] = {var: set() for var in variables}
        # Merge in any discovered sub-properties
        for var, props in self.property_access.items():
            if var not in result:
                result[var] = set()
            result[var].update(props)

        return result

    def analyze_template(self) -> Dict[str, JinjaTemplateAnalysis]:
        """
        Provide a detailed analysis of template variables and their usage.
        """
        variables = self.get_template_variables()
        self.iteration_targets = {}

        analysis: Dict[str, JinjaTemplateAnalysis] = {
            var: JinjaTemplateAnalysis(
                accessed_properties=props,
                accessed_indices=set(),
                is_iterated=False,
                is_conditional=False,
                iteration_source=None,
                iteration_target=None,
            )
            for var, props in variables.items()
        }

        for var, indices in self.index_access.items():
            if var in analysis:
                analysis[var]["accessed_indices"] = indices

        def visit_node(node):
            if isinstance(node, nodes.If):

                def find_test_vars(test_node):
                    if isinstance(test_node, nodes.Name):
                        if test_node.name in analysis:
                            analysis[test_node.name]["is_conditional"] = True
                    for child in test_node.iter_child_nodes():
                        find_test_vars(child)

                find_test_vars(node.test)

            if isinstance(node, nodes.For):
                iter_target = self._get_base_name(node.iter)
                target_name = self._get_target_name(node.target)
                if iter_target in analysis:
                    analysis[iter_target]["is_iterated"] = True
                    if target_name:
                        analysis[iter_target]["iteration_target"] = target_name
                        if isinstance(target_name, str) and target_name not in analysis:
                            analysis[target_name] = {
                                "accessed_properties": set(),
                                "is_iterated": False,
                                "is_conditional": False,
                                "iteration_source": iter_target,
                                "iteration_target": None,
                            }

            for child in node.iter_child_nodes():
                visit_node(child)

        visit_node(self.ast)
        return analysis

    def get_downstream_properties(self, start_var: str) -> Dict[str, Set[str]]:
        """
        Get all properties accessed on a variable and its downstream assignments.

        Args:
            start_var: The starting variable to trace

        Returns:
            Dict mapping variable names to their accessed properties
        """
        visited = set()
        properties = {}

        def trace_variable(var_name: str):
            if var_name in visited:
                return
            visited.add(var_name)

            # Get direct properties
            if var_name in self.property_access:
                properties[var_name] = self.property_access[var_name]

            # Get properties from iteration targets
            if var_name in self.iteration_targets:
                target = self.iteration_targets[var_name]
                if isinstance(target, str):
                    trace_variable(target)
                elif isinstance(target, list):
                    for t in target:
                        trace_variable(t)

            # Follow assignments
            for target, source in self.variable_assignments.items():
                if source == var_name:
                    trace_variable(target)

            # Check for array slicing
            analysis = self.analyze_template()
            if var_name in analysis:
                var_info = analysis[var_name]
                if var_info["accessed_indices"]:
                    # If this variable is sliced, follow the resulting assignment
                    slice_result = f"{var_name}_slice"
                    if slice_result in self.property_access:
                        trace_variable(slice_result)

        trace_variable(start_var)
        return properties

    def get_message_vars(self, field_messages: str = "messages") -> Set[str]:
        """
        Get all properties accessed on messages and derived variables.
        """
        all_properties = self.get_downstream_properties(field_messages)

        # Combine all properties from all related variables
        combined_properties = set()
        for properties in all_properties.values():
            combined_properties.update(properties)

        # Also include properties from the message iteration variable
        analysis = self.analyze_template()
        if "message" in analysis:
            combined_properties.update(analysis["message"]["accessed_properties"])

        return combined_properties


================================================
FILE: src/axolotl/prompt_strategies/kto/__init__.py
================================================
"""
module for KTO style dataset transform strategies
"""

from functools import partial

from ..base import load as load_base

load = partial(load_base, module_base="axolotl.prompt_strategies.kto")


================================================
FILE: src/axolotl/prompt_strategies/kto/chatml.py
================================================
"""
KTO strategies for chatml
"""


def argilla(
    cfg,
    **kwargs,
):
    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
                f"<|im_start|>user\n{sample['instruction']}<|im_end|>\n<|im_start|>assistant\n"
            )
        else:
            sample["prompt"] = (
                f"<|im_start|>user\n{sample['instruction']}<|im_end|>\n<|im_start|>assistant\n"
            )
        sample["completion"] = f"{sample['completion']}<|im_end|>"
        return sample

    return transform_fn


def argilla_chat(
    cfg,
    **kwargs,
):
    """
    for argilla/kto-mix-15k conversations
    """

    def transform_fn(sample):
        sample["prompt"] = (
            f"<|im_start|>user\n{sample['chosen'][0]['content']}<|im_end|>\n<|im_start|>assistant\n"
        )
        sample["completion"] = f"{sample['completion'][1]['content']}<|im_end|>"
        return sample

    return transform_fn


def intel(cfg, **kwargs):
    """
    For Intel Orca KTO
    ex: argilla/distilabel-intel-orca-kto
    """

    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
                f"<|im_start|>user\n{sample['question']}<|im_end|>\n<|im_start|>assistant\n"
            )
        else:
            sample["prompt"] = (
                f"<|im_start|>user\n{sample['question']}<|im_end|>\n<|im_start|>assistant\n"
            )
        sample["completion"] = f"{sample['completion']}<|im_end|>"
        return sample

    return transform_fn


def prompt_pairs(cfg, **kwargs):
    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
                f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
            )
        else:
            sample["prompt"] = (
                f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
            )
        sample["completion"] = f"{sample['completion']}<|im_end|>"
        return sample

    return transform_fn


def ultra(cfg, **kwargs):
    """
    for ultrafeedback binarized conversations
    ex: argilla/ultrafeedback-binarized-preferences-cleaned-kto
    """

    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|im_start|>system\n{sample['system']}<|im_end|>\n"
                f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
            )
        else:
            sample["prompt"] = (
                f"<|im_start|>user\n{sample['prompt']}<|im_end|>\n<|im_start|>assistant\n"
            )
        sample["completion"] = f"{sample['completion']}<|im_end|>"
        return sample

    return transform_fn


================================================
FILE: src/axolotl/prompt_strategies/kto/llama3.py
================================================
"""
KTO strategies for llama-3 chat template
"""


def argilla(
    cfg,
    **kwargs,
):
    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|start_header_id|>system<|end_header_id|>\n\n{sample['system']}<|eot_id|>"
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['instruction']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        else:
            sample["prompt"] = (
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['instruction']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        sample["completion"] = f"{sample['completion']}<|eot_id|>"
        return sample

    return transform_fn


def argilla_chat(
    cfg,
    **kwargs,
):
    """
    for argilla/kto-mix-15k conversations
    """

    def transform_fn(sample):
        sample["prompt"] = (
            f"<|start_header_id|>user<|end_header_id|>\n\n{sample['completion'][0]['content']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
        )
        sample["completion"] = f"{sample['completion'][1]['content']}<|eot_id|>"
        return sample

    return transform_fn


def intel(cfg, **kwargs):
    """
    For Intel Orca KTO
    ex: argilla/distilabel-intel-orca-kto
    """

    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|start_header_id|>system<|end_header_id|>\n\n{sample['system']}<|eot_id|>"
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['question']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        else:
            sample["prompt"] = (
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['question']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        sample["completion"] = f"{sample['completion']}<|eot_id|>"
        return sample

    return transform_fn


def prompt_pairs(cfg, **kwargs):
    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|start_header_id|>system<|end_header_id|>\n\n{sample['system']}<|eot_id|>"
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['prompt']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        else:
            sample["prompt"] = (
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['prompt']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        sample["completion"] = f"{sample['completion']}<|eot_id|>"
        return sample

    return transform_fn


def ultra(cfg, **kwargs):
    """
    for ultrafeedback binarized conversations
    ex: argilla/ultrafeedback-binarized-preferences-cleaned-kto
    """

    def transform_fn(sample):
        if "system" in sample and sample["system"]:
            sample["prompt"] = (
                f"<|start_header_id|>system<|end_header_id|>\n\n{sample['system']}<|eot_id|>"
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['prompt']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        else:
            sample["prompt"] = (
                f"<|start_header_id|>user<|end_header_id|>\n\n{sample['prompt']}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
            )
        sample["completion"] = f"{sample['completion']}<|eot_id|>"
        return sample

    return transform_fn


================================================
FILE: src/axolotl/prompt_strategies/kto/user_defined.py
================================================
"""
User-defined KTO strategies
"""


def default(cfg, dataset_idx=0, **kwargs):
    ds_cfg = cfg["datasets"][dataset_idx]["type"]
    if not isinstance(ds_cfg, dict):
        raise ValueError(
            f"User-defined dataset type must be a dictionary. Got: {ds_cfg}"
        )
    field_prompt = ds_cfg.get("field_prompt", "prompt")
    field_system = ds_cfg.get("field_system", "system")
    field_completion = ds_cfg.get("field_completion", "completion")
    field_label = ds_cfg.get("field_label", "label")
    prompt_format = ds_cfg.get("prompt_format")
    if not prompt_format:
        prompt_format = "{" + field_prompt + "}"
    completion_format = ds_cfg.get("completion_format")
    if not completion_format:
        chosen_format = "{" + field_completion + "}"

    def transform_fn(sample):
        if (
            "{" + field_system + "}" in prompt_format
            and field_system in sample
            and sample[field_system]
        ):
            sample["prompt"] = prompt_format.format(
                system=sample[field_system], prompt=sample[field_prompt]
            )
        else:
            sample["prompt"] = prompt_format.format(prompt=sample["prompt"])
        sample["completion"] = chosen_format.format(chosen=sample[field_completion])
        sample["label"] = sample[field_label]
        return sample

    return transform_fn


================================================
FILE: src/axolotl/prompt_strategies/llama2_chat.py
================================================
"""
Prompt Strategy for finetuning Llama2 chat models
see also https://github.com/facebookresearch/llama/blob/6c7fe276574e78057f917549435a2554000a876d/llama/generation.py#L213 for ma reference implementation.

This implementation is based on the Vicuna PR and the fastchat repo, see also:
https://github.com/lm-sys/FastChat/blob/cdd7730686cb1bf9ae2b768ee171bdf7d1ff04f3/fastchat/conversation.py#L847

Use dataset type: "llama2_chat" in conig.yml to use this prompt style.

E.g. in the config.yml:
```
datasets:
  - path: llama_finetune_train.jsonl
    type: llama2_chat
```

The dataset itself should look like this:
```
{'conversations':[{"from": "human", "value": "Who are you?"}, {"from": "gpt", "value": "I am Vicuna"},...]}
```
in a jsonl file. The first message should be from the human, the second from gpt.
For a custom system message, the first "from" can be "system" (followed by alternating "human" and "gpt" turns).

Important: Don't use "special_tokens:" in your config.yml if you are not sure what you are doing!
"""

from dataclasses import dataclass, field
from typing import Generator, List, Sequence

from axolotl.prompt_tokenizers import PromptTokenizingStrategy
from axolotl.prompters import ALTERNATING_ASSERTION_FAILED_ROLE, IGNORE_TOKEN_ID
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


@dataclass
class Llama2ChatConversation:
    """A class that manages prompt templates and keeps all conversation history.
    copied from https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py"""

    name: str = "llama2"
    # The system prompt
    system: str = (
        "[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. "
        "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. "
        "Please ensure that your responses are socially unbiased and positive in nature.\n\n"
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. "
        "If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n\n"
    )
    roles: Sequence[str] = ("[INST]", "[/INST]")
    messages: List[List[str]] = field(default_factory=list)
    offset: int = 0
    sep = " "
    sep2 = " </s><s>"
    stop_token_ids = [2]

    def get_prompt(self) -> str:
        """Get the prompt for generation."""
        seps = [self.sep, self.sep2]
        ret = ""
        for i, (role, message) in enumerate(self.messages):
            if (i == len(self.messages) - 1) and (role == self.roles[0]):
                # last message is from user (due to length),
                #  return prompt without it for training
                return ret
            if i == 0:
                ret += self.system + message.strip()
            else:
                ret += role + " " + message.strip() + seps[i % 2]
        return ret

    def append_message(self, role: str, message: str):
        """Append a new message."""
        self.messages.append([role, message])


class LLama2ChatTokenizingStrategy(PromptTokenizingStrategy):
    """
    Tokenizing strategy for Llama2 prompts.
    adapted from https://github.com/lm-sys/FastChat/blob/main/fastchat/train/train.py
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.tokenizer.add_special_tokens(
            {"pad_token": getattr(self.tokenizer, "pad_token", "<pad>")}
        )
        # https://huggingface.co/meta-llama/Llama-2-7b-chat-hf/blob/main/added_tokens.json

    def tokenize_prompt(self, prompt):
        conv = next(self.prompter.build_prompt(prompt))
        conversation_str = conv.get_prompt()

        # Tokenize conversations
        input_ids = self.tokenizer(
            conversation_str,
            return_tensors="pt",
            padding="max_length",
            max_length=self.sequence_len,
            truncation=True,
        ).input_ids[0]
        target = input_ids.clone()

        # Mask targets. Only compute loss on the assistant outputs.
        sep = conv.roles[1]

        total_len = int(target.ne(self.tokenizer.pad_token_id).sum())

        turns = conversation_str.split(conv.sep2)
        cur_len = 1
        target[:cur_len] = IGNORE_TOKEN_ID
        for turn in turns:
            if turn == "":
                break
            turn_len = len(self.tokenizer(turn).input_ids)

            parts = turn.split(sep)
            if len(parts) != 2:
                break
            parts[0] += sep
            # "-1" is hardcoded for the LLaMA tokenizer to make the offset correct.
            instruction_len = len(self.tokenizer(parts[0]).input_ids) - 1

            # Ignore the user instructions
            target[cur_len - 1 : cur_len + instruction_len] = IGNORE_TOKEN_ID
            cur_len += turn_len + 2  # due to length of role token

        target[cur_len:] = IGNORE_TOKEN_ID

        if cur_len < self.sequence_len:
            if cur_len != total_len:
                target[:] = IGNORE_TOKEN_ID
                LOG.warning(
                    f"WARNING: tokenization mismatch: {cur_len} vs. {total_len}."
                    f" (ignored)"
                )

        attention_mask = input_ids.ne(self.tokenizer.pad_token_id).tolist()
        input_ids = input_ids.tolist()
        target = target.tolist()
        # this is a fix for the tokenizer which tokenizes [ differently with eos tokens and
        # follows the original llama implementation
        for i in range(2, total_len - 2):
            if input_ids[i] == 29961:
                input_ids[i] = 518
            if target[i] == 29961:
                target[i] = 518
        return {
            "input_ids": input_ids,
            "labels": target,
            "attention_mask": attention_mask,
        }


class Llama2ChatPrompter:
    """
    A prompter that generates prompts for Llama2 models.
    """

    system_prompt = (
        "[INST] <<SYS>>\nYou are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. "
        "Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. "
        "Please ensure that your responses are socially unbiased and positive in nature.\n\n"
        "If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. "
        "If you don't know the answer to a question, please don't share false information.\n<</SYS>>\n\n"
    )

    def build_prompt(self, source) -> Generator[Llama2ChatConversation, None, None]:
        # see https://github.com/lm-sys/FastChat/blob/da0641e567cf93756b0978ab5a6b092e96f06240/fastchat/train/train.py#L78
        source = source["conversations"]  # fix data structure for datasets

        # if system prompt provided, use it
        if source[0]["from"] == "system":
            system = f"[INST] <<SYS>>\n{source[0]['value']}\n<</SYS>>\n\n"
            source = source[1:]
        else:
            system = self.system_prompt

        conv = Llama2ChatConversation(system=system)

        if len(source) < 2:
            # If there isn't a back and forth conversation, ignore it
            # also happens on the data splitting leaving empty conversations
            raise IndexError

        roles = {"human": conv.roles[0], "gpt": conv.roles[1]}

        if roles[source[0]["from"]] != conv.roles[0]:
            # Skip the first one if it is not from human
            source = source[1:]

        conv.messages = []
        for j, sentence in enumerate(source):
            role = roles[sentence["from"]]
            assert role == conv.roles[j % 2], ALTERNATING_ASSERTION_FAILED_ROLE
            if sentence["value"]:
                conv.append_message(role, sentence["value"])
        yield conv


def load(tokenizer, cfg) -> LLama2ChatTokenizingStrategy:
    return LLama2ChatTokenizingStrategy(
        Llama2ChatPrompter(),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


================================================
FILE: src/axolotl/prompt_strategies/messages/__init__.py
================================================
"""Module to load message prompt strategies."""

import importlib
import inspect

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def load(tokenizer, cfg, ds_cfg, processor=None):
    try:
        strategy = ds_cfg.get("input_transform", "chat")

        load_fn = "load"
        if strategy.split(".")[-1].startswith("load_"):
            load_fn = strategy.split(".")[-1]
            strategy = ".".join(strategy.split(".")[:-1])
        mod = importlib.import_module(
            f".{strategy}", "axolotl.prompt_strategies.messages"
        )
        func = getattr(mod, load_fn)
        load_kwargs = {}
        sig = inspect.signature(func)
        if "ds_cfg" in sig.parameters:
            load_kwargs["ds_cfg"] = ds_cfg
        if "processor" in sig.parameters:
            load_kwargs["processor"] = processor
        return func(tokenizer, cfg, **load_kwargs)
    except ModuleNotFoundError:
        return None
    except Exception as exc:
        LOG.error(f"Failed to load prompt strategy `{strategy}`: {str(exc)}")
        raise exc


================================================
FILE: src/axolotl/prompt_strategies/messages/chat.py
================================================
"""
Chat dataset wrapping strategy for new internal messages representations
"""

from typing import Any, Callable, Dict, Optional

from axolotl.core.datasets.chat import TokenizedChatDataset
from axolotl.core.datasets.transforms.chat_builder import chat_message_transform_builder
from axolotl.prompt_tokenizers import DatasetWrappingStrategy


class ChatMessageDatasetWrappingStrategy(DatasetWrappingStrategy):
    """
    Chat dataset wrapping strategy for new internal messages representations
    """

    def __init__(
        self,
        processor,
        message_transform=None,
        formatter=None,
        **kwargs,
    ):
        """
        :param processor: tokenizer or image processor
        :param kwargs:
        """
        self.processor = processor
        self.dataset = None
        self.message_transform = message_transform
        self.formatter = formatter

    def wrap_dataset(
        self,
        dataset,
        process_count: Optional[int] = None,
        keep_in_memory: Optional[bool] = False,
        **kwargs,
    ):
        self.dataset = TokenizedChatDataset(
            dataset,
            message_transform=self.message_transform,
            model_transform=self.processor,
            formatter=self.formatter,
            process_count=process_count,
            keep_in_memory=keep_in_memory,
        )
        return self.dataset


def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None):
    ds_cfg = ds_cfg or {}

    field_messages = ds_cfg.get("field_messages")
    message_property_mappings = ds_cfg.get("message_property_mappings")
    message_field_role = (
        message_property_mappings.get("role") if message_property_mappings else None
    )
    message_field_content = (
        message_property_mappings.get("content") if message_property_mappings else None
    )
    message_field_training = ds_cfg.get("message_field_training")

    builder_kwargs = {}
    if field_messages:
        builder_kwargs["conversations_field"] = field_messages
    if message_field_role:
        builder_kwargs["message_field_role"] = message_field_role
    if message_field_content:
        builder_kwargs["message_field_content"] = message_field_content
    if message_field_training:
        builder_kwargs["message_field_training"] = message_field_training

    chat_template = ds_cfg.get("chat_template", cfg.get("chat_template", "chatml"))

    def format_message(x):
        return x

    if chat_template == "chatml":
        from axolotl.core.chat.format.chatml import format_message  # noqa F811
    if chat_template.startswith("llama3"):
        from axolotl.core.chat.format.llama3x import format_message  # noqa F811
    message_transform: Callable = chat_message_transform_builder(
        train_on_inputs=ds_cfg.get("train_on_inputs", False),
        **builder_kwargs,
    )
    strategy = ChatMessageDatasetWrappingStrategy(
        tokenizer, message_transform=message_transform, formatter=format_message
    )

    return strategy


================================================
FILE: src/axolotl/prompt_strategies/metharme.py
================================================
"""Module containing the MetharmenPromptTokenizingStrategy and MetharmePrompter class"""

from typing import Tuple

from axolotl.prompt_tokenizers import InstructionPromptTokenizingStrategy
from axolotl.prompters import AlpacaPrompter
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

IGNORE_TOKEN_ID = -100


class MetharmePromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for the Metharme models
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        return (prompt["prompt"], "", prompt["generation"])

    def _tokenize(
        self,
        prompt: str,
        add_eos_token: bool = True,
        strip_bos_token: bool = False,
        num_eos_tokens: int = 3,
    ):
        result = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.sequence_len,
            padding=False,
            return_tensors=None,
        )
        if len(result["input_ids"]) == 0:
            LOG.warning("Tokenizer result is empty. You may want to audit your dataset")
        # If there's already an EOS token there, subtract from the number added
        if result["input_ids"][-1] == self.tokenizer.eos_token_id:
            num_eos_tokens -= 1

        if num_eos_tokens > 0 and add_eos_token and len(result["input_ids"]) > 0:
            for _ in range(num_eos_tokens):
                if len(result["input_ids"]) < self.sequence_len:
                    result["input_ids"].append(self.tokenizer.eos_token_id)
                    result["attention_mask"].append(1)

        if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
            result["input_ids"] = result["input_ids"][1:]
            result["attention_mask"] = result["attention_mask"][1:]

        result["labels"] = result["input_ids"].copy()
        return result


class MetharmePrompter(AlpacaPrompter):
    """
    Prompter for the Metharme models.
    """

    system_prompt = ""
    system_no_input_prompt = ""
    system_format = ""
    turn_format = "{instruction}"
    turn_no_input_format = "{instruction}"

    def __init__(self, *args, **kwargs):
        pass


def load(tokenizer, cfg):
    return MetharmePromptTokenizingStrategy(
        MetharmePrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len
    )


================================================
FILE: src/axolotl/prompt_strategies/orcamini.py
================================================
"""
Prompt Strategy for finetuning Orca Mini (v2) models
see also https://huggingface.co/psmathur/orca_mini_v2_7b for more information

Use dataset type: orcamini in conig.yml to use this prompt style.

Compared to the alpaca_w_system.open_orca dataset type,
this one specifies the system prompt with "### System:".

Not suited/tested for multiple-turn conversations without further adjustments.
"""

from typing import Generator, Union

from axolotl.prompt_strategies.alpaca_w_system import OpenOrcaPromptTokenizingStrategy
from axolotl.prompters import AlpacaPrompter


class OrcaMiniPrompter(AlpacaPrompter):
    """Adjusted Prompter for Orca Mini (v2) datasets"""

    def match_prompt_style(self):
        self.turn_no_input_format = (
            "### System:\n{system}\n\n### User:\n{instruction}\n\n### Response:\n"
        )

    def build_prompt_w_system(
        self,
        system: str,
        instruction: str,
        output: Union[None, str] = None,
    ) -> Generator[str, None, None]:
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        res = self.turn_no_input_format.format(system=system, instruction=instruction)
        if output:
            res = f"{res}{output}"
        yield res


def load(tokenizer, cfg):
    return OpenOrcaPromptTokenizingStrategy(
        OrcaMiniPrompter(),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )


================================================
FILE: src/axolotl/prompt_strategies/orpo/__init__.py
================================================
"""
module for ORPO style dataset transform strategies
"""

from functools import partial

from ..base import load as load_base

load = partial(load_base, module_base="axolotl.prompt_strategies.orpo")


================================================
FILE: src/axolotl/prompt_strategies/orpo/chat_template.py
================================================
"""chatml prompt tokenization strategy for ORPO"""

from typing import Any, Dict, Generator, List, Optional, Tuple

from pydantic import BaseModel

from axolotl.prompt_tokenizers import IGNORE_INDEX, PromptTokenizingStrategy
from axolotl.prompters import Prompter
from axolotl.utils.chat_templates import get_chat_template_from_config


class Message(BaseModel):
    """message/turn"""

    role: str
    content: str
    label: Optional[bool] = None


class MessageList(BaseModel):
    """conversation"""

    messages: List[Message]


def load(tokenizer, cfg, ds_cfg: Optional[Dict[str, Any]] = None, **kwargs):
    """
    chatml transforms for datasets with system, input, chosen, rejected
    """
    chat_template_string = get_chat_template_from_config(
        cfg=cfg, ds_cfg=ds_cfg, tokenizer=tokenizer
    )
    tokenizer.chat_template = chat_template_string

    return ORPOTokenizingStrategy(
        ORPOPrompter(chat_template_string, tokenizer),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
        dataset_parser=ORPODatasetParsingStrategy(),
    )


class ORPODatasetParsingStrategy:
    """Strategy to parse chosen rejected dataset into messagelist"""

    def get_chosen_conversation_thread(self, prompt) -> MessageList:
        """Dataset structure mappings"""

        messages: List[Message] = []
        if system := prompt.get("system", None):
            messages.append(Message(role="system", content=system, label=False))
        messages.append(
            Message(role="user", content=prompt["chosen"][0]["content"], label=False)
        )
        messages.append(
            Message(
                role="assistant", content=prompt["chosen"][1]["content"], label=True
            )
        )
        return MessageList(messages=messages)

    def get_rejected_conversation_thread(self, prompt) -> MessageList:
        """Dataset structure mappings"""

        messages: List[Message] = []
        if system := prompt.get("system", None):
            messages.append(Message(role="system", content=system, label=False))
        messages.append(
            Message(role="user", content=prompt["rejected"][0]["content"], label=False)
        )
        messages.append(
            Message(
                role="assistant", content=prompt["rejected"][1]["content"], label=True
            )
        )
        return MessageList(messages=messages)

    def get_prompt(self, prompt) -> MessageList:
        """Map the data to extract everything up to the last turn"""
        total_msg_len = len(prompt["chosen"])
        total_msg_turns, remainder = divmod(total_msg_len, 2)
        assert remainder == 0, "invalid number of turns"

        messages: List[Message] = []
        if system := prompt.get("system", None):
            messages.append(Message(role="system", content=system, label=False))
        for i in range(total_msg_turns):
            if "prompt" in prompt:
                messages.append(
                    Message(role="user", content=prompt["prompt"], label=False)
                )
            else:
                messages.append(
                    Message(
                        role="user",
                        content=prompt["chosen"][i * 2]["content"],
                        label=False,
                    )
                )
            if i < total_msg_turns - 1:
                messages.append(
                    Message(
                        role="assistant",
                        content=prompt["chosen"][i * 2 + 1]["content"],
                        label=False,
                    )
                )

        return MessageList(messages=messages)

    def get_chosen(self, prompt) -> MessageList:
        res = self.get_prompt(prompt)
        res.messages.append(
            Message(
                role="assistant", content=prompt["chosen"][-1]["content"], label=True
            )
        )
        return res

    def get_rejected(self, prompt) -> MessageList:
        res = self.get_prompt(prompt)
        res.messages.append(
            Message(
                role="assistant", content=prompt["rejected"][-1]["content"], label=True
            )
        )
        return res


class ORPOTokenizingStrategy(PromptTokenizingStrategy):
    """
    rejected_input_ids
    input_ids
    rejected_attention_mask
    attention_mask
    rejected_labels
    labels
    """

    def __init__(
        self,
        *args,
        dataset_parser=None,
        **kwargs,
    ):
        super().__init__(*args, **kwargs)
        self.dataset_parser = dataset_parser

    def tokenize_prompt(self, prompt):
        # pass the rejected prompt/row to the Prompter to get the formatted prompt
        prompt_len = 0
        rejected_message_list: MessageList = (
            self.dataset_parser.get_rejected_conversation_thread(prompt)
        )
        input_ids = []
        labels = []
        for _, (part, label) in enumerate(
            self.prompter.build_prompt(rejected_message_list)
        ):
            if not part:
                continue
            _input_ids = self.tokenizer.encode(part, add_special_tokens=False)
            prev_idx = len(input_ids)
            input_ids += _input_ids[prev_idx:]
            if label:
                labels += input_ids[prev_idx:]
            else:
                labels += [IGNORE_INDEX] * (len(input_ids) - prev_idx)
                prompt_len = len(input_ids)
        # remap the input_ids, attention_mask and labels
        rejected_input_ids = input_ids
        rejected_labels = labels
        # pass the chosen prompt/row to the Prompter to get the formatted prompt
        chosen_message_list: MessageList = (
            self.dataset_parser.get_chosen_conversation_thread(prompt)
        )
        input_ids = []
        labels = []
        for _, (part, label) in enumerate(
            self.prompter.build_prompt(chosen_message_list)
        ):
            if not part:
                continue
            _input_ids = self.tokenizer.encode(part, add_special_tokens=False)
            prev_idx = len(input_ids)
            input_ids += _input_ids[prev_idx:]
            if label:
                labels += input_ids[prev_idx:]
            else:
                labels += [IGNORE_INDEX] * (len(input_ids) - prev_idx)

        return {
            "rejected_input_ids": rejected_input_ids,
            "rejected_labels": rejected_labels,
            "rejected_attention_mask": [1] * len(rejected_labels),
            "input_ids": input_ids,
            "labels": labels,
            "attention_mask": [1] * len(labels),
            "prompt_attention_mask": [1] * prompt_len
            + [0] * (len(labels) - prompt_len),
        }


class ORPOPrompter(Prompter):
    """Single Turn prompter for ORPO"""

    def __init__(self, chat_template, tokenizer):
        self.chat_template = chat_template
        self.tokenizer = tokenizer

    def build_prompt(
        self,
        message_list: MessageList,
    ) -> Generator[Tuple[str, bool], None, None]:
        conversation = []
        for message in message_list.messages:
            conversation.append(message.model_dump())
            if message.role == "system":
                yield (
                    self.tokenizer.apply_chat_template(
                        conversation,
                        add_generation_prompt=False,
                        chat_template=self.chat_template,
                        tokenize=False,
                    ),
                    False,
                )
            if message.role == "user":
                yield (
                    self.tokenizer.apply_chat_template(
                        conversation,
                        add_generation_prompt=True,
                        chat_template=self.chat_template,
                        tokenize=False,
                    ),
                    False,
                )
            if message.role == "assistant":
                yield (
                    self.tokenizer.apply_chat_template(
                        conversation,
                        add_generation_prompt=False,
                        chat_template=self.chat_template,
                        tokenize=False,
                    ),
                    True,
                )


def argilla(cfg, **kwargs):
    dataset_parser = ORPODatasetParsingStrategy()

    def transform_fn(sample, tokenizer=None):
        res = {}

        chat_template_string = get_chat_template_from_config(
            cfg=cfg, tokenizer=tokenizer
        )

        res["prompt"] = tokenizer.apply_chat_template(
            [msg.model_dump() for msg in dataset_parser.get_prompt(sample).messages],
            add_generation_prompt=True,
            chat_template=chat_template_string,
            tokenize=False,
        )
        prompt_str_len = len(res["prompt"])
        res["chosen"] = tokenizer.apply_chat_template(
            [msg.model_dump() for msg in dataset_parser.get_chosen(sample).messages],
            add_generation_prompt=False,
            chat_template=chat_template_string,
            tokenize=False,
        )[prompt_str_len:]
        res["rejected"] = tokenizer.apply_chat_template(
            [msg.model_dump() for msg in dataset_parser.get_rejected(sample).messages],
            add_generation_prompt=False,
            chat_template=chat_template_string,
            tokenize=False,
        )[prompt_str_len:]

        return res

    return transform_fn


================================================
FILE: src/axolotl/prompt_strategies/pretrain.py
================================================
"""pretraining prompt strategies"""

from typing import Generator

from transformers import BatchEncoding

from axolotl.prompt_tokenizers import PromptTokenizingStrategy


class PretrainTokenizer:
    """basic tokenization class for pretraining"""

    def build_prompt(self, prompt) -> Generator[str, None, None]:
        yield prompt


class PretrainTokenizationStrategy(PromptTokenizingStrategy):
    """handles tokenization for pretraining with strides"""

    @property
    def supports_batched(self):
        return True

    def __init__(self, *args, max_length=None, text_column="text", **kwargs):
        super().__init__(*args, **kwargs)
        if max_length:
            self.max_length = max_length
        self.text_column = text_column

    def _tokenize(
        self, prompt: str, add_eos_token: bool = True, strip_bos_token: bool = False
    ) -> BatchEncoding:
        res = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.max_length - 1,
            add_special_tokens=True,
            return_overflowing_tokens=True,
            stride=256,
        )
        res["input_ids"] = [
            seq + [self.tokenizer.eos_token_id] for seq in res["input_ids"]
        ]
        res["attention_mask"] = [seq + [1] for seq in res["attention_mask"]]

        return res

    def tokenize_prompt(self, prompt):
        return self._tokenize(prompt[self.text_column])


def load(tokenizer, cfg):
    strat = PretrainTokenizationStrategy(
        PretrainTokenizer(),
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
        text_column=cfg.pretraining_dataset[0]["text_column"] or "text",
        max_length=cfg.sequence_len * 64,
    )
    return strat


================================================
FILE: src/axolotl/prompt_strategies/pygmalion.py
================================================
"""Module containing the PygmalionPromptTokenizingStrategy and PygmalionPrompter class"""

import copy
from collections import defaultdict
from typing import Generator, List, Tuple

from axolotl.prompt_tokenizers import (
    PromptTokenizingStrategy,
    parse_tokenized_to_result,
    tokenize_prompt_default,
)
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

IGNORE_TOKEN_ID = -100


class PygmalionPromptTokenizingStrategy(PromptTokenizingStrategy):
    """
    Tokenizing strategy for Pygmalion.
    """

    bot_prefix_token_ids: List[int] = []

    def __init__(self, prompter, tokenizer, *args, **kwargs):
        super().__init__(prompter, tokenizer, *args, **kwargs)
        res = self._tokenize("<|model|>", add_eos_token=False, strip_bos_token=True)
        self.bot_prefix_token_ids = res["input_ids"]

    def tokenize_prompt(self, prompt):
        result, current_len = tokenize_prompt_default()
        for _, part in enumerate(self.prompter.build_prompt(prompt["conversations"])):
            role, message = part
            if role == "system":
                prefix = "<|system|>"
                # this should include a bos token, no eos token, strip trailing "\n<START>"
                if message.endswith("\n<START>"):
                    message = message[:-8]
                res = self._tokenize(
                    prefix + "Persona: " + message.strip(),
                    add_eos_token=False,
                    strip_bos_token=False,
                )
                # everything from this is masked out from the labels
                labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
            elif role == "human":
                prefix = "<|user|>"
                res = self._tokenize(
                    prefix + " " + message.strip(),
                    add_eos_token=False,
                    strip_bos_token=True,
                )
                # everything from this is masked out from the labels
                labels = [IGNORE_TOKEN_ID] * len(res["input_ids"])
            elif role == "bot":
                prefix = "<|model|>"
                res = self._tokenize(
                    prefix + " " + message.strip(),
                    add_eos_token=True,
                    strip_bos_token=True,
                )
                # mask out the prefix token, rest is not masked out from labels
                # make sure we create the labels first, otherwise we get incorrect lengths
                labels = [IGNORE_TOKEN_ID] * len(self.bot_prefix_token_ids) + [
                    *copy.deepcopy(res["input_ids"])
                ][len(self.bot_prefix_token_ids) :]
            else:
                LOG.warning(f"unknown role in conversation: {role}")
                res = defaultdict(lambda: [])

            result, current_len = parse_tokenized_to_result(
                result,
                current_len,
                res,
                labels,
                pad_token_id=self.tokenizer.pad_token_id,
            )
        return result


class PygmalionPrompter:
    """
    Prompter for Pygmalion.
    """

    def __init__(self, *args, **kwargs):
        pass

    def build_prompt(
        self,
        source,
        *args,
        **kwargs,
    ) -> Generator[Tuple[str, str], None, None]:
        for msg in source:
            yield msg["role"], msg["value"]


def load(tokenizer, cfg):
    return PygmalionPromptTokenizingStrategy(
        PygmalionPrompter(), tokenizer, cfg.train_on_inputs, cfg.sequence_len
    )


================================================
FILE: src/axolotl/prompt_strategies/stepwise_supervised.py
================================================
"""
Module for stepwise datasets, typically including a prompt and reasoning traces,
and (optionally) per-step, or per-prompt-trace labels for reward modelling.
"""

from itertools import chain
from typing import Dict, List, Optional, Union

from transformers import BatchEncoding, PreTrainedTokenizer

from axolotl.prompt_tokenizers import IGNORE_INDEX
from axolotl.utils.dict import DictDefault


class StepwiseSupervisedPromptTokenizingStrategy:
    """
    Tokenizing strategy for supervised stepwise datasets, typically used for COT-reasoning.
    These datasets should include the following columns:
    - prompt: the prompt text
    - completions: a list of `n` completion steps
    - labels: a list of `n` labels indicating the "correctness" of each step
    """

    def __init__(
        self,
        tokenizer,
        sequence_len: int = 2048,
        step_separator: str = "\n",
        max_completion_length: Optional[int] = None,
        train_on_last_step_only: bool = False,
    ):
        self.tokenizer = tokenizer
        self.sequence_len = sequence_len
        self.step_separator = step_separator
        self.max_completion_length = max_completion_length
        self.train_on_last_step_only = train_on_last_step_only

    def tokenize_prompt(
        self, prompt: Dict[str, Union[str, List[str]]]
    ) -> BatchEncoding:
        # Inspired by TRL's PRMTRainer
        # https://github.com/huggingface/trl/blob/ed7de87dc766478c024b68f12530d1b0e7c3ff23/trl/trainer/prm_trainer.py#L206
        prompt_ids = self.tokenizer(prompt["prompt"], add_special_tokens=False)[
            "input_ids"
        ]

        completions_ids = [
            self.tokenizer(completion, add_special_tokens=False)["input_ids"]
            for completion in prompt["completions"]
        ]

        # Handle labels
        if self.train_on_last_step_only:
            labels = [IGNORE_INDEX] * (len(prompt["labels"]) - 1) + [
                int(prompt["labels"][-1])
            ]
        else:
            labels = [int(label) for label in prompt["labels"]]

        # Add step separators
        separator_ids = self.tokenizer.encode(
            self.step_separator, add_special_tokens=False
        )
        completions_ids = [completion + separator_ids for completion in completions_ids]

        # Create step-wise labels
        labels = [
            [IGNORE_INDEX] * (len(completion) - 1) + [label]  # type: ignore
            for completion, label in zip(completions_ids, labels, strict=False)
        ]

        # Join all steps
        completion_ids = list(chain(*completions_ids))
        labels = list(chain(*labels))  # type: ignore

        # Handle max lengths
        if self.max_completion_length:
            completion_ids = completion_ids[: self.max_completion_length]
            labels = labels[: self.max_completion_length]

        # Add BOS token if model has one
        if self.tokenizer.bos_token_id is not None:
            prompt_ids = [self.tokenizer.bos_token_id] + prompt_ids

        # Combine prompt and completion
        input_ids = prompt_ids + completion_ids

        full_labels = [IGNORE_INDEX] * len(prompt_ids) + labels
        # Apply max sequence length
        if self.sequence_len:
            input_ids = input_ids[: self.sequence_len]
            full_labels = full_labels[: self.sequence_len]

        return {
            "input_ids": input_ids,
            "labels": full_labels,
            "attention_mask": [1] * len(input_ids),
        }

    @property
    def supports_batched(self):
        return False


def load(
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    ds_cfg: DictDefault,
) -> StepwiseSupervisedPromptTokenizingStrategy:
    return StepwiseSupervisedPromptTokenizingStrategy(
        tokenizer,
        cfg.sequence_len,
        step_separator=ds_cfg.get("step_separator", "\n"),
        max_completion_length=ds_cfg.max_completion_length,
        train_on_last_step_only=ds_cfg.get("train_on_last_step_only", False),
    )


================================================
FILE: src/axolotl/prompt_strategies/user_defined.py
================================================
"""
User Defined prompts with configuration from the YML config
"""

from dataclasses import dataclass
from functools import partial
from typing import Optional, Tuple

from axolotl.prompt_strategies.alpaca_w_system import (
    InstructionWSystemPromptTokenizingStrategy,
    SystemDataPrompter,
)


@dataclass
class UserDefinedDatasetConfig:
    """
    dataclass configuration representing a userdefined dataset type
    """

    system_prompt: str = ""
    field_system: str = "system"
    field_instruction: str = "instruction"
    field_input: str = "input"
    field_output: str = "output"
    format: str = "{instruction} {input} "
    no_input_format: str = "{instruction} "
    system_format: str = "{system}"

    def __getitem__(self, item):
        return getattr(self, item)


class UserDefinedPromptTokenizationStrategy(InstructionWSystemPromptTokenizingStrategy):
    """
    Prompt Tokenization Strategy for user defined prompts
    """


def load(tokenizer, cfg, ds_cfg: Optional[UserDefinedDatasetConfig] = None):
    if not ds_cfg:
        raise ValueError("Missing dataset prompt configuration")

    system_prompt = ""
    if ds_cfg.system_prompt:
        system_prompt = ds_cfg.system_prompt

    def parse_instruction_fields(
        field_instruction,
        field_input,
        field_output,
        field_system,
        system_prompt,
        prompt,
    ) -> Tuple[str, str, str, str]:
        return (
            prompt[field_instruction],
            prompt[field_input] if field_input in prompt else "",
            prompt[field_output] if field_output in prompt else "",
            prompt[field_system] if field_system in prompt else system_prompt,
        )

    turn_format = ds_cfg.format
    turn_no_input_format = ds_cfg.no_input_format
    system_format = ds_cfg.system_format

    class UserDefinedPrompter(SystemDataPrompter):
        """
        Prompter for user defined prompts
        """

        def match_prompt_style(self):
            self.turn_format = turn_format
            self.turn_no_input_format = turn_no_input_format
            self.system_format = system_format

    prompter = UserDefinedPrompter()

    strat = UserDefinedPromptTokenizationStrategy(
        prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )

    strat.parse_instruction_fields = partial(  # type: ignore[method-assign]
        parse_instruction_fields,
        ds_cfg.field_instruction,
        ds_cfg.field_input,
        ds_cfg.field_output,
        ds_cfg.field_system,
        system_prompt,
    )
    return strat


================================================
FILE: src/axolotl/prompt_tokenizers.py
================================================
"""Module containing PromptTokenizingStrategy and Prompter classes"""

import abc
from typing import Callable, Dict, List, Optional, Tuple, Union

from datasets import Dataset
from transformers import BatchEncoding, PreTrainedTokenizer

from axolotl.prompters import Prompter
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

IGNORE_INDEX = -100
LLAMA_DEFAULT_PAD_TOKEN = "<pad>"  # nosec
LLAMA_DEFAULT_EOS_TOKEN = "</s>"  # nosec
LLAMA_DEFAULT_BOS_TOKEN = "<s>"  # nosec
LLAMA_DEFAULT_UNK_TOKEN = "<unk>"  # nosec


class InvalidDataException(Exception):
    """
    Exception raised when the data is invalid
    """


class DatasetWrappingStrategy(abc.ABC):
    """
    Abstract class for wrapping datasets for Chat Messages
    """

    @abc.abstractmethod
    def wrap_dataset(
        self,
        dataset,
        process_count: int | None = None,
        keep_in_memory: bool | None = False,
        **kwargs,
    ) -> Dataset:
        pass


class PromptTokenizingStrategy(abc.ABC):
    """
    Abstract class for tokenizing strategies
    """

    filter_rows: Optional[Callable] = None

    def __init__(
        self,
        prompter: Prompter,
        tokenizer,
        train_on_inputs: bool = False,
        sequence_len: int = 2048,
    ):
        self.prompter = prompter
        self.tokenizer: PreTrainedTokenizer = tokenizer
        self.train_on_inputs = train_on_inputs
        # sequence_len and max_length can be different for CompletionPromptTokenizingStrategy.
        # TODO: Document how they are different.
        self.sequence_len = sequence_len
        self.max_length = sequence_len

    @abc.abstractmethod
    def tokenize_prompt(self, prompt):
        pass

    @property
    def supports_batched(self):
        return False

    def _tokenize(
        self, prompt: str, add_eos_token: bool = True, strip_bos_token: bool = False
    ) -> BatchEncoding:
        empty = BatchEncoding(data={"input_ids": [], "attention_mask": []})
        if not prompt:
            LOG.warning_once("Empty text requested for tokenization.")
            return empty

        result = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.max_length,
            padding=False,
            return_tensors=None,
        )
        if len(result["input_ids"]) == 0:
            LOG.warning("Tokenizer result is empty. You may want to audit your dataset")
            return empty

        if (
            result["input_ids"][-1] != self.tokenizer.eos_token_id
            and len(result["input_ids"]) < self.max_length
            and add_eos_token
        ):
            result["input_ids"].append(self.tokenizer.eos_token_id)
            result["attention_mask"].append(1)

        if result["input_ids"][0] == self.tokenizer.bos_token_id and strip_bos_token:
            result["input_ids"] = result["input_ids"][1:]
            result["attention_mask"] = result["attention_mask"][1:]

        result["labels"] = result["input_ids"].copy()
        return result


class InstructionPromptTokenizingStrategy(PromptTokenizingStrategy):
    """
    Tokenizing strategy for instruction-based prompts.
    """

    def parse_instruction_fields(
        self, prompt
    ) -> Union[Tuple[str, str, str], Tuple[str, str, str, str]]:
        raise NotImplementedError

    def tokenize_prompt(self, prompt):
        (
            instruction,
            input,
            response,
        ) = self.parse_instruction_fields(prompt)
        user_prompt = next(
            iter(
                self.prompter.build_prompt(
                    instruction,
                    input,
                )
            )
        )
        tokenized_prompt = self._tokenize(user_prompt, add_eos_token=False)
        if not self.train_on_inputs:
            user_prompt_len = len(tokenized_prompt["input_ids"])
            # TODO this could be sped up using numpy array slicing
            tokenized_prompt["labels"] = [IGNORE_INDEX] * user_prompt_len
        tokenized_res_prompt = self._tokenize(
            response, strip_bos_token=True, add_eos_token=True
        )
        tokenized_prompt["input_ids"] += tokenized_res_prompt["input_ids"]
        tokenized_prompt["attention_mask"] += tokenized_res_prompt["attention_mask"]
        tokenized_prompt["labels"] += tokenized_res_prompt["input_ids"]

        return tokenized_prompt

    def _build_full_prompt(
        self,
        instruction,
        input,
        response,
    ):
        return next(
            iter(
                self.prompter.build_prompt(
                    instruction,
                    input,
                    response,
                )
            )
        )


class AlpacaPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for Alpaca prompts.
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        return (
            prompt["instruction"],
            prompt["input"] if "input" in prompt else "",
            prompt["output"],
        )


class AlpacaMultipleChoicePromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for Alpaca Multiple Choice prompts.
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        return (
            prompt["question"],
            "\n".join(f'- "{choice}"' for choice in prompt["choices"]),
            prompt["solution"] if "solution" in prompt else prompt["explanation"],
        )


class JeopardyPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for Jeopardy prompts.
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        return (
            prompt["question"],
            prompt["category"],
            "what is " + prompt["answer"],
        )


class OpenAssistantPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for OpenAssistant prompts.
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        return (
            prompt["INSTRUCTION"],
            "",
            prompt["RESPONSE"],
        )


class SummarizeTLDRPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for SummarizeTLDR prompts.
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        return (
            prompt["article"],
            "",
            prompt["summary"],
        )


class GPTeacherPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for GPTeacher prompts.
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        return (
            prompt["instruction"],
            prompt["input"] if "input" in prompt else "",
            prompt["response"],
        )


class NomicGPT4AllPromptTokenizingStrategy(InstructionPromptTokenizingStrategy):
    """
    Tokenizing strategy for NomicGPT4All prompts.
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str]:
        return (
            prompt["prompt"],
            "",
            prompt["response"],
        )


class ReflectionPromptTokenizingStrategy(PromptTokenizingStrategy):
    """
    Tokenizing strategy for Reflection prompts.
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str, str]:
        raise NotImplementedError

    def tokenize_prompt(self, prompt):
        (
            instruction,
            input,
            output,
            reflection,
            corrected,
        ) = self.parse_instruction_fields(prompt)
        full_prompt = self._build_full_prompt(
            instruction, input, output, reflection, corrected
        )
        tokenized_full_prompt = self._tokenize(full_prompt)
        if not self.train_on_inputs:
            user_prompt = next(
                iter(
                    self.prompter.build_prompt(
                        instruction,
                        input,
                    )
                )
            )
            tokenized_user_prompt = self._tokenize(user_prompt, add_eos_token=False)
            user_prompt_len = len(tokenized_user_prompt["input_ids"])
            # TODO this could be sped up using numpy array slicing
            tokenized_full_prompt["labels"] = [
                IGNORE_INDEX
            ] * user_prompt_len + tokenized_full_prompt["labels"][user_prompt_len:]

        return tokenized_full_prompt

    def _build_full_prompt(self, instruction, input, output, reflection, corrected):
        return next(
            iter(
                self.prompter.build_prompt(
                    instruction,
                    input,
                    output,
                    reflection,
                    corrected,
                )
            )
        )

    def _tokenize(self, prompt, add_eos_token=True, strip_bos_token=False):
        result = self.tokenizer(
            prompt,
            truncation=True,
            max_length=self.sequence_len,
            padding=False,
            return_tensors=None,
        )
        if (
            result["input_ids"][-1] != self.tokenizer.eos_token_id
            and len(result["input_ids"]) < self.sequence_len
            and add_eos_token
        ):
            result["input_ids"].append(self.tokenizer.eos_token_id)
            result["attention_mask"].append(1)

        result["labels"] = result["input_ids"].copy()
        return result


class AlpacaReflectionPTStrategy(ReflectionPromptTokenizingStrategy):
    """
    Tokenizing strategy for Alpaca Reflection prompts.
    """

    def parse_instruction_fields(self, prompt) -> Tuple[str, str, str, str, str]:
        return (
            prompt["instruction"],
            prompt["input"] if "input" in prompt else "",
            prompt["output"],
            prompt["reflection"],
            prompt["corrected"],
        )


def tokenize_prompt_default() -> Tuple[Dict[str, List[int]], int]:
    """
    Returns the default values for the tokenize prompt function
    """

    result: Dict[str, List[int]] = {
        "input_ids": [],
        "attention_mask": [],
        "labels": [],
    }
    current_len = 0
    return result, current_len


def parse_tokenized_to_result(
    result: Dict[str, List[int]],
    current_len: int,
    res: Dict[str, List[int]],
    labels: List[int],
    pad_token_id: Union[int, None] = None,
) -> Tuple[Dict[str, List[int]], int]:
    """
    Parses the tokenized prompt and append the tokenized input_ids, attention_mask and labels to the result
    """

    input_ids = res["input_ids"]
    input_len = len(input_ids)
    result["input_ids"][current_len : current_len + input_len] = input_ids
    result["attention_mask"][current_len : current_len + input_len] = [
        1 if x != pad_token_id else 0 for x in input_ids
    ]
    result["labels"][current_len : current_len + input_len] = labels
    current_len += input_len

    return result, current_len


================================================
FILE: src/axolotl/prompters.py
================================================
"""Module containing prompters"""

from enum import Enum
from typing import Generator, Optional, Union

from colorama import Fore

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)
IGNORE_TOKEN_ID = -100
REPR_TEMPLATE = "\n<start>\n" + Fore.CYAN + "{full_prompt}" + Fore.RESET + "\n<end>\n"


class PromptStyle(Enum):
    """
    Enum for prompt styles
    """

    INSTRUCT = "instruct"
    CHAT = "chat"
    CHATML = "chatml"
    PHI = "phi"


class Prompter:
    """
    Base prompter class for all prompters
    """


class AlpacaPrompter(Prompter):
    """
    Base class for alpaca prompters
    """

    system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request."
    system_no_input_prompt = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
    system_format: str = "{system}"
    turn_format: str
    turn_no_input_format: str
    prompt_style: Optional[str] = None

    def __init__(self, prompt_style: Optional[str] = PromptStyle.INSTRUCT.value):
        self.prompt_style = prompt_style if prompt_style else PromptStyle.INSTRUCT.value
        self.match_prompt_style()

    def match_prompt_style(self):
        if self.prompt_style == PromptStyle.INSTRUCT.value:
            self.turn_format = "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
            self.turn_no_input_format = (
                "### Instruction:\n{instruction}\n\n### Response:\n"
            )
            self.system_format = "{system}\n\n"
        elif self.prompt_style == PromptStyle.CHAT.value:
            self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
            self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"
            self.system_format = "SYSTEM: {system}\n"
        elif self.prompt_style == PromptStyle.CHATML.value:
            self.turn_format = "<|im_start|>user\n{instruction}\n{input}<|im_end|>\n<|im_start|>assistant\n"
            self.turn_no_input_format = (
                "<|im_start|>user\n{instruction}<|im_end|>\n<|im_start|>assistant\n"
            )
            self.system_format = "<|im_start|>system\n{system}<|im_end|>\n"
        elif self.prompt_style == PromptStyle.PHI.value:
            self.turn_format = "<|user|>\n{instruction}<|end|>{input}<|assistant|>"
            self.turn_no_input_format = (
                "<|user|>\n{instruction}<|end|>\n<|assistant|>\n"
            )
            self.system_format = "<|system|>\n{system}<|end|>\n"

    def _build_result(self, instruction, input_text, output):
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        if input_text:
            res = (
                self.system_format.format(system=self.system_prompt)
                if self.system_prompt
                else ""
            ) + self.turn_format.format(instruction=instruction, input=input_text)
        else:
            res = (
                self.system_format.format(system=self.system_no_input_prompt)
                if self.system_no_input_prompt
                else ""
            ) + self.turn_no_input_format.format(instruction=instruction)
        if output:
            res = f"{res}{output}"

        return res

    def build_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        output: Union[None, str] = None,
    ) -> Generator[str, None, None]:
        yield self._build_result(instruction, input, output)

    def __repr__(self) -> str:
        return REPR_TEMPLATE.format(
            full_prompt=self._build_result("{instruction}", "{input}", "{output}")
        )


class UnpromptedPrompter(AlpacaPrompter):
    """
    Prompter for alpaca no system prompt
    """

    system_prompt = ""
    system_no_input_prompt = ""


class JeopardyPrompter(AlpacaPrompter):
    """
    Prompter for Jeopardy
    """

    prompt_input = "Below is a Jeopardy clue paired with input providing the category of the clue. Write a concise response that best answers tbe clue given the category.\n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"


class MultipleChoiceExplainPrompter(AlpacaPrompter):
    """
    Prompter for multiple choice explain
    """

    system_prompt = (
        "Choose the answer that best answers the question. Explain your reasoning.\n"
    )
    system_no_input_prompt = (
        "Choose the answer that best answers the question. Explain your reasoning.\n"
    )


class MultipleChoiceConcisePrompter(AlpacaPrompter):
    """
    Prompter for multiple choice concise
    """

    system_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n"
    system_no_input_prompt = "Choose the answer that best answers the question. Be concise in your response.\n\n"

    def match_prompt_style(self):
        self.turn_format = "USER: {instruction}\n{input}\nASSISTANT:"
        self.turn_no_input_format = "USER: {instruction}\nASSISTANT:"


class SummarizeTLDRPrompter(AlpacaPrompter):
    """
    Prompter for summarize TLDR
    """

    system_prompt = ""
    system_no_input_prompt = ""

    def match_prompt_style(self):
        self.turn_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\n{input}\nASSISTANT:"
        self.turn_no_input_format = "USER: Summarize the following article as a TL;DR.\n{instruction}\nASSISTANT:"


class GPTeacherPrompter(AlpacaPrompter):
    """
    Prompter for GPTeacher
    """


class NomicGPT4AllPrompter(AlpacaPrompter):
    """
    Prompter for NomicGPT4All
    """


class ReflectAlpacaPrompter(Prompter):
    """
    Prompter for ReflectAlpaca
    """

    system_prompt = "Below is an instruction that describes a task, paired with an input that provides further context. You, the Assistant, should generate a response as if it were an abstract for an academic or technical paper on the query along with a methodology. Then generate an Agent Reflection where you create a long form response as if from subject matter expert, be verbose, diligent, and creative in your application of knowledge, apply it through the lens of the response generated by the assistant. Look for flawed reasoning, faulty logic, or other mistakes in the method. Finally, generate a final response and method for the user with the Assistant abstract and Reflection analysis as augmentations to the generation\n\n"
    system_no_input_prompt = "Below is an instruction that describes a task. You, the Assistant, should generate a response as if it were an abstract for an academic or technical paper on the query along with a methodology. Then generate an Agent Reflection where you create a long form response as if from subject matter expert, be verbose, diligent, and creative in your application of knowledge, apply it through the lens of the response generated by the assistant. Look for flawed reasoning, faulty logic, or other mistakes in the method. Finally, generate a final response and method for the user with the Assistant abstract and Reflection analysis as augmentations to the generation\n\n"

    prompt_input = (
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
    )
    prompt_no_input = "### Instruction:\n{instruction}\n\n### Response:\n"
    agent_label = "### Thought:\n{output}\n\n### Agent Reflection:\n{reflection}\n\n### Final Response:\n{corrected}"
    response_split = "### Response:"

    def __init__(self, prompt_style="instruct"):
        self.prompt_style = prompt_style
        self.match_prompt_style()

    def match_prompt_style(self):
        if self.prompt_style == PromptStyle.INSTRUCT.value:
            self.prompt_input = (
                self.system_prompt
                + "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n"
            )
            self.prompt_no_input = (
                self.system_no_input_prompt
                + "### Instruction:\n{instruction}\n\n### Response:\n"
            )
            self.agent_label = "### Thought:\n{output}\n\n### Agent Reflection:\n{reflection}\n\n### Final Response:\n{corrected}"
            self.response_split = "### Final Response:"
        if self.prompt_style == PromptStyle.CHAT.value:
            self.prompt_input = (
                self.system_prompt + "USER: {instruction}\n{input}\nASSISTANT:"
            )
            self.prompt_no_input = (
                self.system_no_input_prompt + "USER: {instruction}\nASSISTANT:"
            )
            self.agent_label = (
                "\nTHOUGHT: {output}\nASSISTANT REFLECTION: {reflection}\nASSISTANT:"
            )
            self.response_split = "ASSISTANT:"

    def _build_result(
        self,
        instruction: str,
        input: Union[None, str] = None,
        output: Union[None, str] = None,
        reflection: Union[None, str] = None,
        corrected: Union[None, str] = None,
    ):
        # returns the full prompt from instruction and optional input
        # if a label (=response, =output) is provided, it's also appended.
        if input:
            res = self.prompt_input.format(instruction=instruction, input=input)
        else:
            res = self.prompt_no_input.format(instruction=instruction)
        if output and reflection and corrected:
            label = self.agent_label.format(
                output=output,
                reflection=reflection,
                corrected=corrected,
            )
            res = f"{res}{label}"

        return res

    def build_prompt(
        self,
        instruction: str,
        input: Union[None, str] = None,
        output: Union[None, str] = None,
        reflection: Union[None, str] = None,
        corrected: Union[None, str] = None,
    ) -> Generator[str, None, None]:
        yield self._build_result(
            instruction,
            input,
            output,
            reflection,
            corrected,
        )

    def __repr__(self) -> str:
        return REPR_TEMPLATE.format(
            full_prompt=self._build_result("{instruction}", "{input}", "{output}")
        )


ALTERNATING_ASSERTION_FAILED_ROLE = (
    "Role did not alternate between turns (gpt and human). Please check your data."
)


class UnsupportedPrompter(Prompter):
    """
    A dummy class for custom prompters
    """

    def __init__(self) -> None:
        pass

    def __repr__(self):
        return "Pre-tokenized or custom dataset types are unsupported for logging"


================================================
FILE: src/axolotl/scripts/__init__.py
================================================


================================================
FILE: src/axolotl/scripts/vllm_serve_lora.py
================================================
"""vLLM serve script with native LoRA adapter support.

Extends TRL's vllm_serve to enable direct LoRA adapter loading in vLLM,
instead of merging adapter weights into the base model before syncing.

Usage:
    Set ``vllm.serve_module: axolotl.scripts.vllm_serve_lora`` in your config,
    or ``trl.vllm_lora_sync: true`` to auto-select.

Benefits over merge-sync:
    - Syncs only LoRA adapter weights via filesystem instead of full merged model via NCCL
    - vLLM handles LoRA application natively (Punica kernels)
    - No NCCL communicator needed for weight sync
"""

import logging
import os
from contextlib import asynccontextmanager
from dataclasses import dataclass, field
from itertools import chain
from multiprocessing import Pipe, Process
from multiprocessing.connection import Connection
from typing import Any

from trl.scripts.vllm_serve import (
    ScriptArguments,
    chunk_list,
    extract_logprobs,
    get_open_port,
)
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

logger = logging.getLogger(__name__)


@dataclass
class LoRAScriptArguments(ScriptArguments):
    """Extended script arguments with LoRA support."""

    enable_lora: bool = field(
        default=True,
        metadata={"help": "Enable LoRA adapter support in vLLM."},
    )
    max_lora_rank: int = field(
        default=64,
        metadata={"help": "Maximum LoRA rank supported."},
    )
    max_loras: int = field(
        default=2,
        metadata={"help": "Maximum number of LoRA adapters loaded simultaneously."},
    )
    lora_dtype: str = field(
        default="bfloat16",
        metadata={"help": "Data type for LoRA weights."},
    )


def llm_worker(
    script_args: LoRAScriptArguments,
    data_parallel_rank: int,
    master_port: int,
    connection: Connection,
) -> None:
    """Worker process that creates a vLLM LLM with LoRA enabled."""
    os.environ["VLLM_DP_RANK"] = str(data_parallel_rank)
    os.environ["VLLM_DP_RANK_LOCAL"] = str(data_parallel_rank)
    os.environ["VLLM_DP_SIZE"] = str(script_args.data_parallel_size)
    os.environ["VLLM_DP_MASTER_PORT"] = str(master_port)

    llm = LLM(
        model=script_args.model,
        revision=script_args.revision,
        tensor_parallel_size=script_args.tensor_parallel_size,
        gpu_memory_utilization=script_args.gpu_memory_utilization,
        enforce_eager=script_args.enforce_eager,
        dtype=script_args.dtype,
        enable_prefix_caching=script_args.enable_prefix_caching,
        kv_cache_dtype=script_args.kv_cache_dtype,
        max_model_len=script_args.max_model_len,
        # Use batch-capable worker extension (adds batch_update_named_params + auto-close)
        worker_extension_cls="axolotl.scripts.vllm_worker_ext.BatchWeightSyncWorkerExtension",
        trust_remote_code=script_args.trust_remote_code,
        model_impl=script_args.vllm_model_impl,
        logprobs_mode="processed_logprobs",
        # LoRA
        enable_lora=script_args.enable_lora,
        max_lora_rank=script_args.max_lora_rank,
        max_loras=script_args.max_loras,
        lora_dtype=script_args.lora_dtype,
    )

    connection.send({"status": "ready"})

    while True:
        try:
            command = connection.recv()
        except KeyboardInterrupt:
            llm.collective_rpc(method="close_communicator")
            break

        if command["type"] in ["call", "fire_and_forget"]:
            method_name = command["method"]
            args = command.get("args", ())
            kwargs = command.get("kwargs", {})

            # Reconstruct LoRARequest from serialized dict (can't pickle across pipe)
            if "lora_request" in kwargs and kwargs["lora_request"] is not None:
                lr = kwargs["lora_request"]
                kwargs["lora_request"] = LoRARequest(
                    lora_name=lr["lora_name"],
                    lora_int_id=lr["lora_int_id"],
                    lora_path=lr["lora_path"],
                    load_inplace=lr.get("load_inplace", False),
                )

            method = getattr(llm, method_name)
            result = method(*args, **kwargs)
            if command["type"] == "call":
                connection.send(result)
        elif command["type"] == "shutdown":
            break


def main(script_args: ScriptArguments):
    """Start vLLM workers with LoRA support and the HTTP server."""
    import asyncio

    import uvicorn
    from fastapi import FastAPI
    from pydantic import BaseModel, Field as PydanticField

    # Request/Response models (defined locally like TRL's vllm_serve.main)
    class GenerateRequest(BaseModel):
        prompts: list[str]
        images: list[str] | None = None
        n: int = 1
        repetition_penalty: float = 1.0
        temperature: float = 1.0
        top_p: float = 1.0
        top_k: int = -1
        min_p: float = 0.0
        max_tokens: int = 16
        logprobs: int | None = 0
        truncate_prompt_tokens: int | None = None
        structured_outputs_regex: str | None = None
        generation_kwargs: dict = PydanticField(default_factory=dict)

    class GenerateResponse(BaseModel):
        prompt_ids: list[list[int]]
        completion_ids: list[list[int]]
        logprobs: list[list[list[float]]]
        logprob_token_ids: list[list[list[int]]]

    class ChatRequest(BaseModel):
        messages: list[list[dict]]
        n: int = 1
        repetition_penalty: float = 1.0
        temperature: float = 1.0
        top_p: float = 1.0
        top_k: int = -1
        min_p: float = 0.0
        max_tokens: int = 16
        logprobs: int | None = 0
        truncate_prompt_tokens: int | None = None
        structured_outputs_regex: str | None = None
        generation_kwargs: dict = PydanticField(default_factory=dict)
        chat_template_kwargs: dict = PydanticField(default_factory=dict)

    class ChatResponse(BaseModel):
        prompt_ids: list[list[int]]
        completion_ids: list[list[int]]
        logprobs: list[list[list[float]]]
        logprob_token_ids: list[list[list[int]]]

    class InitCommunicatorRequest(BaseModel):
        host: str
        port: int
        world_size: int
        client_device_uuid: str

    # Wrap plain ScriptArguments with LoRA defaults
    if not isinstance(script_args, LoRAScriptArguments):
        lora_args = LoRAScriptArguments.__new__(LoRAScriptArguments)
        for f in ScriptArguments.__dataclass_fields__:
            setattr(lora_args, f, getattr(script_args, f))
        # Apply LoRA defaults
        for f in LoRAScriptArguments.__dataclass_fields__:
            if f not in ScriptArguments.__dataclass_fields__:
                setattr(
                    lora_args, f, LoRAScriptArguments.__dataclass_fields__[f].default
                )
        script_args = lora_args

    # Spawn workers
    master_port = get_open_port()
    connections: list[Connection] = []
    processes: list[Process] = []
    for dp_rank in range(script_args.data_parallel_size):
        parent_conn, child_conn = Pipe()
        process = Process(
            target=llm_worker,
            args=(script_args, dp_rank, master_port, child_conn),
        )
        process.start()
        connections.append(parent_conn)
        processes.append(process)

    @asynccontextmanager
    async def lifespan(app: FastAPI):
        import time

        startup_timeout = 300  # 5 minutes
        start_time = time.monotonic()
        ready: set[int] = set()
        while len(ready) < script_args.data_parallel_size:
            elapsed = time.monotonic() - start_time
            if elapsed > startup_timeout:
                raise RuntimeError(
                    f"vLLM workers failed to start within {startup_timeout}s "
                    f"({len(ready)}/{script_args.data_parallel_size} ready)"
                )
            for i, (conn, proc) in enumerate(zip(connections, processes, strict=True)):
                if id(conn) in ready:
                    continue
                if not proc.is_alive():
                    raise RuntimeError(
                        f"vLLM worker {i} exited unexpectedly during startup"
                    )
                if conn.poll():
                    msg = conn.recv()
                    if isinstance(msg, dict) and msg.get("status") == "ready":
                        ready.add(id(conn))
            await asyncio.sleep(0.1)
        yield
        for p in processes:
            p.join(timeout=10)
            if p.is_alive():
                p.terminate()
                p.join()

    app = FastAPI(lifespan=lifespan)

    # --- Active LoRA state (shared across endpoints via closure) ---
    active_lora: dict = {"request": None}

    # ------------------------------------------------------------------
    # LoRA-specific endpoints
    # ------------------------------------------------------------------

    class SetLoRARequest(BaseModel):
        lora_name: str
        lora_int_id: int
        lora_path: str
        load_inplace: bool = False

    @app.post("/set_lora_adapter/")
    async def set_lora_adapter(request: SetLoRARequest):
        """Register a LoRA adapter for all subsequent generate/chat calls."""
        active_lora["request"] = {
            "lora_name": request.lora_name,
            "lora_int_id": request.lora_int_id,
            "lora_path": request.lora_path,
            "load_inplace": request.load_inplace,
        }
        logger.info(
            "Set active LoRA: %s (id=%d, path=%s)",
            request.lora_name,
            request.lora_int_id,
            request.lora_path,
        )
        return {"status": "ok"}

    @app.post("/clear_lora_adapter/")
    async def clear_lora_adapter():
        """Clear active LoRA adapter (revert to base model)."""
        active_lora["request"] = None
        return {"status": "ok"}

    # ------------------------------------------------------------------
    # Standard endpoints (mirrors TRL's vllm_serve)
    # ------------------------------------------------------------------

    @app.get("/health/")
    async def health():
        return {"status": "ok"}

    @app.get("/get_world_size/")
    async def get_world_size():
        return {
            "world_size": script_args.tensor_parallel_size
            * script_args.data_parallel_size
        }

    @app.post("/generate/", response_model=GenerateResponse)
    async def generate(request: GenerateRequest):
        """Generate completions with optional LoRA adapter."""
        import base64
        from io import BytesIO

        import vllm
        from packaging.version import Version
        from vllm.sampling_params import GuidedDecodingParams

        images: list[str | None] = request.images or [None] * len(request.prompts)  # type: ignore[assignment,list-item]
        prompts: list[dict[str, Any]] = []
        for prompt, image in zip(request.prompts, images, strict=True):
            row: dict[str, Any] = {"prompt": prompt}
            if image is not None:
                from PIL import Image

                row["multi_modal_data"] = {
                    "image": Image.open(BytesIO(base64.b64decode(image)))
                }
            prompts.append(row)

        generation_kwargs = {
            "n": request.n,
            "repetition_penalty": request.repetition_penalty,
            "temperature": request.temperature,
            "top_p": request.top_p,
            "top_k": request.top_k,
            "min_p": request.min_p,
            "max_tokens": request.max_tokens,
            "logprobs": request.logprobs,
        }
        generation_kwargs.update(request.generation_kwargs)

        if Version(vllm.__version__) <= Version("0.10.2"):
            key = "guided_decoding"
            if request.structured_outputs_regex is not None:
                generation_kwargs[key] = GuidedDecodingParams(
                    regex=request.structured_outputs_regex
                )
            else:
                generation_kwargs.setdefault(key, None)
        else:
            from vllm.sampling_params import StructuredOutputsParams

            key = "structured_outputs"
            if request.structured_outputs_regex is not None:
                generation_kwargs[key] = StructuredOutputsParams(
                    regex=request.structured_outputs_regex
                )
            elif isinstance(generation_kwargs.get(key), dict):
                generation_kwargs[key] = StructuredOutputsParams(
                    **generation_kwargs[key]
                )
            else:
                generation_kwargs.setdefault(key, None)

        sampling_params = SamplingParams(**generation_kwargs)
        chunked_prompts = chunk_list(prompts, script_args.data_parallel_size)

        for conn, chunk in zip(connections, chunked_prompts, strict=True):
            if not chunk:
                chunk = [{"prompt": "<placeholder>"}]
            kwargs = {
                "prompts": chunk,
                "sampling_params": sampling_params,
                "lora_request": active_lora["request"],
            }
            conn.send({"type": "call", "method": "generate", "kwargs": kwargs})

        all_outputs = [conn.recv() for conn in connections]
        all_outputs = [
            o for o, c in zip(all_outputs, chunked_prompts, strict=True) if c
        ]
        all_outputs = list(chain.from_iterable(all_outputs))

        return {
            "prompt_ids": [o.prompt_token_ids for o in all_outputs],
            "completion_ids": [
                list(out.token_ids) for o in all_outputs for out in o.outputs
            ],
            "logprobs": extract_logprobs(all_outputs)[0],
            "logprob_token_ids": extract_logprobs(all_outputs)[1],
        }

    @app.post("/chat/", response_model=ChatResponse)
    async def chat(request: ChatRequest):
        """Chat endpoint with optional LoRA adapter."""
        generation_kwargs = {
            "n": request.n,
            "repetition_penalty": request.repetition_penalty,
            "temperature": request.temperature,
            "top_p": request.top_p,
            "top_k": request.top_k,
            "min_p": request.min_p,
            "max_tokens": request.max_tokens,
            "logprobs": request.logprobs,
        }
        generation_kwargs.update(request.generation_kwargs)
        sampling_params = SamplingParams(**generation_kwargs)
        chunked = chunk_list(request.messages, script_args.data_parallel_size)
        for conn, chunk in zip(connections, chunked, strict=True):
            if not chunk:
                chunk = [[{"role": "user", "content": "<placeholder>"}]]
            kwargs = {
                "messages": chunk,
                "sampling_params": sampling_params,
                "use_tqdm": False,
                "lora_request": active_lora["request"],
            }
            conn.send({"type": "call", "method": "chat", "kwargs": kwargs})

        all_outputs = [conn.recv() for conn in connections]
        all_outputs = [o for o, c in zip(all_outputs, chunked, strict=True) if c]
        all_outputs = list(chain.from_iterable(all_outputs))

        return {
            "prompt_ids": [o.prompt_token_ids for o in all_outputs],
            "completion_ids": [
                list(out.token_ids) for o in all_outputs for out in o.outputs
            ],
            "logprobs": extract_logprobs(all_outputs)[0],
            "logprob_token_ids": extract_logprobs(all_outputs)[1],
        }

    # --- Weight sync endpoints (legacy fallback, same as TRL) ---

    @app.post("/init_communicator/")
    async def init_communicator(request: InitCommunicatorRequest):
        world_size = (
            script_args.tensor_parallel_size * script_args.data_parallel_size + 1
        )
        kwargs = {
            "method": "init_communicator",
            "args": (
                request.host,
                request.port,
                world_size,
                request.client_device_uuid,
            ),
        }
        msg = {"type": "fire_and_forget", "method": "collective_rpc", "kwargs": kwargs}
        loop = asyncio.get_running_loop()
        await asyncio.gather(
            *(loop.run_in_executor(None, c.send, msg) for c in connections)
        )
        return {"message": "Initializing communicator"}

    class UpdateWeightsRequest(BaseModel):
        name: str
        dtype: str
        shape: list[int]

    @app.post("/update_named_param/")
    async def update_named_param(request: UpdateWeightsRequest):
        kwargs = {
            "method": "update_named_param",
            "args": (request.name, request.dtype, tuple(request.shape)),
        }
        msg = {"type": "fire_and_forget", "method": "collective_rpc", "kwargs": kwargs}
        loop = asyncio.get_running_loop()
        await asyncio.gather(
            *(loop.run_in_executor(None, c.send, msg) for c in connections)
        )
        return {"message": "Updating parameter"}

    class BatchUpdateWeightsRequest(BaseModel):
        params: list[dict]

    @app.post("/batch_update_named_params/")
    async def batch_update_named_params(request: BatchUpdateWeightsRequest):
        params_list = [
            (p["name"], p["dtype"], tuple(p["shape"])) for p in request.params
        ]
        kwargs = {"method": "batch_update_named_params", "args": (params_list,)}
        msg = {"type": "fire_and_forget", "method": "collective_rpc", "kwargs": kwargs}
        loop = asyncio.get_running_loop()
        await asyncio.gather(
            *(loop.run_in_executor(None, c.send, msg) for c in connections)
        )
        return {"message": f"Batch update for {len(params_list)} params"}

    @app.post("/reset_prefix_cache/")
    async def reset_prefix_cache():
        for conn in connections:
            conn.send({"type": "call", "method": "reset_prefix_cache"})
        results = [conn.recv() for conn in connections]
        return {"message": f"Reset prefix cache: {all(results)}"}

    @app.post("/close_communicator/")
    async def close_communicator():
        kwargs = {"method": "close_communicator"}
        for conn in connections:
            conn.send(
                {
                    "type": "fire_and_forget",
                    "method": "collective_rpc",
                    "kwargs": kwargs,
                }
            )
        return {"message": "Closing communicator"}

    uvicorn.run(
        app,
        host=script_args.host,
        port=script_args.port,
        log_level=script_args.log_level,
        access_log=True,
    )


================================================
FILE: src/axolotl/scripts/vllm_worker_ext.py
================================================
"""Extended vLLM worker extension with batch weight sync support.

Subclasses TRL's WeightSyncWorkerExtension to add:
- batch_update_named_params: receives multiple params in one call
- Auto-close stale communicator on re-init
- _direct_set_weight: proper handling for stacked (qkv_proj, gate_up_proj) params,
  including LoRA-wrapped models where vLLM inserts base_layer into the hierarchy
"""

import logging

import torch

try:
    from transformers import is_torch_xpu_available
except ImportError:
    is_torch_xpu_available = lambda: False  # noqa: E731

from trl.scripts.vllm_serve import WeightSyncWorkerExtension

logger = logging.getLogger(__name__)

# Stacked param name mapping: shard_name -> (packed_name, shard_order)
_STACKED_PARAMS = {
    "q_proj": ("qkv_proj", 0),
    "k_proj": ("qkv_proj", 1),
    "v_proj": ("qkv_proj", 2),
    "gate_proj": ("gate_up_proj", 0),
    "up_proj": ("gate_up_proj", 1),
}


class BatchWeightSyncWorkerExtension(WeightSyncWorkerExtension):
    """Worker extension that adds batch weight update and direct weight setting."""

    def init_communicator(self, host, port, world_size, client_device_uuid):
        """Auto-close stale communicator before re-initializing."""
        if self.communicator is not None:
            self.close_communicator()
        super().init_communicator(host, port, world_size, client_device_uuid)

    def _direct_set_weight(self, name: str, weight: torch.Tensor) -> None:
        """Directly copy weight data into the model, handling stacked params.

        Bypasses model.load_weights() which may fail on vLLM 0.17's new
        module-tree weight loader for stacked params (qkv_proj, gate_up_proj).

        Handles LoRA-wrapped params where vLLM inserts ``base_layer`` into the
        parameter hierarchy (e.g. ``qkv_proj.base_layer.weight``).
        """
        model = self.model_runner.model
        params_dict = dict(model.named_parameters())

        # Check if this is a simple direct param (exists as-is)
        if name in params_dict:
            params_dict[name].data.copy_(weight.to(params_dict[name].dtype))
            return

        # Also check with base_layer inserted: x.y.weight -> x.y.base_layer.weight
        parts_bl = name.rsplit(".", 1)
        if len(parts_bl) == 2:
            base_layer_name = f"{parts_bl[0]}.base_layer.{parts_bl[1]}"
            if base_layer_name in params_dict:
                params_dict[base_layer_name].data.copy_(
                    weight.to(params_dict[base_layer_name].dtype)
                )
                return

        # Handle stacked params: e.g. "model.layers.0.self_attn.q_proj.weight"
        # -> "model.layers.0.self_attn.qkv_proj.weight" with shard offset
        parts = name.rsplit(".", 2)  # [prefix, layer_name, suffix]
        if len(parts) == 3:
            prefix, layer_name, suffix = parts
            if layer_name in _STACKED_PARAMS:
                packed_name, shard_idx = _STACKED_PARAMS[layer_name]
                for packed_full in [
                    f"{prefix}.{packed_name}.{suffix}",
                    f"{prefix}.{packed_name}.base_layer.{suffix}",
                ]:
                    if packed_full not in params_dict:
                        continue
                    param = params_dict[packed_full]
                    # Navigate to the packed module to find shard sizes
                    module_path = packed_full.rsplit(".", 1)[0]  # strip .weight/.bias
                    if ".base_layer" in module_path:
                        module_path = module_path.replace(".base_layer", "")
                    module = model
                    for attr in module_path.split("."):
                        module = getattr(module, attr, None)
                        if module is None:
                            break
                    # LoRA wrappers don't have output_sizes directly;
                    # check base_layer for the underlying parallel linear
                    if module is not None and not hasattr(module, "output_sizes"):
                        base = getattr(module, "base_layer", None)
                        if base is not None and hasattr(base, "output_sizes"):
                            module = base
                    if module is not None and hasattr(module, "output_sizes"):
                        tp_size = getattr(module, "tp_size", 1)
                        sizes = [s // tp_size for s in module.output_sizes]
                        offset = sum(sizes[:shard_idx])
                        shard_size = sizes[shard_idx]
                        param.data[offset : offset + shard_size].copy_(
                            weight.to(param.dtype)
                        )
                        return

        # Fallback: try load_weights (may work for non-stacked params)
        logger.warning("Falling back to load_weights for param: %s", name)
        model.load_weights(weights=[(name, weight)])

    def update_named_param(self, name, dtype, shape):
        """Override to use _direct_set_weight instead of load_weights."""
        if self.communicator is None:
            raise RuntimeError("Communicator not initialized.")

        dtype = getattr(torch, dtype.split(".")[-1])
        weight = torch.empty(shape, dtype=dtype, device=self.device)

        if is_torch_xpu_available():
            self.communicator.broadcast(weight, root=self.client_rank)
            self.communicator.barrier()
        else:
            self.communicator.broadcast(weight, src=self.client_rank)
            self.communicator.group.barrier()

        self._direct_set_weight(name, weight)

    def batch_update_named_params(self, params_list: list[tuple[str, str, tuple]]):
        """Receive and apply multiple weight tensors in sequence.

        Args:
            params_list: List of (name, dtype_str, shape) tuples.
        """
        if self.communicator is None:
            raise RuntimeError("Communicator not initialized.")

        weights_to_load = []
        for name, dtype_str, shape in params_list:
            dtype = getattr(torch, dtype_str.split(".")[-1])
            weight = torch.empty(shape, dtype=dtype, device=self.device)

            if is_torch_xpu_available():
                self.communicator.broadcast(weight, root=self.client_rank)
            else:
                self.communicator.broadcast(weight, src=self.client_rank)

            weights_to_load.append((name, weight))

        # Single barrier after all broadcasts
        if is_torch_xpu_available():
            self.communicator.barrier()
        else:
            self.communicator.group.barrier()

        # Load weights using direct set (handles stacked params)
        for name, weight in weights_to_load:
            self._direct_set_weight(name, weight)


================================================
FILE: src/axolotl/telemetry/__init__.py
================================================


================================================
FILE: src/axolotl/telemetry/callbacks.py
================================================
"""Trainer callbacks for reporting runtime metrics at regular intervals."""

import logging
import time

from transformers import (
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)

from axolotl.telemetry.manager import TelemetryManager
from axolotl.telemetry.runtime_metrics import RuntimeMetricsTracker

LOG = logging.getLogger(__name__)

TIME_SINCE_LAST = 60


class TelemetryCallback(TrainerCallback):
    """
    Trainer callback for tracking and reporting runtime metrics.

    This callback tracks training progress, runtime, and memory usage,
    sending telemetry at configurable intervals.
    """

    report_interval_steps: int = 100

    def __init__(self):
        """Initialize the metrics callback."""
        self.tracker = RuntimeMetricsTracker()
        self.telemetry_manager = TelemetryManager.get_instance()
        self.current_epoch = -1
        self.start_time = time.time()
        self.last_report_time = None
        self.last_report_step = 0

    # pylint: disable=unused-argument
    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Handle training start."""
        self.telemetry_manager.send_event(event_type="train-start")

    # pylint: disable=unused-argument
    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Handle training end."""
        # Send training completion event
        self.telemetry_manager.send_event(
            event_type="train-end",
            properties=self._extract_last_metrics(state)
            | self.tracker.metrics.to_dict(),
        )

    # pylint: disable=unused-argument
    def on_epoch_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Handle epoch start."""
        self.current_epoch += 1
        self.tracker.start_epoch(self.current_epoch)

    # pylint: disable=unused-argument
    def on_epoch_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Handle epoch end."""
        self.tracker.end_epoch(self.current_epoch)

    # pylint: disable=unused-argument
    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Handle step end."""
        step = state.global_step
        self.tracker.update_step(step)

        # Check if we should report metrics
        should_report = (
            step % self.report_interval_steps == 0
            or step == 1  # Always report first step
            or step - self.last_report_step >= self.report_interval_steps
        )

        if should_report:
            current_time = time.time()
            if self.last_report_time is not None:
                time_since_last_report = current_time - self.last_report_time
            else:
                time_since_last_report = current_time - self.start_time
            steps_since_last_report = step - self.last_report_step

            # Only report if enough time has passed
            if (
                step == 1
                or time_since_last_report >= TIME_SINCE_LAST
                or steps_since_last_report >= self.report_interval_steps
            ):
                # Calculate steps per second for this interval
                if time_since_last_report > 0 and steps_since_last_report > 0:
                    steps_per_second = steps_since_last_report / time_since_last_report
                else:
                    steps_per_second = 0

                # Update memory metrics
                self.tracker.update_memory_metrics()

                # Prepare metrics to report
                metrics = self._extract_last_metrics(state) | {
                    "step": step,
                    "epoch": self.current_epoch,
                    "progress": state.epoch,  # Fractional epoch progress
                    "steps_per_second": steps_per_second,
                    "elapsed_time": current_time - self.start_time,
                    "time_since_last_report": time_since_last_report,
                }

                # Add memory metrics
                memory_metrics = self.tracker.get_memory_metrics()
                metrics.update({"memory": memory_metrics})

                # Send telemetry
                self.telemetry_manager.send_event(
                    event_type="train-progress", properties=metrics
                )

                # Update last report time and step
                self.last_report_time = current_time
                self.last_report_step = step

    def _extract_last_metrics(self, state: TrainerState) -> dict:
        """Extract last loss, learning_rate, grad_norm, and token metrics from log history."""
        if not state.log_history:
            return {
                "loss": 0,
                "ppl": 0,
                "learning_rate": 0,
                "grad_norm": 0,
                "tokens/total": 0,
                "tokens/trainable": 0,
                "tokens/train_per_sec_per_gpu": 0,
            }

        last_log = state.log_history[-1]
        return {
            "loss": last_log.get("loss", 0),
            "ppl": last_log.get("ppl", 0),
            "learning_rate": last_log.get("learning_rate", 0),
            "grad_norm": last_log.get("grad_norm", 0),
            "tokens/total": last_log.get("tokens/total", 0),
            "tokens/trainable": last_log.get("tokens/trainable", 0),
            "tokens/train_per_sec_per_gpu": last_log.get(
                "tokens/train_per_sec_per_gpu", 0
            ),
        }


================================================
FILE: src/axolotl/telemetry/errors.py
================================================
"""Telemetry utilities for exception and traceback information."""

import logging
import os
import re
import traceback
from functools import wraps
from inspect import getmodule
from typing import Any, Callable

from axolotl.telemetry.manager import TelemetryManager

LOG = logging.getLogger(__name__)

ERROR_HANDLED = False


def sanitize_stack_trace(stack_trace: str) -> str:
    """
    Remove personal information from stack trace messages while keeping Python package codepaths.

    This function identifies Python packages by looking for common patterns in virtual environment
    and site-packages directories, preserving the package path while removing user-specific paths.

    Args:
        stack_trace: The original stack trace string.

    Returns:
        A sanitized version of the stack trace with Python package paths preserved.
    """
    # Split the stack trace into lines to process each file path separately
    lines = stack_trace.split("\n")
    sanitized_lines = []

    # Regular expression to find file paths in the stack trace
    path_pattern = re.compile(r'(?:File ")(.*?)(?:")')

    # Regular expression to identify paths in site-packages or dist-packages
    # This matches path segments like "site-packages/package_name" or "dist-packages/package_name"
    site_packages_pattern = re.compile(
        r"(?:site-packages|dist-packages)[/\\]([\w\-\.]+)"
    )

    # Additional common virtual environment patterns
    venv_lib_pattern = re.compile(
        r"(?:lib|Lib)[/\\](?:python\d+(?:\.\d+)?[/\\])?(?:site-packages|dist-packages)[/\\]([\w\-\.]+)"
    )

    for line in lines:
        # Check if this line contains a file path
        path_match = path_pattern.search(line)

        if path_match:
            full_path = path_match.group(1)
            sanitized_path = ""

            # Try to match site-packages pattern
            site_packages_match = site_packages_pattern.search(full_path)
            venv_lib_match = venv_lib_pattern.search(full_path)

            if site_packages_match:
                # Find the index where the matched pattern starts
                idx = full_path.find("site-packages")
                if idx == -1:
                    idx = full_path.find("dist-packages")

                # Keep from 'site-packages' onward
                if idx >= 0:
                    sanitized_path = full_path[idx:]
            elif venv_lib_match:
                # For other virtual environment patterns, find the package directory
                match_idx = venv_lib_match.start(1)
                if match_idx > 0:
                    # Keep from the package name onward
                    package_name = venv_lib_match.group(1)
                    idx = full_path.rfind(
                        package_name, 0, match_idx + len(package_name)
                    )
                    if idx >= 0:
                        sanitized_path = full_path[idx:]

            # If we couldn't identify a package pattern but path contains 'axolotl'
            elif "axolotl" in full_path:
                idx = full_path.rfind("axolotl")
                if idx >= 0:
                    sanitized_path = full_path[idx:]

            # Apply the sanitization to the line
            if sanitized_path:
                line = line.replace(full_path, sanitized_path)
            else:
                # If we couldn't identify a package pattern, just keep the filename
                filename = os.path.basename(full_path)
                if filename:
                    line = line.replace(full_path, filename)
                else:
                    line = line.replace(full_path, "")

        sanitized_lines.append(line)

    return "\n".join(sanitized_lines)


def send_errors(func: Callable) -> Callable:
    """
    Decorator to send exception info in a function. If an exception is raised, we send
    telemetry containing the stack trace and error message.

    If an error occurs in a decorated function that is called by another decorated
    function, we'll only send telemetry corresponding to the lower-level function.

    Args:
        func: Function to decorate.

    Returns:
        Decorated function.
    """

    @wraps(func)
    def wrapper(*args, **kwargs) -> Any:
        telemetry_manager = TelemetryManager.get_instance()

        if not telemetry_manager.enabled:
            return func(*args, **kwargs)

        try:
            return func(*args, **kwargs)
        except Exception as exception:
            # Only track if we're not already handling an error. This prevents us from
            # capturing an error more than once in nested decorated function calls.
            global ERROR_HANDLED  # pylint: disable=global-statement
            if not ERROR_HANDLED:
                ERROR_HANDLED = True

                # Get function module path
                module = getmodule(func)
                module_path = (
                    f"{module.__name__}.{func.__name__}" if module else func.__name__
                )

                # Get stack trace
                stack_trace = "".join(
                    traceback.format_exception(
                        type(exception), exception, exception.__traceback__
                    )
                )
                stack_trace = sanitize_stack_trace(stack_trace)

                # Send error telemetry
                telemetry_manager.send_event(
                    event_type=f"{module_path}-error",
                    properties={
                        "exception": str(exception),
                        "stack_trace": stack_trace,
                    },
                )

                LOG.error(
                    f"Error captured in telemetry. Run ID: {telemetry_manager.run_id}"
                )

            raise

    return wrapper


================================================
FILE: src/axolotl/telemetry/manager.py
================================================
"""Telemetry manager and associated utilities."""

import atexit
import importlib
import logging
import os
import platform
import uuid
from pathlib import Path
from typing import Any

import posthog
import psutil
import torch
import yaml

LOG = logging.getLogger(__name__)

POSTHOG_HOST = "https://app.posthog.com"
POSTHOG_WRITE_KEY = "phc_1kUR0o04oJKKTTeSsIz2Mfm5mpiVsQEf2WOlzljMD7y"

WHITELIST_PATH = str(Path(__file__).parent / "whitelist.yaml")

# NOTE: Need to keep these up to date with any config schema changes
FIELDS_TO_REDACT = {
    "base_model",
    "tokenizer_config",
    "base_model_config",
    "pretraining_dataset",  # NOTE: this field may be a string or a dictionary
    "resume_from_checkpoint",
    "hub_model_id",
}
PREFIXES_TO_REDACT = {"wandb_", "comet_", "mlflow_", "gradio_", "trackio_", "swanlab_"}
PATH_INDICATORS = {"path", "dir", "data_files"}

# pylint: disable=duplicate-code
RELEVANT_PACKAGES = {
    "torch",
    "transformers",
    "trl",
    "datasets",
    "peft",
    "bitsandbytes",
    "accelerate",
    "optimum",
    "deepspeed",
    "ray",
    "axolotl",
    "triton",
    "mamba-ssm",
    "flash-attn",
    "xformers",
    "autoawq",
    "tokenizers",
    "sentencepiece",
    "torchao",
    "lm_eval",
}


def is_main_process() -> bool:
    """
    Check whether we're running in the main process.

    Note:
        We're using this function instead of `torch.utils.distributed.is_main_process`
        causes issues with DeepSpeed world_size since. This function avoids that issue
        by checking env vars that are set by various launchers.

    Returns:
        Whether we're running in the main process.
    """
    # If PyTorch distributed is already initialized, use it
    if torch.distributed.is_initialized():
        return torch.distributed.get_rank() == 0

    # Otherwise check environment variables for global rank
    # NOTE: need to verify this in SLURM / OpenMPI environments
    global_rank = int(
        os.environ.get(
            "RANK",
            os.environ.get(
                "GLOBAL_RANK",
                os.environ.get(
                    "SLURM_PROCID",
                    os.environ.get(
                        "OMPI_COMM_WORLD_RANK",
                        "0",
                    ),
                ),
            ),
        )
    )

    return global_rank == 0


class TelemetryManager:
    """Manages telemetry collection and transmission"""

    _instance = None
    _initialized = False

    def __new__(cls):
        """
        Telemetry manager constructor. Creates the singleton instance of this class if
        it doesn't already exist.
        """
        if cls._instance is None:
            cls._instance = super(TelemetryManager, cls).__new__(cls)
            cls._instance._initialized = False

        return cls._instance

    def __init__(self):
        """Telemetry manager initializer"""
        if self._initialized:
            return

        self.enabled = self._check_telemetry_enabled()

        if self.enabled:
            self.run_id = str(uuid.uuid4())
            self.whitelist = self._load_whitelist()

            try:
                self.system_info = self._get_system_info()
            except Exception as e:  # pylint: disable=broad-exception-caught
                LOG.warning(f"Error during system info collection: {e}")
                self.system_info = None

            self._init_posthog()

            # Register shutdown method to flush posthog telemetry
            atexit.register(self.shutdown)

        self._initialized = True

    @classmethod
    def get_instance(cls) -> "TelemetryManager":
        if cls._instance is None:
            cls._instance = TelemetryManager()

        return cls._instance

    def _check_telemetry_enabled(self) -> bool:
        """
        Check if telemetry is enabled based on environment variables. We also check
        whether this is the main process (for the distributed setting and to avoid
        sending duplicate PostHog events per GPU).

        Note: This is enabled by default on an opt-out basis. Set
        `AXOLOTL_DO_NOT_TRACK=1` to disable telemetry. For more details, see
        https://axolotl-ai-cloud.github.io/axolotl/docs/telemetry.html.

        Returns:
            Boolean denoting whether telemetry is enabled or not.
        """
        # Only rank 0 will send telemetry
        if not is_main_process():
            return False

        # Parse relevant env vars
        axolotl_do_not_track = os.getenv("AXOLOTL_DO_NOT_TRACK")
        do_not_track = os.getenv("DO_NOT_TRACK")

        # Default to enabled (opt-out model)
        if axolotl_do_not_track is None or axolotl_do_not_track.lower() not in (
            "0",
            "1",
            "false",
            "true",
        ):
            return True

        if do_not_track is None:
            do_not_track = "0"

        # Respect AXOLOTL_DO_NOT_TRACK, DO_NOT_TRACK if enabled
        enabled = axolotl_do_not_track.lower() not in (
            "1",
            "true",
        ) and do_not_track.lower() not in ("1", "true")

        return enabled

    def _load_whitelist(self) -> dict:
        """Load HuggingFace Hub organization whitelist"""
        with open(WHITELIST_PATH, encoding="utf-8") as f:
            whitelist = yaml.safe_load(f)

            # Send org strings to lowercase since model names are case insensitive
            whitelist["organizations"] = {
                org.lower() for org in whitelist["organizations"]
            }

            return whitelist

    def _is_whitelisted(self, value: str) -> bool:
        """
        Check if model / dataset / etc. org is in whitelist.

        Args:
            value: Value for one of `axolotl.telemetry.manager.FIELDS_WITH_ORGS`
                ("base_model", etc.).

        Returns:
            Boolean indicating whitelist membership.
        """
        # NOTE: This membership-checking logic can be improved.
        # What happens when a local model path matches a whitelisted org?
        parts = value.split("/")
        if len(parts) < 2:
            return False
        org = parts[0]
        whitelisted = org.lower() in self.whitelist["organizations"]

        return whitelisted

    def _init_posthog(self):
        """Initialize PostHog client"""
        posthog.api_key = POSTHOG_WRITE_KEY
        posthog.project_api_key = POSTHOG_WRITE_KEY
        posthog.host = POSTHOG_HOST

    def _redact_paths(self, properties: dict[str, Any]) -> dict[str, Any]:
        """
        Redact properties to remove any paths, so as to avoid inadvertently collecting
        private or personally identifiable information (PII). We also remove
        information related to Wandb, MLflow, etc. configuration.

        Args:
            properties: Dictionary of properties to redact.

        Returns:
            Properties dictionary with redaction applied.
        """
        if not properties:
            return {}

        def redact_value(value: Any, key: str = "") -> Any:
            """Recursively sanitize values, redacting those with path-like keys"""
            if isinstance(key, str) and isinstance(value, str):
                # Other redaction special cases
                if (
                    key in FIELDS_TO_REDACT
                    or any(prefix in key for prefix in PREFIXES_TO_REDACT)
                    or any(indicator in key.lower() for indicator in PATH_INDICATORS)
                ):
                    # Fields with whitelisted orgs don't need to be redacted
                    if not self._is_whitelisted(value):
                        return "[REDACTED]"

            # Handle nested values
            if isinstance(value, dict):
                return {k: redact_value(v, k) for k, v in value.items()}
            if isinstance(value, list):
                return [redact_value(item) for item in value]

            return value

        # Create new dict with redacted values
        redacted = {k: redact_value(v, k) for k, v in properties.items()}

        return redacted

    def _get_system_info(self) -> dict[str, Any]:
        """Collect system information for various hardware accelerators"""
        gpu_info = []
        accelerator_type = "none"

        # NVIDIA GPUs
        if torch.cuda.is_available():
            accelerator_type = "cuda"
            for i in range(torch.cuda.device_count()):
                gpu_info.append(
                    {
                        "name": torch.cuda.get_device_name(i),
                        "memory": torch.cuda.get_device_properties(i).total_memory,
                    }
                )

        # AMD GPUs
        elif hasattr(torch, "hip") and torch.hip.is_available():
            accelerator_type = "hip"
            for i in range(torch.hip.device_count()):
                gpu_info.append(
                    {
                        "name": torch.hip.get_device_name(i),
                        "memory": (
                            torch.hip.get_device_properties(i).total_memory
                            if hasattr(torch.hip, "get_device_properties")
                            else None
                        ),
                    }
                )

        # Apple Silicon
        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            accelerator_type = "mps"
            gpu_info.append(
                {
                    "name": "Apple Silicon",
                    # NOTE: this is memory allocated to this process, not total memory
                    "memory": torch.mps.driver_allocated_memory(),
                }
            )

        # Intel GPUs
        elif hasattr(torch, "xpu") and torch.xpu.is_available():
            accelerator_type = "xpu"
            for i in range(torch.xpu.device_count()):
                memory = None
                if hasattr(torch.xpu, "get_device_properties"):
                    memory = torch.xpu.get_device_properties(i).total_memory

                gpu_info.append(
                    {
                        "name": torch.xpu.get_device_name(i),
                        "memory": memory,
                    }
                )

        # NPUs
        elif hasattr(torch, "npu") and torch.npu.is_available():
            accelerator_type = "npu"
            for i in range(torch.npu.device_count()):
                memory = None
                if hasattr(torch.npu, "get_device_properties"):
                    memory = torch.npu.get_device_properties(i).total_memory

                gpu_info.append(
                    {
                        "name": torch.npu.get_device_name(i),
                        "memory": memory,
                    }
                )

        # Get relevant package versions
        installed_packages = {}
        for package in RELEVANT_PACKAGES:
            try:
                version = importlib.metadata.version(package)
                installed_packages[f"{package}_version"] = version
            except importlib.metadata.PackageNotFoundError:
                pass

        return {
            "os": platform.system(),
            "python_version": platform.python_version(),
            "cpu_count": psutil.cpu_count(),
            "memory_total": psutil.virtual_memory().total,
            "accelerator_type": accelerator_type,
            "accelerator_count": len(gpu_info),
            "accelerator_info": gpu_info,
            **installed_packages,
        }

    def send_event(self, event_type: str, properties: dict[str, Any] | None = None):
        """Send a telemetry event"""
        if not self.enabled:
            return

        if properties is None:
            properties = {}

        # Sanitize properties to remove PII
        properties = self._redact_paths(properties)

        # Wrap PostHog errors in try / except to not raise errors during Axolotl usage
        try:
            # Send event via PostHog
            posthog.capture(
                distinct_id=self.run_id,
                event=event_type,
                properties=properties,
                disable_geoip=True,
            )
        except Exception as e:  # pylint: disable=broad-exception-caught
            LOG.warning(f"Failed to send telemetry event: {e}")

        # Additionally, send system info telemetry when loading config.
        # NOTE: Is this the best place for this?
        if event_type == "config-loaded":
            self.send_system_info()

    def send_system_info(self):
        """Helper method for sending system info"""
        if self.system_info is not None:
            self.send_event(event_type="system-info", properties=self.system_info)

    def shutdown(self):
        """Ensure all queued events are processed before shutdown"""
        if self.enabled:
            posthog.shutdown()


================================================
FILE: src/axolotl/telemetry/runtime_metrics.py
================================================
"""Telemetry utilities for runtime and memory metrics."""

import logging
import time
from dataclasses import dataclass, field
from typing import Any

import psutil
import torch

from axolotl.telemetry.manager import TelemetryManager

LOG = logging.getLogger(__name__)


@dataclass
class RuntimeMetrics:
    """Container for runtime metrics to be tracked throughout training."""

    # Timing metrics
    start_time: float
    epoch_start_times: dict[int, float] = field(init=False)
    epoch_end_times: dict[int, float] = field(init=False)

    # Memory metrics
    peak_cpu_memory: int = 0
    peak_gpu_memory: dict[int, int] = field(init=False)

    # Progress metrics
    total_steps: int = 0
    current_epoch: int = 0
    current_step: int = 0

    def __post_init__(self):
        """Initialize empty metric mappings."""
        self.epoch_start_times = {}
        self.epoch_end_times = {}
        self.peak_gpu_memory = {}

    @property
    def elapsed_time(self) -> float:
        """Calculate total elapsed time in seconds."""
        return time.time() - self.start_time

    def epoch_time(self, epoch: int) -> float | None:
        """Calculate time taken for a specific epoch in seconds."""
        if epoch in self.epoch_start_times and epoch in self.epoch_end_times:
            return self.epoch_end_times[epoch] - self.epoch_start_times[epoch]

        return None

    def average_epoch_time(self) -> float | None:
        """Calculate average time per epoch in seconds."""
        completed_epochs = [
            epoch for epoch in self.epoch_start_times if epoch in self.epoch_end_times
        ]
        if not completed_epochs:
            return None

        total_time = 0.0
        for epoch in completed_epochs:
            epoch_time = self.epoch_time(epoch)
            if epoch_time is not None:  # Check to avoid mypy warning
                total_time += epoch_time

        return total_time / len(completed_epochs)

    def steps_per_second(self) -> float | None:
        """Calculate average steps per second across all training."""
        if self.total_steps == 0 or self.elapsed_time == 0:
            return None

        return self.total_steps / self.elapsed_time

    def to_dict(self) -> dict[str, Any]:
        """Convert metrics to a dictionary for telemetry reporting."""
        metrics = {
            "total_time_seconds": self.elapsed_time,
            "total_steps": self.total_steps,
            "steps_per_second": self.steps_per_second(),
            "epochs_completed": len(
                [
                    epoch
                    for epoch in self.epoch_start_times
                    if epoch in self.epoch_end_times
                ]
            ),
            "peak_cpu_memory_bytes": self.peak_cpu_memory,
        }

        # Add per-epoch timing if available
        epoch_times: dict[str, float] = {}
        for epoch in sorted(self.epoch_end_times.keys()):
            time_taken = self.epoch_time(epoch)
            if time_taken is not None:
                epoch_times[f"epoch_{epoch}_seconds"] = time_taken

        if epoch_times:
            metrics["epoch_times"] = epoch_times  # type: ignore
            metrics["average_epoch_time_seconds"] = self.average_epoch_time()

        # Add GPU memory metrics if available
        if self.peak_gpu_memory:
            gpu_metrics: dict[str, int] = {}
            for gpu_id, memory in self.peak_gpu_memory.items():
                gpu_metrics[f"gpu_{gpu_id}_peak_memory_bytes"] = memory
            metrics["gpu_memory"] = gpu_metrics  # type: ignore

        return metrics


class RuntimeMetricsTracker:
    """Tracker for runtime metrics during training."""

    update_interval = 100

    def __init__(self):
        """Initialize the runtime metrics tracker."""
        self.metrics = RuntimeMetrics(start_time=time.time())
        self.telemetry_manager = TelemetryManager.get_instance()
        self._process = psutil.Process()

    def start_epoch(self, epoch: int):
        """Record the start of a new epoch."""
        self.metrics.current_epoch = epoch
        self.metrics.epoch_start_times[epoch] = time.time()
        self.update_memory_metrics()

    def end_epoch(self, epoch: int):
        """Record the end of an epoch."""
        self.metrics.epoch_end_times[epoch] = time.time()

    def update_step(self, step: int):
        """Update the current step count."""
        self.metrics.current_step = step
        self.metrics.total_steps += 1

        # Periodically update memory metrics
        if step % self.update_interval == 0:
            self.update_memory_metrics()

    def _get_allocated_memory(self) -> dict[int, int]:
        """
        Helper function for getting accelerator-agnostic allocated memory.

        Returns:
            A dictionary mapping device IDs to allocated memory in bytes
        """
        memory_used: dict[int, int] = {}

        # NVIDIA GPUs
        if torch.cuda.is_available():
            for i in range(torch.cuda.device_count()):
                memory_used[i] = torch.cuda.memory_allocated(i)

        # AMD GPUs
        elif hasattr(torch, "hip") and torch.hip.is_available():
            for i in range(torch.hip.device_count()):
                if hasattr(torch.hip, "memory_allocated"):
                    memory_used[i] = torch.hip.memory_allocated(i)

        # Apple Silicon
        elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
            # MPS doesn't have per-device memory stats since there's only one device
            if hasattr(torch.mps, "current_allocated_memory"):
                memory_used[0] = torch.mps.current_allocated_memory()

        # Intel GPUs
        elif hasattr(torch, "xpu") and torch.xpu.is_available():
            for i in range(torch.xpu.device_count()):
                if hasattr(torch.xpu, "memory_allocated"):
                    memory_used[i] = torch.xpu.memory_allocated(i)

        # NPUs
        elif hasattr(torch, "npu") and torch.npu.is_available():
            for i in range(torch.npu.device_count()):
                if hasattr(torch.npu, "memory_allocated"):
                    memory_used[i] = torch.npu.memory_allocated(i)

        return memory_used

    def update_memory_metrics(self):
        """Update peak memory usage metrics."""
        # CPU memory
        cpu_memory = self._process.memory_info().rss
        self.metrics.peak_cpu_memory = max(self.metrics.peak_cpu_memory, cpu_memory)

        # GPU memory (if available)
        memory_used = self._get_allocated_memory()
        for i, memory in memory_used.items():
            self.metrics.peak_gpu_memory[i] = max(
                self.metrics.peak_gpu_memory.get(i, 0), memory
            )

    def get_memory_metrics(self) -> dict[str, Any]:
        """Get the current memory metrics as a dictionary."""
        memory_metrics = {
            "cpu_memory_bytes": self._process.memory_info().rss,
            "peak_cpu_memory_bytes": self.metrics.peak_cpu_memory,
        }

        # GPU memory (if available)
        memory_used = self._get_allocated_memory()
        for i, memory in memory_used.items():
            memory_metrics[f"gpu_{i}_memory_bytes"] = memory
            memory_metrics[f"gpu_{i}_peak_memory_bytes"] = (
                self.metrics.peak_gpu_memory.get(i, 0)
            )

        return memory_metrics


================================================
FILE: src/axolotl/telemetry/whitelist.yaml
================================================
organizations:
  - "axolotl-ai-co"
  - "meta-llama"
  - "huggingface"
  - "nvidia"
  - "facebook"
  - "google"
  - "microsoft"
  - "deepseek-ai"
  - "HuggingFaceTB"
  - "mistralai"
  - "Qwen"
  - "unsloth"
  - "NousResearch"
  - "allenai"
  - "amd"
  - "tiiuae"
  - "tencent"
  - "zai-org"
  - "openai"
  - "ibm-granite"
  - "arcee-ai"
  - "swiss-ai"
  - "CohereForAI"
  - "deepcogito"
  - "THUDM"
  - "ai21labs"
  - "LiquidAI"
  - "canopylabs"
  - "state-spaces"
  - "mistral-community"
  - "llava-hf"
  - "ByteDance-Seed"
  - "ACE-Step"
  - "openbmb"
  - "MiniMaxAI"
  - "stepfun-ai"
  - "internlm"
  - "katanemo"
  - "XiaomiMiMo"


================================================
FILE: src/axolotl/train.py
================================================
"""Prepare and train a model on a dataset. Can also infer from a model or merge lora"""

from __future__ import annotations

import importlib
import inspect
import json
import os
import shutil
import signal
import sys
import typing
import weakref
from collections import OrderedDict
from contextlib import ExitStack
from pathlib import Path
from typing import Any, Dict

import torch
import transformers.modelcard
from datasets import Dataset
from huggingface_hub.errors import OfflineModeIsEnabled
from peft import PeftConfig, PeftModel
from transformers import PreTrainedModel, PreTrainedTokenizer, ProcessorMixin
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
from transformers.trainer import Trainer

from axolotl.common.datasets import TrainDatasetMeta
from axolotl.contribs.lgpl import (  # pylint: disable = no-name-in-module
    fix_untrained_tokens,
)
from axolotl.integrations.base import PluginManager
from axolotl.loaders import ModelLoader, load_processor, load_tokenizer
from axolotl.telemetry.errors import send_errors
from axolotl.telemetry.manager import TelemetryManager
from axolotl.utils.ctx_managers.sequence_parallel import SequenceParallelContextManager
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import cleanup_distributed
from axolotl.utils.freeze import freeze_layers_except
from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.enums import RLType
from axolotl.utils.train import determine_last_checkpoint
from axolotl.utils.trainer import setup_trainer

if typing.TYPE_CHECKING:
    from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder

LOG = get_logger(__name__)

TELEMETRY_MANAGER = TelemetryManager.get_instance()
PLUGIN_MANAGER = PluginManager.get_instance()


def setup_model_and_tokenizer(
    cfg: DictDefault,
) -> tuple[
    PreTrainedModel, PreTrainedTokenizer, PeftConfig | None, ProcessorMixin | None
]:
    """Load the tokenizer, processor (for multimodal models), and model based on
    configuration.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.

    Returns:
        Tuple containing model, tokenizer, `peft_config` (if LoRA / QLoRA, else
            `None`), and processor (if multimodal, else `None`).
    """
    # Load tokenizer
    LOG.debug(
        f"loading tokenizer... {cfg.tokenizer_config or cfg.base_model_config}",
    )
    tokenizer = load_tokenizer(cfg)

    # Load processor for multimodal models if needed
    processor = None
    if cfg.is_multimodal:
        processor = load_processor(cfg, tokenizer)

    # Load the model
    LOG.debug("Loading model")

    model_loader = ModelLoader(cfg, tokenizer, processor=processor)
    model, peft_config = model_loader.load()
    if model.generation_config is not None:
        model.generation_config.do_sample = True

    model_properties = model.config.to_dict()
    try:
        model_properties["num_parameters"] = model.num_parameters()
    except Exception:  # pylint: disable=broad-exception-caught
        model_properties["num_parameters"] = sum(p.numel() for p in model.parameters())
    # if the num_parameters is less than 2B, let's round to nearest 100M, else round to nearest 1B
    if model_properties["num_parameters"] < 2e9:
        model_properties["num_parameters_est"] = (
            f"{round(model_properties['num_parameters'] / 1e8) * 100}M"
        )
    else:
        model_properties["num_parameters_est"] = (
            f"{round(model_properties['num_parameters'] / 1e9)}B"
        )
    TELEMETRY_MANAGER.send_event(event_type="model-load", properties=model_properties)
    if peft_config:
        TELEMETRY_MANAGER.send_event(
            event_type="peft-config-load", properties=peft_config.to_dict()
        )

    # Apply freezing if specified
    if cfg.unfrozen_parameters:
        freeze_layers_except(model, cfg.unfrozen_parameters)
        if any(
            any(embed in param for embed in ["lm_head", "embed_tokens"])
            for param in cfg.unfrozen_parameters
        ):
            model.enable_input_require_grads()

    return model, tokenizer, peft_config, processor


def setup_reference_model(
    cfg: DictDefault, tokenizer: PreTrainedTokenizer
) -> PreTrainedModel | None:
    """
    Set up the reference model for RL training if needed.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        tokenizer: The tokenizer to use for the reference model.

    Returns:
        Reference model if needed for RL training, `None` otherwise.
    """
    model_ref = None
    if cfg.rl and cfg.rl != RLType.ORPO:
        if cfg.adapter and not cfg.rl_adapter_ref_model:
            # use built-in trl autounwrap
            LOG.debug("Passing model_ref: None to RL trainer")
            model_ref = None  # explicit setting to None
        else:
            reference_model: bool = True
            if cfg.rl == RLType.GRPO and cfg.trl.beta == 0:
                reference_model = False
            # load the model again for model_ref/baseline
            model_loader = ModelLoader(cfg, tokenizer, reference_model=reference_model)
            model_ref, _ = model_loader.load()
    return model_ref


def setup_signal_handler(cfg: DictDefault, model: PreTrainedModel):
    """
    Set up signal handler for graceful termination.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        model: The model to save on termination
    """
    # ray workers don't have access to this signal
    if cfg.local_rank == 0 and not cfg.use_ray:

        def terminate_handler(_, __, model_weakref):
            if model_weakref() is not None:
                _model = model_weakref()
                _model.save_pretrained(cfg.output_dir)

            cleanup_distributed()
            sys.exit(0)

        _model_weakref = weakref.ref(model)
        signal.signal(
            signal.SIGINT,
            lambda signum, frame: terminate_handler(signum, frame, _model_weakref),
        )


def execute_training(
    cfg: DictDefault, trainer: Any, resume_from_checkpoint: str | None
):
    """
    Execute the training process with appropriate SDP kernel configurations.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        trainer: The configured trainer object.
        resume_from_checkpoint: Path to checkpoint to resume from, if applicable.
    """
    with ExitStack() as stack:
        # Define the context managers to use
        if cfg.flash_optimum:
            stack.enter_context(
                torch.backends.cuda.sdp_kernel(
                    enable_flash=True,
                    enable_math=True,
                    enable_mem_efficient=True,
                )
            )

        if cfg.context_parallel_size > 1:
            models = [trainer.model]
            if hasattr(trainer, "ref_model") and trainer.ref_model:
                models.append(trainer.ref_model)

            stack.enter_context(
                SequenceParallelContextManager(
                    models=models,
                    context_parallel_size=cfg.context_parallel_size,
                    gradient_accumulation_steps=cfg.gradient_accumulation_steps,
                    ring_attn_func=cfg.ring_attn_func,
                    heads_k_stride=cfg.heads_k_stride,
                    gather_outputs=cfg.rl is RLType.GRPO,
                    device_mesh=trainer.accelerator.torch_device_mesh,
                )
            )

        # TODO: disabling for now as not compatible with FSDP2 + torchao low bit optimizers
        # if cfg.bf16:
        #     torch.set_default_dtype(torch.bfloat16)

        LOG.info("Starting trainer...")
        trainer.train(resume_from_checkpoint=resume_from_checkpoint)

        PLUGIN_MANAGER.post_train(cfg, trainer.model)


def save_trained_model(
    cfg: DictDefault,
    trainer: Any,
    model: PreTrainedModel,
):
    """
    Save the trained model according to configuration and training setup.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        trainer: The trainer object.
        model: The trained model to save.
    """
    LOG.info(f"Training completed! Saving trained model to {cfg.output_dir}.")

    # Post training module hooks
    for name, module in model.named_modules():
        if hasattr(module, "_post_training"):
            module._post_training(model, name)

    # handle QAT
    if cfg.qat:
        from axolotl.utils.quantization import convert_qat_model

        convert_qat_model(
            model,
            quantize_embedding=cfg.qat.quantize_embedding,
        )
        LOG.info(
            "QAT usage note: please ensure you quantize your model fine-tuned using QAT by running `axolotl quantize`"
            " with the same config which you used for training."
        )
    # Handle ReLoRA early return case
    if cfg.relora:
        if cfg.adapter == "lora" and not (cfg.load_in_4bit or cfg.load_in_8bit):
            model = model.merge_and_unload()
        else:
            # final model weights have already been saved by `ReLoRACallback.on_train_end`
            return

    if trainer.is_fsdp_enabled or cfg.fsdp_config:
        if cfg.fsdp_config or cfg.fsdp:
            if cfg.fsdp_config.final_state_dict_type:
                state_dict_type = cfg.fsdp_config.final_state_dict_type
            else:
                state_dict_type = cfg.fsdp_config.state_dict_type
            trainer.accelerator.state.fsdp_plugin.set_state_dict_type(state_dict_type)
        trainer.save_model(cfg.output_dir)  # only handles FULL_STATE_DICT
        if state_dict_type == "SHARDED_STATE_DICT":
            LOG.info(
                "The final model was saved with a sharded state dict. Please ensure you merge "
                "the sharded weights with `merge-sharded-fsdp-weights`."
            )
            checkpoint_dir = determine_last_checkpoint(cfg, update=False)
            if (
                not (Path(cfg.output_dir) / "model.safetensors.index.json").exists()
                and checkpoint_dir
            ):
                # import here to prevent circular import
                from axolotl.cli.merge_sharded_fsdp_weights import merge_fsdp_weights

                fsdp_dir = Path(checkpoint_dir) / "pytorch_model_fsdp_0"
                merged_path = str(Path(cfg.output_dir) / "merged")
                merge_fsdp_weights(
                    checkpoint_dir=str(fsdp_dir),
                    output_path=merged_path,
                )
                trainer.accelerator.wait_for_everyone()
                if trainer.accelerator.is_main_process:
                    # move all files in merged_path to cfg.output_dir
                    for merged_file in Path(merged_path).iterdir():
                        if (Path(cfg.output_dir) / merged_file.name).exists():
                            (Path(cfg.output_dir) / merged_file.name).unlink()
                        shutil.move(str(merged_file), cfg.output_dir)
                    shutil.rmtree(merged_path)  # remove what should be an empty dir
        # TODO(wing):see https://github.com/huggingface/transformers/pull/40207
        # cleanup the FSDP prefix in the model config.json
        if trainer.accelerator.is_main_process:
            with open(
                Path(cfg.output_dir) / "config.json", "r", encoding="utf-8"
            ) as config_file_io:
                # read the model config as an OrderedDict
                config = json.load(config_file_io, object_pairs_hook=OrderedDict)
                config["architectures"] = [
                    name.lstrip("FSDP") for name in config["architectures"]
                ]
            # write the updated model config back
            with open(
                os.path.join(cfg.output_dir, "config.json"), "w", encoding="utf-8"
            ) as config_file_io:
                json.dump(config, config_file_io, indent=2)
    elif cfg.deepspeed and is_deepspeed_zero3_enabled():
        # Copied over from: https://github.com/huggingface/accelerate/blob/5ae611118057232f441055f7ef9ba0b0f2b8d533/docs/source/usage_guides/deepspeed.md#saving-and-loading
        trainer.accelerator.wait_for_everyone()
        trainer.save_model(cfg.output_dir)

        # the trainer saved a model.safetensors file in the output directory,
        # but it is most likely a proxy model and if so, should be deleted
        maybe_proxy = os.path.exists(os.path.join(cfg.output_dir, "model.safetensors"))
        maybe_sharded = os.path.exists(
            os.path.join(cfg.output_dir, "model.safetensors.index.json")
        )

        if maybe_proxy and maybe_sharded:
            LOG.info(f"Deleting {os.path.join(cfg.output_dir, 'model.safetensors')}")
            LOG.info("This is a proxy model and should be deleted")
            try:
                os.remove(os.path.join(cfg.output_dir, "model.safetensors"))
            except FileNotFoundError:
                pass
    elif cfg.local_rank == 0:
        if cfg.rl and cfg.adapter and not cfg.rl_adapter_ref_model:
            trainer.model.save_pretrained(cfg.output_dir)

        model.save_pretrained(cfg.output_dir)

    if hasattr(cfg, "llmcompressor") and cfg.llmcompressor:
        # TODO: add integration support so this can be implemented completely within the plugin
        from axolotl.integrations.llm_compressor.utils import save_compressed_model

        save_compressed_model(
            model=model,
            output_dir=cfg.output_dir,
            trainer=trainer,
            save_compressed=cfg.llmcompressor.save_compressed,
        )

    LOG.info(f"Model successfully saved to {cfg.output_dir}")


def create_model_card(cfg: DictDefault, trainer: Trainer):
    """
    Create a model card for the trained model if needed.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        trainer: The trainer object with model card creation capabilities.
    """
    if not cfg.hub_model_id:
        # Guard since create_model_card may fail if dataset_tags is empty list
        try:
            model_card_kwarg = {
                "model_name": cfg.output_dir.lstrip("./")
                .encode("utf-8")
                .decode("utf-8")
            }

            # We check if we're using a TRL trainer; if so, `dataset_tags` is not consumed.
            rl = cfg.rl is not None or cfg.reward_model or cfg.process_reward_model
            if cfg.datasets is not None and not rl:
                dataset_tags = [
                    d["path"] for d in cfg.datasets if not Path(d["path"]).is_dir()
                ]
                dataset_tags = [d for d in dataset_tags if not d.startswith("https://")]

                if dataset_tags:
                    model_card_kwarg["dataset_tags"] = dataset_tags

            trainer.create_model_card(**model_card_kwarg)
        except (AttributeError, UnicodeDecodeError, OfflineModeIsEnabled):
            pass
    elif cfg.hub_model_id:
        # Defensively push to the hub to ensure the model card is updated
        trainer.push_to_hub()


def save_initial_configs(
    cfg: DictDefault,
    tokenizer: PreTrainedTokenizer,
    model: PreTrainedModel,
    peft_config: PeftConfig | None,
    processor: ProcessorMixin | None,
):
    """
    Save initial configurations before training.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        tokenizer: The tokenizer to save.
        model: The model to save configuration for.
        peft_config: The PEFT configuration to save if applicable.
    """
    # Create output_dir if it doesn't already exist
    output_dir = Path(cfg.output_dir)
    if not output_dir.is_dir():
        os.makedirs(cfg.output_dir, exist_ok=True)

    # Pre-save adapter config so it's available to inspect
    if peft_config:
        LOG.info(f"Pre-saving adapter config to {cfg.output_dir}...")
        peft_config.save_pretrained(cfg.output_dir)

    # Pre-save the tokenizer and model configs
    LOG.info(f"Pre-saving tokenizer to {cfg.output_dir}...")
    tokenizer.save_pretrained(
        str(Path(cfg.output_dir)), save_jinja_files=cfg.tokenizer_save_jinja_files
    )
    if hasattr(model, "config"):
        LOG.info(f"Pre-saving model config to {cfg.output_dir}...")
        model.config.save_pretrained(str(output_dir))

    if processor:
        LOG.info(f"Pre-saving processor to {cfg.output_dir}...")
        processor.save_pretrained(str(output_dir))


def setup_model_card(cfg: DictDefault):
    """
    Set up the Axolotl badge and add the Axolotl config to the model card if available.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
    """
    badge_markdown = """[<img src="https://raw.githubusercontent.com/axolotl-ai-cloud/axolotl/main/image/axolotl-badge-web.png" alt="Built with Axolotl" width="200" height="32"/>](https://github.com/axolotl-ai-cloud/axolotl)"""
    transformers.modelcard.AUTOGENERATED_TRAINER_COMMENT += f"\n{badge_markdown}"

    if cfg.axolotl_config_path:
        raw_axolotl_cfg = Path(cfg.axolotl_config_path)
        version = importlib.metadata.version("axolotl")
        if raw_axolotl_cfg.is_file():
            transformers.modelcard.AUTOGENERATED_TRAINER_COMMENT += f"\n<details><summary>See axolotl config</summary>\n\naxolotl version: `{version}`\n```yaml\n{raw_axolotl_cfg.read_text(encoding='utf-8')}\n```\n\n</details><br>\n"


def handle_untrained_tokens_fix(
    cfg: DictDefault,
    model: PreTrainedModel,
    tokenizer: PreTrainedTokenizer,
    train_dataset: Dataset,
):
    """
    Apply fixes for untrained tokens if configured.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        model: The model to apply fixes to.
        tokenizer: The tokenizer for token identification.
        train_dataset: The training dataset to use.
    """
    if not cfg.fix_untrained_tokens:
        return

    is_ds_zero3: bool = False
    if os.environ.get("ACCELERATE_DEEPSPEED_ZERO_STAGE") == "3":
        is_ds_zero3 = True

    # Check if the `token_ids_to_fix` kwarg exists in the fix_untrained_tokens args
    sig = inspect.signature(fix_untrained_tokens)

    fix_kwargs: Dict[str, Any] = {}
    # If the function has the `token_ids_to_fix` arg, and fix_untrained_tokens is a list
    if "token_ids_to_fix" in sig.parameters and isinstance(
        cfg.fix_untrained_tokens, list
    ):
        fix_kwargs["token_ids_to_fix"] = cfg.fix_untrained_tokens
    if "is_ds_zero3" in sig.parameters:
        fix_kwargs["is_ds_zero3"] = is_ds_zero3

    fix_untrained_tokens(model, tokenizer, train_dataset, **fix_kwargs)

    if cfg.local_rank == 0:
        model.save_pretrained(str(Path(cfg.output_dir)))


def setup_model_and_trainer(
    cfg: DictDefault, dataset_meta: TrainDatasetMeta
) -> tuple[
    "HFRLTrainerBuilder" | "HFCausalTrainerBuilder",
    PeftModel | PreTrainedModel,
    PreTrainedTokenizer,
    PeftConfig | None,
    ProcessorMixin | None,
]:
    """
    Load model, tokenizer, trainer, etc. Helper function to encapsulate the full
    trainer setup.

    Args:
        cfg: The configuration dictionary with training parameters.
        dataset_meta: Object with training, validation datasets and metadata.

    Returns:
        Tuple of:
            - Trainer (Causal or RLHF)
            - Model
            - Tokenizer
            - PEFT config
            - Processor
    """
    # Load tokenizer, processor and model
    model, tokenizer, peft_config, processor = setup_model_and_tokenizer(cfg)

    # Set up reference model for RL if needed
    model_ref = setup_reference_model(cfg, tokenizer)

    # Get datasets from metadata
    train_dataset = dataset_meta.train_dataset
    eval_dataset = dataset_meta.eval_dataset
    total_num_steps = dataset_meta.total_num_steps

    # Set up trainer
    trainer = setup_trainer(
        cfg=cfg,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        model=model,
        tokenizer=tokenizer,
        processor=processor,
        total_num_steps=total_num_steps,
        model_ref=model_ref,
        peft_config=peft_config,
    )
    PLUGIN_MANAGER.post_trainer_create(cfg, trainer)

    if cfg.use_ray:
        try:
            import ray.train.huggingface.transformers

            trainer = ray.train.huggingface.transformers.prepare_trainer(trainer)
        except ImportError:
            LOG.warning(
                "The Ray integration with Hugging Face Transformers is not available. "
                "To use Ray, install the 'ray[train]' package."
            )

    return (
        trainer,
        model,
        tokenizer,
        peft_config,
        processor,
    )


@send_errors
def train(
    cfg: DictDefault, dataset_meta: TrainDatasetMeta
) -> tuple[PeftModel | PreTrainedModel, PreTrainedTokenizer, Trainer]:
    """
    Train a model on the given dataset.

    Args:
        cfg: The configuration dictionary with training parameters
        dataset_meta: Object with training, validation datasets and metadata

    Returns:
        Tuple of (model, tokenizer) after training
    """
    # Setup model, tokenizer, (causal or RLHF) trainer, etc.
    (
        trainer,
        model,
        tokenizer,
        peft_config,
        processor,
    ) = setup_model_and_trainer(cfg, dataset_meta)

    # Handle untrained tokens if configured
    train_dataset = dataset_meta.train_dataset
    handle_untrained_tokens_fix(cfg, model, tokenizer, train_dataset)

    # Additional setup
    save_initial_configs(cfg, tokenizer, model, peft_config, processor)
    setup_signal_handler(cfg, model)
    setup_model_card(cfg)

    # Execute the training
    resume_from_checkpoint = determine_last_checkpoint(cfg)
    execute_training(cfg, trainer, resume_from_checkpoint)

    # clear cache
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    # Save the trained model and cleanup
    save_trained_model(cfg, trainer, model)
    tokenizer.save_pretrained(
        str(Path(cfg.output_dir)), save_jinja_files=cfg.tokenizer_save_jinja_files
    )
    create_model_card(cfg, trainer)
    if not cfg.use_ray:
        cleanup_distributed()
    PLUGIN_MANAGER.post_train(cfg, model)

    return model, tokenizer, trainer


================================================
FILE: src/axolotl/utils/__init__.py
================================================
"""
Basic utils for Axolotl
"""

import importlib.util
import os
import re

import torch


def is_mlflow_available():
    return importlib.util.find_spec("mlflow") is not None


def is_comet_available():
    return importlib.util.find_spec("comet_ml") is not None


def is_opentelemetry_available():
    return (
        importlib.util.find_spec("opentelemetry") is not None
        and importlib.util.find_spec("prometheus_client") is not None
    )


def is_trackio_available():
    return importlib.util.find_spec("trackio") is not None


def get_pytorch_version() -> tuple[int, int, int]:
    """
    Get Pytorch version as a tuple of (major, minor, patch).
    """
    torch_version = torch.__version__
    version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version)

    if not version_match:
        raise ValueError("Invalid version format")

    major, minor, patch = version_match.groups()
    major, minor = int(major), int(minor)
    patch = int(patch) if patch is not None else 0  # Default patch to 0 if not present
    return major, minor, patch


def set_pytorch_cuda_alloc_conf():
    """Set up CUDA allocation config"""
    torch_version = torch.__version__.split(".")
    torch_major, torch_minor = int(torch_version[0]), int(torch_version[1])
    config_value = "expandable_segments:True,roundup_power2_divisions:16"
    if (
        torch_major == 2
        and torch_minor >= 9
        and os.getenv("PYTORCH_ALLOC_CONF") is None
    ):
        os.environ["PYTORCH_ALLOC_CONF"] = config_value
    elif (
        torch_major == 2
        and torch_minor >= 2
        and os.getenv("PYTORCH_CUDA_ALLOC_CONF") is None
    ):
        os.environ["PYTORCH_CUDA_ALLOC_CONF"] = config_value


def set_misc_env():
    if os.getenv("XFORMERS_IGNORE_FLASH_VERSION_CHECK") is None:
        os.environ["XFORMERS_IGNORE_FLASH_VERSION_CHECK"] = "1"


def get_not_null(value, default=None):
    """
    return the value if it's not None, otherwise return the default value
    """
    return value if value is not None else default


================================================
FILE: src/axolotl/utils/bench.py
================================================
"""Benchmarking and measurement utilities"""

import functools
import logging

import torch
from transformers.utils.import_utils import is_torch_npu_available

from axolotl.utils.distributed import get_device_type

try:
    from pynvml import (
        NVMLError,
        nvmlDeviceGetHandleByIndex,
        nvmlDeviceGetMemoryInfo,
        nvmlInit,
    )
except ImportError:
    NVMLError = None
    nvmlDeviceGetHandleByIndex = None
    nvmlDeviceGetMemoryInfo = None
    nvmlInit = None


def check_cuda_device(default_value):
    """
    wraps a function and returns the default value instead of running the
    wrapped function if cuda isn't available or the device is auto
    :param default_value:
    :return:
    """

    def deco(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            device = kwargs.get("device", args[0] if args else None)

            if (
                device is None
                or not torch.cuda.is_available()
                or device == "auto"
                or torch.device(device).type == "cpu"
                or torch.device(device).type == "meta"
            ):
                return default_value
            return func(*args, **kwargs)

        return wrapper

    return deco


@check_cuda_device(0.0)
def gpu_memory_usage(device=0):
    return torch.cuda.memory_allocated(device) / 1024.0**3


@check_cuda_device((0.0, 0.0, 0.0))
def gpu_memory_usage_all(device=0):
    active = torch.cuda.memory_stats().get("active_bytes.all.peak", 0) / 1024.0**3
    allocated = torch.cuda.max_memory_allocated(device) / 1024.0**3
    reserved = torch.cuda.max_memory_reserved(device) / 1024.0**3
    torch.cuda.reset_peak_memory_stats(device)
    return active, allocated, reserved


def mps_memory_usage_all():
    active = torch.mps.current_allocated_memory() / 1024.0**3
    allocated = torch.mps.driver_allocated_memory() / 1024.0**3
    return active, allocated, 0


def npu_memory_usage_all(device=0):
    usage = torch.npu.memory_allocated(device) / 1024.0**3
    reserved = torch.npu.memory_reserved(device) / 1024.0**3
    return usage, reserved - usage, 0


@check_cuda_device(0.0)
def gpu_memory_usage_smi(device=0):
    if isinstance(device, torch.device):
        device = device.index
    if isinstance(device, str) and device.startswith("cuda:"):
        device = int(device[5:])
    if not nvmlInit:
        return 0.0
    try:
        nvmlInit()
        handle = nvmlDeviceGetHandleByIndex(device)
        info = nvmlDeviceGetMemoryInfo(handle)
        return info.used / 1024.0**3
    except NVMLError:
        return 0.0


def get_gpu_memory_usage(device: int | torch.device = 0):
    cur_device_type = str(get_device_type())
    if torch.backends.mps.is_available():
        usage, cache, misc = mps_memory_usage_all()
    elif "npu" in cur_device_type and is_torch_npu_available():
        usage, cache, misc = npu_memory_usage_all(device)
    elif "cuda" in cur_device_type and torch.cuda.is_available():
        usage, cache, misc = gpu_memory_usage_all(device)
    else:
        return 0.0, 0.0, 0.0

    return usage, cache, misc


def log_gpu_memory_usage(
    log: logging.Logger | logging.LoggerAdapter,
    msg: str = "",
    device: int | torch.device = 0,
):
    try:
        active, allocated, reserved = get_gpu_memory_usage(device)
    except ValueError:
        # likely CPU, ignore
        return
    cur_device_type = str(get_device_type())
    extras = []
    if allocated > 0:
        extras.append(f"+{allocated:.03f}GB allocated")
    if reserved > 0:
        extras.append(f"+{reserved:.03f}GB reserved")
    msg = f"{cur_device_type} memory active:" if not msg else msg
    log.debug(
        f"{msg} {active:.03f}GB ({', '.join(extras)})",
        stacklevel=2,
    )


================================================
FILE: src/axolotl/utils/callbacks/__init__.py
================================================
"""Callbacks for Trainer class"""

from __future__ import annotations

import gc
import json
import os
import traceback
from shutil import copyfile
from tempfile import NamedTemporaryFile
from typing import TYPE_CHECKING, Any, Dict, List

import evaluate
import numpy as np
import pandas as pd
import torch
import torch.distributed as dist
import wandb
import yaml
from datasets import load_dataset
from tqdm import tqdm
from transformers import (
    GenerationConfig,
    Trainer,
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)
from transformers.trainer_utils import (
    SaveStrategy,
)
from trl.models import unwrap_model_for_generation

from axolotl.utils import is_comet_available, is_mlflow_available
from axolotl.utils.callbacks.perplexity import Perplexity
from axolotl.utils.distributed import (
    barrier,
    broadcast_dict,
    gather_scalar_from_all_ranks,
    get_world_size,
    is_distributed,
    is_main_process,
    zero_first,
)
from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.config import AxolotlInputConfig

if TYPE_CHECKING:
    from axolotl.core.training_args import AxolotlTrainingArguments


IGNORE_INDEX = -100
LOG = get_logger(__name__)


class LossWatchDogCallback(TrainerCallback):
    """Callback to track loss and stop training if loss is too high"""

    def __init__(self, cfg):
        self.cfg = cfg
        self.violations = 0
        self.threshold = cfg.loss_watchdog_threshold
        self.patience = cfg.loss_watchdog_patience or 3

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **_kwargs,
    ) -> TrainerControl:
        if len(state.log_history) > 0 and "loss" in state.log_history[-1]:
            if state.log_history[-1]["loss"] > self.threshold:
                self.violations += 1
                if self.violations >= self.patience:
                    LOG.warning(
                        "Loss is too high, stopping training (loss_watchdog_threshold)"
                    )
                    control.should_training_stop = True
            else:
                self.violations = 0
        return control


class SaveModelOnFirstStepCallback(TrainerCallback):
    """Callback to save the model on the first step of training if enabled"""

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **_kwargs,
    ) -> TrainerControl:
        if state.global_step == 1:
            control.should_save = True
        return control


def bench_eval_callback_factory(trainer, tokenizer):
    accuracy = evaluate.load("accuracy")
    abcd_idx = [
        tokenizer("A", add_special_tokens=False).input_ids[0],
        tokenizer("B", add_special_tokens=False).input_ids[0],
        tokenizer("C", add_special_tokens=False).input_ids[0],
        tokenizer("D", add_special_tokens=False).input_ids[0],
        tokenizer("E", add_special_tokens=False).input_ids[0],
        tokenizer("F", add_special_tokens=False).input_ids[0],
        tokenizer("G", add_special_tokens=False).input_ids[0],
    ]
    bench_split = "eval"

    def transform_bench_subject(example):
        # Split on ':' and trim whitespace
        parts = example["subject"].split(":")
        first_part = (
            parts[0].strip().lower().replace("-", "_")
        )  # Lowercase the first part
        second_part = (
            parts[1].strip().replace("-", "_") if len(parts) > 1 else "all"
        )  # Replace hyphens with underscores

        # Return the transformed values
        return {"name": first_part, "subject": second_part}

    if trainer.args.bench_dataset == "mmlu-zs":
        bench_dataset = load_dataset(
            "openaccess-ai-collective/mmlu-evals",
            data_files={
                "eval": "zero_shot_mmlu_val.json",
                "test": "zero_shot_mmlu_test.json",
            },
        )
        # bench_dataset = bench_dataset.remove_columns("subject")
    # MMLU Five-shot (Eval/Test only)
    elif trainer.args.bench_dataset in ["mmlu", "mmlu-fs"]:
        bench_dataset = load_dataset(
            "openaccess-ai-collective/mmlu-evals",
            data_files={
                "eval": "five_shot_mmlu_val.json",
                "test": "five_shot_mmlu_test.json",
            },
        )
        # bench_dataset = bench_dataset.remove_columns('subject')
    elif "/" in trainer.args.bench_dataset:
        bench_ds = trainer.args.bench_dataset
        bench_ds_name = "/".join(bench_ds.split("/", 2)[:2])
        bench_ds_data_file = "/".join(bench_ds.split("/", 2)[2:])
        bench_dataset = load_dataset(
            bench_ds_name,
            data_files={
                "eval": bench_ds_data_file,
            },
        )
        bench_dataset["eval"] = bench_dataset["eval"].map(transform_bench_subject)
    else:
        raise ValueError(
            f"unhandled value `{trainer.args.bench_dataset}` for bench_dataset training args"
        )
    bench_dataset = bench_dataset[trainer.args.bench_split]
    if trainer.args.max_bench_samples is not None:
        bench_dataset = bench_dataset.select(range(trainer.args.max_bench_samples))

    def tokenize_evals(example):
        source = f"{tokenizer.bos_token}{example['input']}"
        target = f"{example['output']}{tokenizer.eos_token}"

        tokenized_source = tokenizer(
            source,
            max_length=2048,
            truncation=True,
            add_special_tokens=False,
        )
        tokenized_target = tokenizer(
            target,
            max_length=2048,
            truncation=True,
            add_special_tokens=False,
        )
        input_ids = tokenized_source["input_ids"] + tokenized_target["input_ids"]
        labels = [IGNORE_INDEX] * len(tokenized_source["input_ids"]) + tokenized_target[
            "input_ids"
        ]

        return {
            "input_ids": input_ids,
            "labels": labels,
            "subject": example["subject"],
        }

    with zero_first(is_main_process()):
        bench_dataset = bench_dataset.map(tokenize_evals)
        bench_dataset = bench_dataset.filter(lambda x: x["labels"][-2] in abcd_idx)

    class BenchEvalCallback(TrainerCallback):
        """
        TrainerCallback that runs the MMLU evals
        """

        def on_evaluate(
            self,
            args: AxolotlTrainingArguments,
            state: TrainerState,
            control: TrainerControl,
            metrics: Dict[str, float],
            **kwargs,
        ):
            data_loader = trainer.get_bench_dataloader(
                bench_dataset.remove_columns(["input", "subject", "output", "name"])
            )
            trainer.model.eval()
            preds, refs = [], []
            loss_bench = 0
            for batch in tqdm(data_loader, total=len(data_loader)):
                (loss, logits, labels) = trainer.prediction_step(
                    trainer.model,
                    batch,
                    prediction_loss_only=False,
                )
                # There are two tokens, the output, and eos token.
                for i, logit in enumerate(logits):
                    label_non_zero_id = (batch["labels"][i] != IGNORE_INDEX).nonzero()[
                        0
                    ][0]
                    logit_abcd = logit[label_non_zero_id - 1][abcd_idx]
                    preds.append(torch.argmax(logit_abcd).item())
                labels = labels[labels != IGNORE_INDEX].view(-1, 2)[:, 0]
                refs += [
                    abcd_idx.index(label) if label in abcd_idx else -1
                    for label in labels.tolist()
                ]
                loss_bench += loss.item()
            # Extract results by subject.
            bench_name = bench_dataset["name"]
            bench_names: dict = {s: {"refs": [], "preds": []} for s in set(bench_name)}
            for s, p, r in zip(bench_name, preds, refs, strict=False):
                bench_names[s]["preds"].append(p)
                bench_names[s]["refs"].append(r)
            barrier()
            local_bench_names = bench_names
            gathered_bench_names: List[Dict] = [{} for _ in range(get_world_size())]
            # Gather results from all GPUs to GPU 0

            loss_bench_ranks = gather_scalar_from_all_ranks(
                lambda: loss_bench, get_world_size()
            )
            len_data_loader_ranks = gather_scalar_from_all_ranks(
                lambda: len(data_loader), get_world_size()
            )

            results = {}
            if is_distributed() and not is_main_process():
                dist.gather_object(local_bench_names, dst=0)
            else:
                if is_distributed():
                    dist.gather_object(local_bench_names, gathered_bench_names, dst=0)
                else:
                    gathered_bench_names = [local_bench_names]
                bench_loss = sum(loss_bench_ranks) / sum(len_data_loader_ranks)
                results = {f"{bench_split}_bench_loss": bench_loss}

                # Combine results from all GPUs
                combined_bench_names: Dict[str, Dict[str, List]] = {}
                for bench_name in gathered_bench_names:
                    for name, data in bench_name.items():
                        if name not in combined_bench_names:
                            combined_bench_names[name] = {"refs": [], "preds": []}
                        combined_bench_names[name]["refs"].extend(data["refs"])
                        combined_bench_names[name]["preds"].extend(data["preds"])

                bench_scores = []
                bench_refs = []
                bench_preds = []
                for bench_name in combined_bench_names:
                    bench_score = accuracy.compute(
                        references=combined_bench_names[bench_name]["refs"],
                        predictions=combined_bench_names[bench_name]["preds"],
                    )["accuracy"]
                    bench_refs.extend(combined_bench_names[bench_name]["refs"])
                    bench_preds.extend(combined_bench_names[bench_name]["preds"])
                    if not pd.isna(bench_score):
                        results[f"{bench_split}_bench_accuracy_{bench_name}"] = (
                            bench_score
                        )
                        bench_scores.append(bench_score)
                    else:
                        results[f"{bench_split}_bench_accuracy_{bench_name}"] = 0.0
                        bench_scores.append(0.0)
                results[f"{bench_split}_bench_average_accuracy"] = np.mean(bench_scores)
                results[f"{bench_split}_bench_total_accuracy"] = accuracy.compute(
                    references=bench_refs, predictions=bench_preds
                )["accuracy"]
                trainer.log(results)

            results = broadcast_dict(results)
            for key, val in results.items():
                metrics[key] = val

    return BenchEvalCallback


def causal_lm_bench_eval_callback_factory(trainer: Trainer, tokenizer):
    class CausalLMBenchEvalCallback(TrainerCallback):
        """Callback to log prediction values during each evaluation"""

        def __init__(self, cfg):
            self.cfg = cfg
            self.logged = False
            self.metrics = self.__maybe_load_metrics()

        def __maybe_load_metrics(self):
            metrics = {}
            for metric in self.cfg.eval_causal_lm_metrics:
                if metric == "perplexity":
                    max_seq_len = self.cfg.eval_max_new_tokens
                    metrics[metric] = Perplexity(
                        tokenizer=tokenizer,
                        max_seq_len=max_seq_len,
                    )
                else:
                    try:
                        metrics[metric] = evaluate.load(metric)
                    except Exception as exc:
                        LOG.warning(f"{metric}: {exc.args}")
            return metrics

        def on_evaluate(
            self,
            args: AxolotlTrainingArguments,
            state: TrainerState,
            control: TrainerControl,
            train_dataloader,
            eval_dataloader,
            **kwargs,
        ):
            trainer.model_wrapped.eval()

            device = torch.device(
                self.cfg.device
            )  # Use this instead of trainer.model_wrapped.device as it may return cpu if fsdp offloaded

            generation_config = GenerationConfig(
                max_new_tokens=self.cfg.eval_max_new_tokens,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False,
                use_cache=True,
                return_dict_in_generate=True,
                output_attentions=False,
                output_hidden_states=False,
                output_scores=False,
            )

            def find_ranges(lst):
                ranges = []
                start = 0
                for i in range(1, len(lst)):
                    if lst[i] == 0:
                        ranges.append((start, i - 1))
                        start = i
                end = len(lst) - 1
                ranges.append((start, end))
                return ranges

            def compute(metric: evaluate.Metric, **kwargs):
                # safely compute a metric and return the score if the format is correct
                metric_score = None
                try:
                    # Only pass the kwargs that are in the metric's feature list
                    metric_kwargs = {
                        k: kwargs[k] for k in metric._feature_names() if k in kwargs
                    }

                    if isinstance(metric, Perplexity):
                        metric_kwargs["model"] = trainer.model_wrapped

                    metric_score = metric.compute(**metric_kwargs)
                    return (
                        metric_score["score"]
                        if "score" in metric_score
                        else metric_score["mean_score"]
                    )
                except Exception:
                    traceback.print_exc()
                    LOG.debug(
                        f"Failed to compute metric {metric.name} with kwargs {kwargs.keys()}"
                    )
                return metric_score

            def evaluate_preds(sources, predictions, references):
                scores = {}

                for metric_name, metric in self.metrics.items():
                    score = compute(
                        metric,
                        references=references,
                        predictions=predictions,
                        sources=sources,
                    )
                    if score is None:
                        score = compute(
                            metric,
                            references=[[r] for r in references],
                            predictions=predictions,
                        )
                    scores["eval_" + metric_name] = score
                return scores

            def predict_with_generate():
                eval_src, eval_pred, eval_ref = [], [], []

                with unwrap_model_for_generation(
                    trainer.model_wrapped, trainer.accelerator
                ) as unwrapped_model:
                    for batch in tqdm(eval_dataloader, disable=not is_main_process()):
                        batch_labels = batch["labels"].to(device)
                        batch_input_ids = batch["input_ids"].to(device)

                        if "position_ids" in batch:
                            batch_pos_ids = batch["position_ids"].tolist()
                        else:
                            batch_pos_ids = [None] * len(batch["input_ids"])

                        prompt_token_ids_list = []
                        completion_token_ids_list = []

                        for input_ids_all, labels_all, pos_ids in zip(
                            batch_input_ids,
                            batch_labels,
                            batch_pos_ids,
                            strict=False,
                        ):
                            if pos_ids is None:
                                pos_ranges = [(0, len(input_ids_all) - 1)]
                            else:
                                pos_ranges = find_ranges(pos_ids)

                            for pos_range in pos_ranges:
                                start, end = pos_range
                                if start == end:
                                    continue

                                input_ids = input_ids_all[start : end + 1]
                                labels = labels_all[start : end + 1]

                                tokens_without_loss = labels == IGNORE_INDEX
                                tokens_with_loss = labels != IGNORE_INDEX
                                tokens_exclude_padding = (
                                    input_ids != tokenizer.pad_token_id
                                )
                                prompt_token_includes = (
                                    tokens_without_loss & tokens_exclude_padding
                                )

                                prompt_token_ids = input_ids[prompt_token_includes]
                                prompt_token_ids_list.append(prompt_token_ids)

                                completion_token_ids = input_ids[tokens_with_loss]
                                completion_token_ids_list.append(completion_token_ids)

                        prompt_texts = tokenizer.batch_decode(
                            prompt_token_ids_list, skip_special_tokens=True
                        )
                        completion_texts = tokenizer.batch_decode(
                            completion_token_ids_list, skip_special_tokens=True
                        )

                        with torch.no_grad():
                            prompt_encoding = tokenizer(
                                prompt_texts, padding=True, return_tensors="pt"
                            ).to(device)

                            predictions = unwrapped_model.generate(
                                **prompt_encoding, generation_config=generation_config
                            )

                            del prompt_encoding

                        prediction_all_tokens = predictions["sequences"].cpu().tolist()
                        prediction_without_prompt_tokens_list = []
                        for prompt_token_ids, prediction_tokens in zip(
                            prompt_token_ids_list, prediction_all_tokens, strict=False
                        ):
                            prediction_without_prompt_tokens = prediction_tokens[
                                len(prompt_token_ids) :
                            ]
                            prediction_without_prompt_tokens_list.append(
                                prediction_without_prompt_tokens
                            )

                        predicted_texts = tokenizer.batch_decode(
                            prediction_without_prompt_tokens_list,
                            skip_special_tokens=True,
                        )

                        eval_src.extend(prompt_texts)
                        eval_pred.extend(predicted_texts)
                        eval_ref.extend(completion_texts)

                return eval_src, eval_pred, eval_ref

            eval_preds = predict_with_generate()
            trainer.log(evaluate_preds(*eval_preds))

            return control

    return CausalLMBenchEvalCallback


def log_prediction_callback_factory(trainer: Trainer, tokenizer, logger: str):
    class LogPredictionCallback(TrainerCallback):
        """Callback to log prediction values during each evaluation"""

        def __init__(self, cfg):
            self.cfg = cfg
            self.logged = False

        def on_evaluate(
            self,
            args: AxolotlTrainingArguments,
            state: TrainerState,
            control: TrainerControl,
            train_dataloader,
            eval_dataloader,
            **kwargs,
        ):
            eval_table_size = self.cfg.eval_table_size

            if eval_table_size <= 0:
                return control

            trainer.model.eval()
            device = torch.device(self.cfg.device)

            generation_config = GenerationConfig(
                max_new_tokens=self.cfg.eval_max_new_tokens,
                bos_token_id=tokenizer.bos_token_id,
                eos_token_id=tokenizer.eos_token_id,
                pad_token_id=tokenizer.pad_token_id,
                do_sample=False,
                use_cache=True,
                return_dict_in_generate=True,
                output_attentions=False,
                output_hidden_states=False,
                output_scores=False,
            )

            def logits_to_tokens(logits) -> torch.Tensor:
                probabilities = torch.softmax(logits, dim=-1)
                # Get the predicted token ids (the ones with the highest probability)
                predicted_token_ids = torch.argmax(probabilities, dim=-1)
                return predicted_token_ids

            def find_ranges(lst):
                ranges = []
                start = 0
                for i in range(1, len(lst)):
                    if lst[i] == 0:
                        ranges.append((start, i - 1))
                        start = i
                end = len(lst) - 1
                ranges.append((start, end))
                return ranges

            def log_table_from_dataloader(name: str, table_dataloader):
                table_data: Dict[str, List[Any]] = {
                    "id": [],
                    "Prompt": [],
                    "Correct Completion": [],
                    "Predicted Completion (model.generate)": [],
                    "Predicted Completion (trainer.prediction_step)": [],
                }
                row_index = 0

                for batch in tqdm(table_dataloader):
                    if row_index > eval_table_size:
                        break

                    batch_labels = batch["labels"].to(device)
                    batch_input_ids = batch["input_ids"].to(device)

                    if "position_ids" in batch:
                        batch_pos_ids = batch["position_ids"].tolist()
                    else:
                        batch_pos_ids = [None] * len(batch["input_ids"])

                    (_, batch_logits, _) = trainer.prediction_step(
                        trainer.model,
                        batch,
                        prediction_loss_only=False,
                    )

                    prompt_token_ids_list = []
                    pred_step_token_ids_list = []
                    completion_token_ids_list = []

                    for input_ids_all, labels_all, pos_ids, logits in zip(
                        batch_input_ids,
                        batch_labels,
                        batch_pos_ids,
                        batch_logits,
                        strict=False,
                    ):
                        if pos_ids is None:
                            pos_ranges = [(0, len(input_ids_all) - 1)]
                        else:
                            pos_ranges = find_ranges(pos_ids)

                        for pos_range in pos_ranges:
                            start, end = pos_range
                            if start == end:
                                continue

                            input_ids = input_ids_all[start : end + 1]
                            labels = labels_all[start : end + 1]

                            tokens_without_loss = labels == IGNORE_INDEX
                            tokens_with_loss = labels != IGNORE_INDEX
                            tokens_exclude_padding = input_ids != tokenizer.pad_token_id
                            prompt_token_includes = (
                                tokens_without_loss & tokens_exclude_padding
                            )

                            prompt_token_ids = input_ids[prompt_token_includes]
                            prompt_token_ids_list.append(prompt_token_ids)

                            completion_token_ids = input_ids[tokens_with_loss]
                            completion_token_ids_list.append(completion_token_ids)

                            pred_step_token_ids = logits_to_tokens(
                                logits[start : end + 1]
                            )[tokens_with_loss]
                            pred_step_token_ids_list.append(pred_step_token_ids)

                    prompt_texts = tokenizer.batch_decode(
                        prompt_token_ids_list, skip_special_tokens=True
                    )
                    completion_texts = tokenizer.batch_decode(
                        completion_token_ids_list, skip_special_tokens=True
                    )
                    pred_step_texts = tokenizer.batch_decode(
                        pred_step_token_ids_list, skip_special_tokens=True
                    )

                    with torch.no_grad():
                        prompt_encoding = tokenizer(
                            prompt_texts, padding=True, return_tensors="pt"
                        ).to(self.cfg.device)
                        predictions = trainer.model.generate(
                            **prompt_encoding, generation_config=generation_config
                        )

                    prediction_all_tokens = predictions["sequences"].cpu().tolist()
                    prediction_without_prompt_tokens_list = []
                    for prompt_token_ids, prediction_tokens in zip(
                        prompt_token_ids_list, prediction_all_tokens, strict=False
                    ):
                        prediction_without_prompt_tokens = prediction_tokens[
                            len(prompt_token_ids) :
                        ]
                        prediction_without_prompt_tokens_list.append(
                            prediction_without_prompt_tokens
                        )

                    predicted_texts = tokenizer.batch_decode(
                        prediction_without_prompt_tokens_list, skip_special_tokens=True
                    )

                    for (
                        prompt_text,
                        completion_text,
                        prediction_text,
                        pred_step_text,
                    ) in zip(
                        prompt_texts,
                        completion_texts,
                        predicted_texts,
                        pred_step_texts,
                        strict=False,
                    ):
                        table_data["id"].append(row_index)
                        table_data["Prompt"].append(prompt_text)
                        table_data["Correct Completion"].append(completion_text)
                        table_data["Predicted Completion (model.generate)"].append(
                            prediction_text
                        )
                        table_data[
                            "Predicted Completion (trainer.prediction_step)"
                        ].append(pred_step_text)
                        row_index += 1
                if logger == "wandb":
                    # type: ignore[attr-defined]
                    wandb.run.log(
                        {
                            f"{name} - Predictions vs Ground Truth": pd.DataFrame(
                                table_data
                            )
                        }
                    )
                elif logger == "mlflow" and is_mlflow_available():
                    import mlflow

                    tracking_uri = AxolotlInputConfig(
                        **self.cfg.to_dict()
                    ).mlflow_tracking_uri
                    mlflow.log_table(
                        data=table_data,
                        artifact_file="PredictionsVsGroundTruth.json",
                        tracking_uri=tracking_uri,
                    )
                elif logger == "comet_ml" and is_comet_available():
                    import comet_ml

                    experiment = comet_ml.get_running_experiment()
                    if experiment:
                        experiment.log_table(
                            f"{name} - Predictions vs Ground Truth.csv",
                            pd.DataFrame(table_data),
                        )

            if is_main_process():
                log_table_from_dataloader("Eval", eval_dataloader)

            return control

    return LogPredictionCallback


class SaveAxolotlConfigtoWandBCallback(TrainerCallback):
    """Callback to save axolotl config to wandb"""

    def __init__(self, axolotl_config_path):
        self.axolotl_config_path = axolotl_config_path

    def on_train_begin(
        self,
        args: AxolotlTrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if state.is_world_process_zero:
            try:
                # sync config to top level in run, cannot delete file right away because wandb schedules it to be synced even w/policy = 'now', so let OS delete it later.
                with NamedTemporaryFile(
                    mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
                ) as temp_file:
                    copyfile(self.axolotl_config_path, temp_file.name)
                    artifact = wandb.Artifact(
                        f"config-{wandb.run.id}", type="axolotl-config"
                    )
                    artifact.add_file(temp_file.name)
                    wandb.log_artifact(artifact)
                    wandb.save(temp_file.name)
                    LOG.info(
                        "The Axolotl config has been saved to the WandB run under files."
                    )
            except (FileNotFoundError, ConnectionError) as err:
                LOG.warning(f"Error while saving Axolotl config to WandB: {err}")

            try:
                with open(self.axolotl_config_path, "r", encoding="utf-8") as f:
                    cfg = yaml.safe_load(f) or {}

                chat_tpl = cfg.get("chat_template_jinja")
                if chat_tpl:
                    with NamedTemporaryFile(
                        mode="w", delete=True, suffix=".jinja", prefix="chat_template_"
                    ) as temp_ct_file:
                        if (
                            isinstance(chat_tpl, str)
                            and os.path.exists(chat_tpl)
                            and os.path.isfile(chat_tpl)
                        ):
                            copyfile(chat_tpl, temp_ct_file.name)
                        else:
                            temp_ct_file.write(str(chat_tpl))
                            temp_ct_file.flush()

                        artifact = wandb.Artifact(
                            f"chat-template-{wandb.run.id}", type="jinja-template"
                        )
                        artifact.add_file(temp_ct_file.name)
                        wandb.log_artifact(artifact)
                        wandb.save(temp_ct_file.name)
                        LOG.info(
                            "The chat_template_jinja has been saved to the WandB run under files."
                        )
            except (FileNotFoundError, ConnectionError, yaml.YAMLError) as err:
                LOG.warning(f"Error while saving chat_template_jinja to WandB: {err}")

            if args.deepspeed:
                try:
                    # sync config to top level in run, cannot delete file right away because wandb schedules it to be synced even w/policy = 'now', so let OS delete it later.
                    with NamedTemporaryFile(
                        mode="w",
                        delete=False,
                        suffix=".json",
                        prefix="deepspeed_config_",
                    ) as temp_file:
                        skip_upload = False
                        if isinstance(args.deepspeed, dict):
                            json.dump(args.deepspeed, temp_file, indent=4)
                        elif isinstance(args.deepspeed, str) and os.path.exists(
                            args.deepspeed
                        ):
                            copyfile(args.deepspeed, temp_file.name)
                        else:
                            skip_upload = True
                        if not skip_upload:
                            artifact = wandb.Artifact(
                                f"deepspeed-config-{wandb.run.id}",
                                type="deepspeed-config",
                            )
                            artifact.add_file(temp_file.name)
                            wandb.log_artifact(artifact)
                            wandb.save(temp_file.name)
                            LOG.info(
                                "The DeepSpeed config has been saved to the WandB run under files."
                            )
                except (FileNotFoundError, ConnectionError) as err:
                    LOG.warning(f"Error while saving DeepSpeed config to WandB: {err}")

        return control


class GCCallback(TrainerCallback):
    """Callback to garbage collect torch cache"""

    def __init__(self, gc_steps: int | None = -1):
        self.gc_steps: int = gc_steps or -1
        self.next_gc_on_begin_step: int = -1

    def _gc(self):
        torch.cuda.empty_cache()
        gc.collect()

    def on_train_begin(
        self,
        args,
        state,
        control,
        **kwargs,
    ):
        self._gc()

    def on_step_begin(
        self,
        args,
        state,
        control,
        **kwargs,
    ):
        if self.next_gc_on_begin_step == state.global_step or state.global_step == 0:
            self._gc()

    def on_step_end(
        self,
        args,
        state,
        control,
        **kwargs,
    ):
        if control.should_evaluate:
            # automatically GC before evals so the eval memory spike from the CEL doesn't OOM the trainer
            self._gc()
            # also GC on the start of the next step after the eval
            self.next_gc_on_begin_step = state.global_step + 1
        elif self.gc_steps > 0 and state.global_step % self.gc_steps == 0:
            self._gc()
        elif (
            args.save_strategy == SaveStrategy.STEPS
            and state.save_steps > 0
            and state.global_step % state.save_steps == 0
        ):
            # gc on save steps in case anything is loaded to CPU RAM like offloaded tensors
            self._gc()
        elif state.global_step >= state.max_steps:
            if args.save_strategy == SaveStrategy.STEPS:
                # gc on save steps in case anything is loaded to CPU RAM like offloaded tensors
                self._gc()

    def on_epoch_end(
        self,
        args,
        state,
        control,
        **kwargs,
    ):
        self._gc()


def colab_inference_post_train_callback(trainer: Trainer):
    class ColabCallback(TrainerCallback):
        """Callback to prep model for inference on Google Colab"""

        def __init__(self, cfg):
            self.gpu_name = torch.cuda.get_device_name(0)
            self.cfg = cfg

        def on_train_end(self, args, state, control, **kwargs):
            """
            handle T4 gpu, we need to convert attention to eager for inference
            """
            if "Tesla T4" in self.gpu_name and self.cfg.xformers_attention:
                trainer.model.config._attn_implementation = "eager"
            trainer.model.gradient_checkpointing_disable()
            trainer.model.config.use_cache = True
            trainer.model.eval()

    return ColabCallback


================================================
FILE: src/axolotl/utils/callbacks/comet_.py
================================================
"""Comet module for trainer callbacks"""

from typing import TYPE_CHECKING

import comet_ml
from transformers import TrainerCallback, TrainerControl, TrainerState

from axolotl.utils.distributed import is_main_process
from axolotl.utils.logging import get_logger

if TYPE_CHECKING:
    from axolotl.core.training_args import AxolotlTrainingArguments

LOG = get_logger(__name__)


class SaveAxolotlConfigtoCometCallback(TrainerCallback):
    """Callback to save axolotl config to comet"""

    def __init__(self, axolotl_config_path):
        self.axolotl_config_path = axolotl_config_path

    def on_train_begin(
        self,
        args: "AxolotlTrainingArguments",
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if is_main_process():
            try:
                comet_experiment = comet_ml.start(source="axolotl")
                comet_experiment.log_other("Created from", "axolotl")
                comet_experiment.log_asset(
                    self.axolotl_config_path,
                    file_name="axolotl-config",
                )
                LOG.info(
                    "The Axolotl config has been saved to the Comet Experiment under assets."
                )
            except (FileNotFoundError, ConnectionError) as err:
                LOG.warning(f"Error while saving Axolotl config to Comet: {err}")
        return control


================================================
FILE: src/axolotl/utils/callbacks/dynamic_checkpoint.py
================================================
from pathlib import Path

from transformers import (
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)

from axolotl.utils.distributed import (
    barrier,
    is_distributed,
    is_main_process,
)
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

DEFAULT_TRIGGER_FILENAME = "axolotl_checkpoint.save"


class DynamicCheckpointCallback(TrainerCallback):
    """
    Callback to save checkpoints on-demand during training via:
    1. File-based trigger (works everywhere, rank 0 checks file)

    Thread-safe for multi-GPU distributed training.

    Usage:
        # File-based:
        touch /path/to/output_dir/axolotl_checkpoint.save
    """

    def _get_config_value(self, config, key, default=None):
        """Helper to get config value from dict or object."""
        if isinstance(config, dict):
            return config.get(key, default)
        return getattr(config, key, default)

    def __init__(self, cfg):
        self.cfg = cfg
        if not cfg.dynamic_checkpoint or not cfg.dynamic_checkpoint.enabled:
            self.enabled = False
            return

        self.enabled = True
        dc_config = cfg.dynamic_checkpoint

        trigger_file_path = self._get_config_value(dc_config, "trigger_file_path")
        self.trigger_filename = (
            trigger_file_path if trigger_file_path else DEFAULT_TRIGGER_FILENAME
        )

        check_interval = self._get_config_value(dc_config, "check_interval")
        self.check_interval = check_interval if check_interval is not None else 100
        self.should_save_checkpoint = False

        LOG.info(
            f"Dynamic checkpoint enabled. To trigger checkpoint save:\n"
            f"  • File: touch {cfg.output_dir}/{self.trigger_filename}\n"
            f"  • Check interval: every {self.check_interval} steps",
        )

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **_kwargs,
    ) -> TrainerControl:
        """
        Check for checkpoint triggers at the end of each step.
        ONLY rank 0 checks the file, then all ranks synchronize.
        """
        if not self.enabled:
            return control

        trigger_detected = False

        if state.global_step % self.check_interval == 0:
            if is_main_process():
                trigger_path = Path(args.output_dir) / self.trigger_filename

                if trigger_path.exists():
                    trigger_detected = True
                    try:
                        trigger_path.unlink()  # Delete the trigger file
                        LOG.info(
                            f"Dynamic checkpoint triggered via file '{self.trigger_filename}' "
                            f"at step {state.global_step}",
                        )
                    except OSError as exc:
                        LOG.warning(
                            f"Failed to delete trigger file: {exc}",
                        )

                if self.should_save_checkpoint:
                    trigger_detected = True
                    self.should_save_checkpoint = False  # Reset flag

            if is_distributed():
                import torch
                import torch.distributed as dist

                device = getattr(
                    args,
                    "device",
                    torch.device("cuda" if torch.cuda.is_available() else "cpu"),
                )

                trigger_tensor = torch.tensor(
                    1 if trigger_detected else 0,
                    dtype=torch.long,
                    device=device,
                )

                dist.broadcast(trigger_tensor, src=0)

                trigger_detected = bool(trigger_tensor.item())

                barrier()

        if trigger_detected:
            control.should_save = True
            LOG.info(
                f"Saving dynamic checkpoint at step {state.global_step}",
            )
        return control


================================================
FILE: src/axolotl/utils/callbacks/generation.py
================================================
"""Callback for generating samples during SFT/Pretrain training."""

from transformers.trainer_callback import TrainerCallback, TrainerControl, TrainerState
from transformers.training_args import TrainingArguments

from axolotl.utils.generation.sft import generate_samples
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


class SFTGenerationCallback(TrainerCallback):
    """Callback for generating samples during SFT/Pretrain training."""

    def __init__(self, trainer):
        self.trainer = trainer

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Generate samples at specified intervals."""
        cfg = self.trainer.axolotl_cfg

        if not getattr(cfg, "generate_samples", False):
            return

            dataloader = None
            try:
                if getattr(self.trainer, "eval_dataset", None) is not None:
                    dataloader = self.trainer.get_eval_dataloader()
                    LOG.info(
                        f"Using eval dataloader for generation at step {state.global_step}"
                    )
            except Exception as e:
                LOG.warning(f"Could not get eval dataloader: {e}")
                dataloader = None

            if dataloader is None:
                dataloader = self.trainer.get_train_dataloader()
                LOG.info(
                    f"Using train dataloader for generation at step {state.global_step}"
                )

            samples = generate_samples(
                model=self.trainer.model,
                tokenizer=self.trainer.processing_class,
                dataloader=dataloader,
                num_generation_samples=getattr(cfg, "num_generation_samples", 3),
                max_new_tokens=getattr(cfg, "generation_max_new_tokens", 50),
                temperature=getattr(cfg, "generation_temperature", 0.7),
                top_p=getattr(cfg, "generation_top_p", None),
                top_k=getattr(cfg, "generation_top_k", None),
                do_sample=getattr(cfg, "generation_do_sample", True),
                prompt_ratio=getattr(cfg, "generation_prompt_ratio", 0.5),
            )
            self._log_samples(samples, state.global_step)

    def _log_samples(self, samples: list, step: int):
        """Log generated samples to console and W&B."""
        from axolotl.utils.generation.sft import format_generation_for_logging

        for i, sample in enumerate(samples):
            console_text, wandb_text = format_generation_for_logging(sample, i, step)

            LOG.info(console_text)

            try:
                import wandb

                if wandb.run is not None:
                    wandb.log(
                        {
                            f"samples/sample_{i + 1}": wandb.Html(
                                f"<pre>{wandb_text}</pre>"
                            )
                        },
                        step=step,
                    )
            except (ImportError, Exception):
                pass


================================================
FILE: src/axolotl/utils/callbacks/lisa.py
================================================
"""
module for LISA

Adapted from https://github.com/OptimalScale/LMFlow/pull/701 for HF transformers & Axolotl
Arxiv: https://arxiv.org/abs/2403.17919
License: Apache 2.0
"""

from functools import reduce
from typing import TYPE_CHECKING

import numpy as np
from transformers import TrainerCallback

from axolotl.utils.logging import get_logger

if TYPE_CHECKING:
    from axolotl.core.trainers import AxolotlTrainer

LOG = get_logger(__name__)


def lisa_callback_factory(trainer: "AxolotlTrainer"):
    class LISACallback(TrainerCallback):
        """trainer callback for lisa layer switching"""

        def __init__(
            self, n_layers, step_interval, trainer, layers_attribute="model.layers"
        ):
            super().__init__()
            self.n_layers = n_layers
            self.step_interval = step_interval
            self.layers_attribute = layers_attribute
            self.trainer = trainer

            reduce(getattr, self.layers_attribute.split("."), self.trainer.model)

            self.total_layers = len(
                reduce(getattr, self.layers_attribute.split("."), self.trainer.model)
            )
            self.active_layers_indices = []

            layers = reduce(
                getattr, self.layers_attribute.split("."), self.trainer.model
            )
            LOG.info(
                f"LISA will activate {self.n_layers}/{len(layers)} layers ({self.n_layers * 100 / len(layers)}%) every {self.step_interval} steps"
            )

        def freeze_all_layers(self):
            layers = reduce(
                getattr, self.layers_attribute.split("."), self.trainer.model
            )
            for layer in layers:
                for param in layer.parameters():
                    param.requires_grad = False

        def on_step_begin(self, args, state, control, **kwargs):
            # Check if it's time to switch active layers, including at step 0
            if state.global_step % self.step_interval == 0 or state.global_step == 1:
                self.switch_active_layers()

        def switch_active_layers(self):
            # First, disable gradients for all layers
            self.freeze_all_layers()

            # Randomly select n_layers to activate
            layers = reduce(
                getattr, self.layers_attribute.split("."), self.trainer.model
            )
            self.active_layers_indices = np.random.choice(
                range(self.total_layers), self.n_layers, replace=False
            )
            LOG.info(
                f"Activating layers at indices: {self.active_layers_indices} for the next steps."
            )

            # Enable gradients only for the selected layers
            for idx in self.active_layers_indices:
                for param in layers[idx].parameters():
                    param.requires_grad = True

    lisa_callback = LISACallback(
        n_layers=trainer.args.lisa_n_layers,
        step_interval=trainer.args.lisa_step_interval,
        trainer=trainer,
        layers_attribute=trainer.args.lisa_layers_attribute,
    )

    return lisa_callback


================================================
FILE: src/axolotl/utils/callbacks/mlflow_.py
================================================
"""MLFlow module for trainer callbacks"""

import os
from shutil import copyfile
from tempfile import NamedTemporaryFile
from typing import TYPE_CHECKING

import mlflow
from transformers import TrainerCallback, TrainerControl, TrainerState

from axolotl.utils.distributed import is_main_process
from axolotl.utils.logging import get_logger

if TYPE_CHECKING:
    from axolotl.core.training_args import AxolotlTrainingArguments

LOG = get_logger(__name__)


def should_log_artifacts() -> bool:
    truths = ["TRUE", "1", "YES"]
    return os.getenv("HF_MLFLOW_LOG_ARTIFACTS", "FALSE").upper() in truths


class SaveAxolotlConfigtoMlflowCallback(TrainerCallback):
    """Callback to save axolotl config to mlflow"""

    def __init__(self, axolotl_config_path):
        self.axolotl_config_path = axolotl_config_path

    def on_train_begin(
        self,
        args: "AxolotlTrainingArguments",
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if is_main_process():
            try:
                if should_log_artifacts():
                    with NamedTemporaryFile(
                        mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
                    ) as temp_file:
                        copyfile(self.axolotl_config_path, temp_file.name)
                        mlflow.log_artifact(temp_file.name, artifact_path="")
                        LOG.info(
                            "The Axolotl config has been saved to the MLflow artifacts."
                        )
                else:
                    LOG.info(
                        "Skipping logging artifacts to MLflow (hf_mlflow_log_artifacts is false)"
                    )
            except (FileNotFoundError, ConnectionError) as err:
                LOG.warning(f"Error while saving Axolotl config to MLflow: {err}")
        return control


================================================
FILE: src/axolotl/utils/callbacks/models.py
================================================
"""Helper functions for model classes"""

from typing import Tuple

from transformers.models.auto.modeling_auto import MODEL_FOR_CAUSAL_LM_MAPPING_NAMES


def get_causal_lm_model_cls_prefix(model_type: str) -> Tuple[str, str]:
    if model_type in MODEL_FOR_CAUSAL_LM_MAPPING_NAMES:
        causal_lm_cls = MODEL_FOR_CAUSAL_LM_MAPPING_NAMES[model_type]
        causal_lm_cls_prefix = causal_lm_cls
        for suffix in [
            "ForCausalLM",
            "ForConditionalGeneration",
            "LMHeadModel",
            "GenerationDecoder",
        ]:
            causal_lm_cls_prefix = causal_lm_cls_prefix.replace(suffix, "")
        return causal_lm_cls_prefix, causal_lm_cls
    causal_lm_cls_prefix = "".join(
        [part.capitalize() for part in model_type.split("_")]
    )
    return causal_lm_cls_prefix, f"{causal_lm_cls_prefix}ForCausalLM"


================================================
FILE: src/axolotl/utils/callbacks/opentelemetry.py
================================================
"""OpenTelemetry metrics callback for Axolotl training"""

import threading
from typing import Dict, Optional

from transformers import (
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

try:
    from opentelemetry import metrics
    from opentelemetry.exporter.prometheus import PrometheusMetricReader
    from opentelemetry.metrics import set_meter_provider
    from opentelemetry.sdk.metrics import MeterProvider as SDKMeterProvider
    from prometheus_client import start_http_server

    OPENTELEMETRY_AVAILABLE = True
except ImportError:
    LOG.warning("OpenTelemetry not available. pip install [opentelemetry]")
    OPENTELEMETRY_AVAILABLE = False


class OpenTelemetryMetricsCallback(TrainerCallback):
    """
    TrainerCallback that exports training metrics to OpenTelemetry/Prometheus.

    This callback automatically tracks key training metrics including:
    - Training loss
    - Evaluation loss
    - Learning rate
    - Epoch progress
    - Global step count
    - Gradient norm

    Metrics are exposed via HTTP endpoint for Prometheus scraping.
    """

    def __init__(self, cfg):
        if not OPENTELEMETRY_AVAILABLE:
            LOG.warning("OpenTelemetry not available, metrics will not be collected")
            self.metrics_enabled = False
            return

        self.cfg = cfg
        self.metrics_host = getattr(cfg, "otel_metrics_host", "localhost")
        self.metrics_port = getattr(cfg, "otel_metrics_port", 8000)
        self.metrics_enabled = True
        self.server_started = False
        self.metrics_lock = threading.Lock()

        try:
            # Create Prometheus metrics reader
            prometheus_reader = PrometheusMetricReader()

            # Create meter provider with Prometheus exporter
            provider = SDKMeterProvider(metric_readers=[prometheus_reader])
            set_meter_provider(provider)

            # Get meter for creating metrics
            self.meter = metrics.get_meter("axolotl.training")

            # Create metrics
            self._create_metrics()

        except Exception as e:
            LOG.warning(f"Failed to initialize OpenTelemetry metrics: {e}")
            self.metrics_enabled = False

    def _create_metrics(self):
        """Create all metrics that will be tracked"""
        self.train_loss_gauge = self.meter.create_gauge(
            name="axolotl_train_loss",
            description="Current training loss",
            unit="1",
        )

        self.eval_loss_gauge = self.meter.create_gauge(
            name="axolotl_eval_loss",
            description="Current evaluation loss",
            unit="1",
        )

        self.learning_rate_gauge = self.meter.create_gauge(
            name="axolotl_learning_rate",
            description="Current learning rate",
            unit="1",
        )

        self.epoch_gauge = self.meter.create_gauge(
            name="axolotl_epoch",
            description="Current training epoch",
            unit="1",
        )

        self.global_step_counter = self.meter.create_counter(
            name="axolotl_global_steps",
            description="Total training steps completed",
            unit="1",
        )

        self.grad_norm_gauge = self.meter.create_gauge(
            name="axolotl_gradient_norm",
            description="Gradient norm",
            unit="1",
        )

        self.memory_usage_gauge = self.meter.create_gauge(
            name="axolotl_memory_usage",
            description="Current memory usage in MB",
            unit="MB",
        )

    def _start_metrics_server(self):
        """Start the HTTP server for metrics exposure"""
        if self.server_started:
            return

        try:
            start_http_server(self.metrics_port, addr=self.metrics_host)
            self.server_started = True
            LOG.info(
                f"OpenTelemetry metrics server started on http://{self.metrics_host}:{self.metrics_port}/metrics"
            )

        except Exception as e:
            LOG.error(f"Failed to start OpenTelemetry metrics server: {e}")

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Called at the beginning of training"""
        if not self.metrics_enabled:
            return

        self._start_metrics_server()
        LOG.info("OpenTelemetry metrics collection started")

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs: Optional[Dict[str, float]] = None,
        **kwargs,
    ):
        """Called when logging occurs"""
        if not self.metrics_enabled or not logs:
            return

        if "loss" in logs:
            self.train_loss_gauge.set(logs["loss"])

        if "eval_loss" in logs:
            self.eval_loss_gauge.set(logs["eval_loss"])

        if "learning_rate" in logs:
            self.learning_rate_gauge.set(logs["learning_rate"])

        if "epoch" in logs:
            self.epoch_gauge.set(logs["epoch"])

        if "grad_norm" in logs:
            self.grad_norm_gauge.set(logs["grad_norm"])
        if "memory_usage" in logs:
            self.memory_usage_gauge.set(logs["memory_usage"])

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Called at the end of each training step"""
        if not self.metrics_enabled:
            return

        # Update step counter and epoch
        self.global_step_counter.add(1)
        if state.epoch is not None:
            self.epoch_gauge.set(state.epoch)

    def on_evaluate(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        metrics: Optional[Dict[str, float]] = None,
        **kwargs,
    ):
        """Called after evaluation"""
        if not self.metrics_enabled or not metrics:
            return

        if "eval_loss" in metrics:
            self.eval_loss_gauge.set(metrics["eval_loss"])

        # Record any other eval metrics as gauges
        for key, value in metrics.items():
            if key.startswith("eval_") and isinstance(value, (int, float)):
                # Create gauge for this metric if it doesn't exist
                gauge_name = f"axolotl_{key}"
                try:
                    gauge = self.meter.create_gauge(
                        name=gauge_name,
                        description=f"Evaluation metric: {key}",
                        unit="1",
                    )
                    gauge.set(value)
                except Exception as e:
                    LOG.warning(f"Failed to create/update metric {gauge_name}: {e}")

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Called at the end of training"""
        if not self.metrics_enabled:
            return

        LOG.info("Training completed. OpenTelemetry metrics collection finished.")
        LOG.info(
            f"Metrics are still available at http://{self.metrics_host}:{self.metrics_port}/metrics"
        )


================================================
FILE: src/axolotl/utils/callbacks/perplexity.py
================================================
"""callback to calculate perplexity as an evaluation metric."""

from typing import Dict, List, Optional

import torch
from torch import Tensor
from tqdm import tqdm
from transformers.modeling_outputs import CausalLMOutput
from transformers.modeling_utils import PreTrainedModel

try:
    from transformers.tokenization_python import PreTrainedTokenizer
except ImportError:
    from transformers.tokenization_utils import PreTrainedTokenizer

from axolotl.utils.distributed import is_main_process


class Perplexity:
    """
    Calculate perplexity as defined in https://huggingface.co/docs/transformers/en/perplexity.
    This is a custom variant that doesn't re-tokenize the input or re-load the model.
    """

    def __init__(
        self,
        tokenizer: PreTrainedTokenizer,
        max_seq_len: int,
        stride: int = 512,
    ) -> None:
        self.max_seq_len = max_seq_len
        self.stride = stride
        self.tokenizer = tokenizer
        self.name = "perplexity"

    def _feature_names(self) -> List[str]:
        return ["references"]

    def compute(
        self,
        model: PreTrainedModel,
        references: Optional[List[str]] = None,
    ) -> Dict[str, float]:
        """
        Compute perplexity in a fixed length sliding window across the sequence.
        """
        assert references is not None, "Missing parameter: references"

        model.eval()

        references_tokenized = self.tokenizer(
            references, return_tensors="pt", padding=True, truncation=True
        )
        input_ids: Tensor = references_tokenized["input_ids"]  # type: ignore
        input_ids = input_ids.to(model.device)

        sequence_length = input_ids.size(1)

        losses = []
        prev_end_loc = 0
        for begin_loc in tqdm(
            range(0, sequence_length, self.stride), disable=not is_main_process()
        ):
            end_loc = min(begin_loc + self.max_seq_len, sequence_length)
            trg_len = end_loc - prev_end_loc
            input_ids_slice = input_ids[:, begin_loc:end_loc]
            labels_slice = input_ids_slice.clone()
            labels_slice[:, :-trg_len] = -100

            with torch.no_grad():
                outputs: CausalLMOutput = model(
                    input_ids=input_ids_slice, labels=labels_slice
                )

            losses.append(outputs.loss)

            prev_end_loc = end_loc
            if end_loc == sequence_length:
                break

        perplexity = torch.exp(torch.stack(losses).mean()).item()

        return {
            "score": perplexity,
        }


================================================
FILE: src/axolotl/utils/callbacks/profiler.py
================================================
"""
HF Trainer callback for creating pytorch profiling snapshots
"""

from pathlib import Path
from pickle import dump  # nosec B403

import torch
from transformers import (
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)


class PytorchProfilerCallback(TrainerCallback):
    """
    PyTorch Profiler callback to create snapshots of GPU memory usage at specified steps.

    Also runs torch.profiler to produce a Chrome trace for timing analysis.
    """

    def __init__(self, steps_to_profile: int = 5, profiler_steps_start: int = 0):
        # steps are 0 indexed, so to start at 0-th step, we start at beginning of first step,
        # and finish at end of last step, so 5 steps_to_profile is steps [0, 1, 2, 3, 4]
        self.profiler_steps_end = profiler_steps_start + steps_to_profile - 1
        if profiler_steps_start == 0:
            # start recording memory allocations before everything is allocated, because if we start
            # at the beginning of step 0, we won't have any memory allocations in the traces
            torch.cuda.memory._record_memory_history(enabled="all", stacks="all")
            profiler_steps_start = -1
        self.profiler_steps_start = profiler_steps_start
        self._profiler = None

    def on_step_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if state.global_step == self.profiler_steps_start:
            torch.cuda.memory._record_memory_history(enabled="all", stacks="all")

        # Start torch.profiler on the first profiled step
        if state.global_step == max(self.profiler_steps_start, 0):
            profiler = torch.profiler.profile(
                activities=[
                    torch.profiler.ProfilerActivity.CPU,
                    torch.profiler.ProfilerActivity.CUDA,
                ],
                record_shapes=True,
                profile_memory=True,
                with_stack=True,
            )
            profiler.__enter__()
            self._profiler = profiler

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if state.global_step == self.profiler_steps_end:
            snapshot = torch.cuda.memory._snapshot()
            with open(Path(args.output_dir) / "snapshot.pickle", "wb") as fout:
                dump(snapshot, fout)

            # tell CUDA to stop recording memory allocations now
            torch.cuda.memory._record_memory_history(enabled=None)

            # Stop and export torch.profiler trace
            if self._profiler is not None:
                self._profiler.__exit__(None, None, None)
                trace_path = Path(args.output_dir) / "profiler_trace.json"
                self._profiler.export_chrome_trace(str(trace_path))
                self._profiler = None

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        # make sure to record if we happen to have more steps than steps to profile
        if (
            state.global_step >= self.profiler_steps_start
            and state.global_step < self.profiler_steps_end
        ):
            snapshot = torch.cuda.memory._snapshot()
            with open(Path(args.output_dir) / "snapshot.pickle", "wb") as fout:
                dump(snapshot, fout)

            # tell CUDA to stop recording memory allocations now
            torch.cuda.memory._record_memory_history(enabled=None)

        if self._profiler is not None:
            self._profiler.__exit__(None, None, None)
            trace_path = Path(args.output_dir) / "profiler_trace.json"
            self._profiler.export_chrome_trace(str(trace_path))
            self._profiler = None


================================================
FILE: src/axolotl/utils/callbacks/qat.py
================================================
"""QAT Callback for HF Causal Trainer"""

from functools import partial

from torch import nn
from torchao.quantization.qat.embedding import FakeQuantizedEmbedding
from torchao.quantization.qat.linear import FakeQuantizedLinear
from transformers import TrainerCallback

from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.quantization import QATConfig

LOG = get_logger(__name__)


def toggle_fake_quant(mod: nn.Module, enable: bool):
    """
    Toggle fake quantization for any fake quantized linear or embedding layers in the model.

    Args:
        mod: The module to toggle fake quantization for.
        enable: Whether to enable or disable fake quantization.
    """
    if isinstance(mod, (FakeQuantizedLinear, FakeQuantizedEmbedding)):
        if (
            isinstance(mod, FakeQuantizedLinear)
            and mod.activation_fake_quantizer is not None
        ):
            mod.activation_fake_quantizer.enabled = enable
        mod.weight_fake_quantizer.enabled = enable


class QATCallback(TrainerCallback):
    """
    Callback to toggle fake quantization for the model.
    """

    def __init__(self, cfg: QATConfig):
        self.cfg = cfg

    def on_step_begin(self, args, state, control, model, **kwargs):
        if self.cfg.fake_quant_after_n_steps is not None:
            if state.global_step == 0:
                LOG.info(f"Disabling fake quantization at step {state.global_step}")
                model.apply(partial(toggle_fake_quant, enable=False))
            elif state.global_step == self.cfg.fake_quant_after_n_steps:
                LOG.info(f"Enabling fake quantization at step {state.global_step}")
                model.apply(partial(toggle_fake_quant, enable=True))


================================================
FILE: src/axolotl/utils/callbacks/swanlab.py
================================================
"""Callbacks for SwanLab integration"""

from __future__ import annotations

import json
import os
from shutil import copyfile
from tempfile import NamedTemporaryFile
from typing import TYPE_CHECKING

from transformers import (
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)

from axolotl.utils.logging import get_logger

if TYPE_CHECKING:
    from axolotl.core.training_args import AxolotlTrainingArguments

LOG = get_logger(__name__)


class CustomSwanLabCallback(TrainerCallback):
    """
    Lightweight SwanLab callback that directly logs metrics without using
    SwanLab's transformers integration (which requires omegaconf).

    This avoids the antlr4 version conflict between omegaconf and axolotl.
    """

    def __init__(self):
        self._initialized = False
        self.swanlab = None

    def setup(self):
        """Lazy initialization of SwanLab"""
        if self._initialized:
            return

        try:
            import swanlab

            self.swanlab = swanlab

            # Check if SwanLab run is initialized
            if swanlab.get_run() is None:
                LOG.warning("SwanLab run is not initialized")
                return

            self._initialized = True
            LOG.info("CustomSwanLabCallback initialized successfully")
        except ImportError:
            LOG.error("SwanLab is not installed")

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Called at the beginning of training"""
        if not state.is_world_process_zero:
            return control

        self.setup()

        if not self._initialized:
            return control

        # Log training configuration
        try:
            self.swanlab.config.update(
                {
                    "train_batch_size": args.per_device_train_batch_size,
                    "eval_batch_size": args.per_device_eval_batch_size,
                    "learning_rate": args.learning_rate,
                    "num_train_epochs": args.num_train_epochs,
                    "max_steps": args.max_steps,
                    "warmup_steps": args.warmup_steps,
                    "logging_steps": args.logging_steps,
                    "save_steps": args.save_steps,
                    "gradient_accumulation_steps": args.gradient_accumulation_steps,
                }
            )
            LOG.debug("Training configuration logged to SwanLab")
        except Exception as err:
            LOG.warning(f"Failed to log training config: {err}")

        return control

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs=None,
        **kwargs,
    ):
        """Called when logging metrics"""
        if not state.is_world_process_zero:
            return control

        if not self._initialized:
            self.setup()

        if not self._initialized or logs is None:
            return control

        # Log metrics to SwanLab
        try:
            # Filter out non-numeric values and prepare for logging
            metrics = {}
            for key, value in logs.items():
                if isinstance(value, (int, float)):
                    # Use step from state
                    metrics[key] = value

            if metrics and state.global_step is not None:
                self.swanlab.log(metrics, step=state.global_step)
        except Exception as err:
            LOG.warning(f"Failed to log metrics to SwanLab: {err}")

        return control

    def on_train_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        """Called at the end of training"""
        if not state.is_world_process_zero:
            return control

        if self._initialized:
            LOG.info("Training completed. SwanLab logs are available.")

        return control


class SaveAxolotlConfigtoSwanLabCallback(TrainerCallback):
    """Callback to save axolotl config to SwanLab"""

    def __init__(self, axolotl_config_path):
        self.axolotl_config_path = axolotl_config_path

    def on_train_begin(
        self,
        args: AxolotlTrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if state.is_world_process_zero:
            try:
                import swanlab

                # Check if SwanLab is initialized
                if swanlab.get_run() is None:
                    LOG.warning(
                        "SwanLab run is not initialized. Please initialize SwanLab before training."
                    )
                    return control

                # Log Axolotl config as artifact
                with NamedTemporaryFile(
                    mode="w", delete=False, suffix=".yml", prefix="axolotl_config_"
                ) as temp_file:
                    copyfile(self.axolotl_config_path, temp_file.name)

                    # Log config file to SwanLab
                    with open(temp_file.name, "r", encoding="utf-8") as config_file:
                        swanlab.log(
                            {
                                "axolotl_config": swanlab.Text(
                                    config_file.read(), caption="Axolotl Config"
                                )
                            }
                        )

                    LOG.info(
                        "The Axolotl config has been saved to the SwanLab run under logs."
                    )

                    # Clean up temp file
                    os.unlink(temp_file.name)

            except ImportError:
                LOG.warning(
                    "SwanLab is not installed. Install it with: pip install swanlab"
                )
            except (FileNotFoundError, ConnectionError) as err:
                LOG.warning(f"Error while saving Axolotl config to SwanLab: {err}")

            # Log DeepSpeed config if available
            if args.deepspeed:
                try:
                    import swanlab

                    with NamedTemporaryFile(
                        mode="w",
                        delete=False,
                        suffix=".json",
                        prefix="deepspeed_config_",
                    ) as temp_file:
                        skip_upload = False
                        if isinstance(args.deepspeed, dict):
                            json.dump(args.deepspeed, temp_file, indent=4)
                        elif isinstance(args.deepspeed, str) and os.path.exists(
                            args.deepspeed
                        ):
                            copyfile(args.deepspeed, temp_file.name)
                        else:
                            skip_upload = True

                        if not skip_upload:
                            temp_file.flush()
                            with open(
                                temp_file.name, "r", encoding="utf-8"
                            ) as ds_config_file:
                                swanlab.log(
                                    {
                                        "deepspeed_config": swanlab.Text(
                                            ds_config_file.read(),
                                            caption="DeepSpeed Config",
                                        )
                                    }
                                )
                            LOG.info(
                                "The DeepSpeed config has been saved to the SwanLab run under logs."
                            )

                        # Clean up temp file
                        os.unlink(temp_file.name)

                except (FileNotFoundError, ConnectionError) as err:
                    LOG.warning(
                        f"Error while saving DeepSpeed config to SwanLab: {err}"
                    )
                except ImportError:
                    pass

        return control


================================================
FILE: src/axolotl/utils/callbacks/tokens_per_second.py
================================================
"""A callback for calculating tokens per second during training."""

import json
import os
import time

import torch
from transformers import (
    TrainerCallback,
    TrainerControl,
    TrainerState,
    TrainingArguments,
)

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

TOKENS_STATE_FILE = "tokens_state.json"


class TokensPerSecondCallback(TrainerCallback):
    """
    A callback to measure and log tokens per second during training.
    Also handles saving/restoring total_tokens state across checkpoint resumes.
    """

    def __init__(
        self, tensor_parallel_size, context_parallel_size, resume_from_checkpoint=None
    ):
        super().__init__()
        self.step_time = 0.0
        self.start_time = 0.0
        self.non_data_parallel_size = 1
        self.resume_from_checkpoint = resume_from_checkpoint
        if tensor_parallel_size is not None:
            self.non_data_parallel_size *= tensor_parallel_size
        if context_parallel_size is not None:
            self.non_data_parallel_size *= context_parallel_size

    def on_train_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):  # pylint: disable=unused-argument
        """Restore total_tokens state when resuming from checkpoint."""
        if not isinstance(self.resume_from_checkpoint, str):
            return
        tokens_state_path = os.path.join(self.resume_from_checkpoint, TOKENS_STATE_FILE)
        if os.path.isfile(tokens_state_path):
            with open(tokens_state_path, "r", encoding="utf-8") as f:
                tokens_state = json.load(f)
            state.tokens = {
                "total": torch.tensor(tokens_state.get("total", 0)),
                "trainable": torch.tensor(tokens_state.get("trainable", 0)),
            }
            LOG.info(f"Restored total_tokens: {state.tokens['total']}")

    def on_step_begin(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):  # pylint: disable=unused-argument
        if not hasattr(state, "tokens"):
            state.tokens = {"trainable": torch.zeros(1), "total": torch.zeros(1)}
        self.start_time = time.perf_counter()
        state.last_tokens_per_second = torch.zeros(1)

    def on_step_end(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):  # pylint: disable=unused-argument
        tokens = getattr(state, "tokens", None)
        if not (tokens and "trainable_tokens" in tokens):
            return
        step_time = time.perf_counter() - self.start_time
        if step_time <= 0:
            return

        num_tokens = tokens["trainable_tokens"].clone() / self.non_data_parallel_size
        if torch.distributed.is_initialized():
            dp_size = max(
                1, torch.distributed.get_world_size() // self.non_data_parallel_size
            )
            num_tokens = num_tokens / dp_size
        state.last_tokens_per_second = num_tokens / step_time

    def on_log(
        self,
        args: TrainingArguments,
        state: TrainerState,
        control: TrainerControl,
        logs=None,
        **kwargs,
    ):  # pylint: disable=unused-argument
        # after logging, clear the running metrics
        if hasattr(state, "last_tokens_per_second"):
            logs["tokens/train_per_sec_per_gpu"] = state.last_tokens_per_second.item()
            state.last_tokens_per_second.zero_()
        tokens = getattr(state, "tokens", None)
        # Clear per-step tokens after logging
        if tokens and "trainable_tokens" in tokens:
            tokens["trainable_tokens"] = torch.zeros_like(tokens["trainable_tokens"])


================================================
FILE: src/axolotl/utils/callbacks/trackio_.py
================================================
"""Trackio module for trainer callbacks"""

from typing import TYPE_CHECKING

import trackio
from transformers import TrainerCallback, TrainerControl, TrainerState

from axolotl.utils.distributed import is_main_process
from axolotl.utils.environment import is_package_version_ge
from axolotl.utils.logging import get_logger

if TYPE_CHECKING:
    from axolotl.core.training_args import AxolotlTrainingArguments

LOG = get_logger(__name__)


class SaveAxolotlConfigtoTrackioCallback(TrainerCallback):
    """Callback for trackio integration"""

    def __init__(self, axolotl_config_path):
        self.axolotl_config_path = axolotl_config_path

    def on_train_begin(
        self,
        args: "AxolotlTrainingArguments",
        state: TrainerState,
        control: TrainerControl,
        **kwargs,
    ):
        if is_main_process():
            try:
                if not is_package_version_ge("trackio", "0.11.0"):
                    LOG.warning(
                        "Trackio version 0.11.0 or higher is required to save config files. "
                        "Please upgrade trackio: pip install --upgrade trackio"
                    )
                    return control

                trackio.save(self.axolotl_config_path)
                LOG.info("The Axolotl config has been saved to Trackio.")
            except (FileNotFoundError, ConnectionError, AttributeError) as err:
                LOG.warning(f"Error while saving Axolotl config to Trackio: {err}")
        return control


================================================
FILE: src/axolotl/utils/chat_templates/__init__.py
================================================
"""
This module provides functionality for selecting chat templates based on user choices.
These templates are used for formatting messages in a conversation.
"""

from .base import (
    _CHAT_TEMPLATES,
    extract_chat_template_args,
    get_chat_template,
    get_chat_template_from_config,
    register_chat_template,
)

__all__ = [
    "get_chat_template",
    "extract_chat_template_args",
    "get_chat_template_from_config",
    "register_chat_template",
    "_CHAT_TEMPLATES",
]


================================================
FILE: src/axolotl/utils/chat_templates/base.py
================================================
"""
utility functions for chat templates
"""

import os
from typing import TYPE_CHECKING, Any, Dict, Optional

from axolotl.utils.logging import get_logger

if TYPE_CHECKING:
    from transformers import PreTrainedTokenizerBase

LOG = get_logger("axolotl.utils.chat_templates")

_JINJA_TEMPLATE_CHOICE = "jinja"
_DEFAULT_TEMPLATE_CHOICE = "tokenizer_default"
_DEFAULT_FALLBACK_CHATML_TEMPLATE_CHOICE_PREFIX = "tokenizer_default_fallback_"

TEMPLATE_DIR = os.path.join(os.path.dirname(__file__), "templates")
_CHAT_TEMPLATES: dict[str, str] = {}
for filename in [f for f in os.listdir(TEMPLATE_DIR) if f.endswith(".jinja")]:
    with open(os.path.join(TEMPLATE_DIR, filename), "r", encoding="utf-8") as f:
        _CHAT_TEMPLATES[filename[:-6]] = f.read()


def get_chat_template(
    user_choice: str,
    jinja_template: str | None = None,
    tokenizer: Optional["PreTrainedTokenizerBase"] = None,
) -> str:
    """
    Finds the correct chat_template based on the user's choice, jinja_template, and tokenizer.

    Args:
        user_choice (str): The user's choice of template.
        jinja_template (str, optional): The jinja template string or Path to a valid jinja template file. Defaults to None.
        tokenizer (PreTrainedTokenizerBase, optional): The tokenizer. Defaults to None.

    Returns:
        str: The chosen template string.

    Raises:
        ValueError: If the user_choice is not found in the templates.
    """
    if user_choice == _JINJA_TEMPLATE_CHOICE:
        if not jinja_template:
            raise ValueError(
                f"`jinja_template` cannot be None when `chat_template` choice is {_JINJA_TEMPLATE_CHOICE}"
            )
        if os.path.exists(jinja_template) and os.path.isfile(jinja_template):
            with open(jinja_template, "r", encoding="utf-8") as file:
                jinja_template = file.read()
        return jinja_template

    if user_choice == _DEFAULT_TEMPLATE_CHOICE:
        if not tokenizer:
            raise ValueError(
                f"`tokenizer` cannot be None when chat_template choice is {_DEFAULT_TEMPLATE_CHOICE}"
            )
        if not tokenizer.chat_template:
            raise ValueError(
                f"`chat_template choice is {_DEFAULT_TEMPLATE_CHOICE} but tokenizer's chat_template is null. "
                f"Please add a chat_template in tokenizer config"
            )
        return tokenizer.chat_template  # type: ignore

    if user_choice.startswith(_DEFAULT_FALLBACK_CHATML_TEMPLATE_CHOICE_PREFIX):
        if not tokenizer:
            raise ValueError(
                f"`tokenizer` cannot be None when chat_template choice starts with {_DEFAULT_FALLBACK_CHATML_TEMPLATE_CHOICE_PREFIX}"
            )
        if tokenizer.chat_template:
            return tokenizer.chat_template  # type: ignore

        user_choice = user_choice[
            len(_DEFAULT_FALLBACK_CHATML_TEMPLATE_CHOICE_PREFIX) :
        ]
        LOG.warning(
            f"No chat template found on tokenizer, falling back to {user_choice}. It is recommended to set --train_on_inputs to True for the model to learn this chat template."
        )

    if user_choice in _CHAT_TEMPLATES:
        return _CHAT_TEMPLATES[user_choice]

    raise ValueError(f"Template '{user_choice}' not found.")


def extract_chat_template_args(cfg, ds_cfg: Dict[str, Any] | None = None):
    if ds_cfg and ds_cfg.get("chat_template"):
        chat_template_choice = ds_cfg.get("chat_template") or _DEFAULT_TEMPLATE_CHOICE
        chat_template_jinja = ds_cfg.get("chat_template_jinja")
    else:
        chat_template_choice = cfg.get("chat_template") or _DEFAULT_TEMPLATE_CHOICE
        chat_template_jinja = cfg.get("chat_template_jinja")
    return chat_template_choice, chat_template_jinja


def get_chat_template_from_config(
    cfg,
    ds_cfg: Dict[str, Any] | None = None,
    tokenizer: Optional["PreTrainedTokenizerBase"] = None,
) -> str:
    chat_template_choice, chat_template_jinja = extract_chat_template_args(
        cfg=cfg, ds_cfg=ds_cfg
    )
    return get_chat_template(
        user_choice=chat_template_choice,
        jinja_template=chat_template_jinja,
        tokenizer=tokenizer,
    )


def register_chat_template(template_name: str, chat_template: str):
    """
    Registers chat templates.

    Args:
        template_name (str): The name of the template.
        chat_template (str): The template string.
    """

    if template_name in _CHAT_TEMPLATES:
        raise ValueError(f"Template '{template_name}' already exists.")

    _CHAT_TEMPLATES[template_name] = chat_template


================================================
FILE: src/axolotl/utils/chat_templates/templates/alpaca.jinja
================================================
{{ bos_token }}{% for message in messages %}{% if message['role'] == 'system' and loop.first %}{{ message['content'] }}{% elif message['role'] == 'user' %}{{ '### Instruction:
' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '### Response:
' + message['content'] + eos_token }}{% endif %}{% if not loop.last %}{{ '

' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '

### Response:
' }}{% endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/aya.jinja
================================================
{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Aya, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'  + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/chatml.jinja
================================================
{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '
' + message['content'] + '<|im_end|>' + '
'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
' }}{% endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/cohere.jinja
================================================
{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% elif false == true %}{% set loop_messages = messages %}{% set system_message = 'You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by providing thorough responses. You are trained by Cohere.' %}{% else %}{% set loop_messages = messages %}{% set system_message = false %}{% endif %}{% if system_message != false %}{{ '<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>' + system_message + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% for message in loop_messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|START_OF_TURN_TOKEN|><|USER_TOKEN|>' + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% elif message['role'] == 'assistant' %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'  + content.strip() + '<|END_OF_TURN_TOKEN|>' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>' }}{% endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/command_a.jinja
================================================
{{ bos_token }}{% if documents %}
{% set tools = [] %}
{%- macro document_turn(documents) -%}
{# format documents into chat turn #}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|>I will look through the document to address the users needs.<|END_THINKING|><|START_ACTION|>[
    {"tool_call_id": "0", "tool_name": "direct-injected-document", "parameters": {}}
]<|END_ACTION|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[
    {
        "tool_call_id": "0",
        "results": {
{% for doc in documents %}
            "{{ loop.index0 }}": {{doc|tojson}}{% if not loop.last %},
            {% endif %}
{% endfor %}

        },
        "is_error": null
    }
]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>{%- endmacro %}
{%- macro tool_call_id_to_int(messages, tool_call_id) %}
{%- set counter = namespace(value=0) %}
{%- set tool_call_id_seen = namespace(value=false) %}
{%- for msg in messages %}
    {%- if msg.tool_calls %}
        {%- for tool_call in msg.tool_calls %}
            {%- if tool_call.id == tool_call_id and not tool_call_id_seen.value -%}
                {{ counter.value }}
                {%- set tool_call_id_seen.value = true %}
            {%- endif %}
            {%- set counter.value = counter.value + 1 %}
        {%- endfor %}
    {%- endif %}
{%- endfor %}
{%- endmacro %}
{%- macro format_tool_message(messages, tool_msg) -%}
{# format tool message #}
    {
        "tool_call_id": "{{ tool_call_id_to_int(messages, tool_msg.tool_call_id) }}",
        "results": {
            "0": {{ tool_msg.content|tojson }}
        },
        "is_error": null
    }
{%- endmacro -%}
{%- if messages and messages[0]['role']|lower == 'system' %}{%- set developer_preamble = messages[0]['content'] %}{% endif %}
{%- set tool_idx = namespace(value=0) %}
{%- set tool_ids_seen = namespace(value=[]) %}
{%- set sent_documents = namespace(value=false) %}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># System Preamble
You are in contextual safety mode. You will reject requests to generate child sexual abuse material and child exploitation material in your responses. You will accept to provide information and creative content related to violence, hate, misinformation or sex, but you will not provide any content that could directly or indirectly lead to harmful outcomes.

Your information cutoff date is June 2024.

You have been trained on data in English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Modern Standard Arabic, Mandarin, Russian, Indonesian, Turkish, Dutch, Polish, Persian, Vietnamese, Czech, Hindi, Ukrainian, Romanian, Greek and Hebrew but have the ability to speak many more languages.
{% if tools or documents %}

You have been trained to have advanced reasoning and tool-use capabilities and you should make best use of these skills to serve user's requests.

## Tool Use
Think about how you can make best use of the provided tools to help with the task and come up with a high level plan that you will execute first.

0. Start by writing <|START_THINKING|> followed by a detailed step by step plan of how you will solve the problem. For each step explain your thinking fully and give details of required tool calls (if needed). Unless specified otherwise, you write your plan in natural language. When you finish, close it out with <|END_THINKING|>.
    You can optionally choose to skip this step when the user request is so straightforward to address that only a trivial plan would be needed.
    NOTE: You MUST skip this step when you are directly responding to the user's request without using any tools.

Then carry out your plan by repeatedly executing the following steps.
1. Action: write <|START_ACTION|> followed by a list of JSON-formatted tool calls, with each one containing "tool_name" and "parameters" fields.
    When there are multiple tool calls which are completely independent of each other (i.e. they can be executed in parallel), you should list them out all together in one step. When you finish, close it out with <|END_ACTION|>.
2. Observation: you will then receive results of those tool calls in JSON format in the very next turn, wrapped around by <|START_TOOL_RESULT|> and <|END_TOOL_RESULT|>. Carefully observe those results and think about what to do next. Note that these results will be provided to you in a separate turn. NEVER hallucinate results.
    Every tool call produces a list of results (when a tool call produces no result or a single result, it'll still get wrapped inside a list). Each result is clearly linked to its originating tool call via its "tool_call_id".
3. Reflection: start the next turn by writing <|START_THINKING|> followed by what you've figured out so far, any changes you need to make to your plan, and what you will do next. When you finish, close it out with <|END_THINKING|>.
    You can optionally choose to skip this step when everything is going according to plan and no special pieces of information or reasoning chains need to be recorded.
    NOTE: You MUST skip this step when you are done with tool-use actions and are ready to respond to the user.

You can repeat the above 3 steps multiple times (could be 0 times too if no suitable tool calls are available or needed), until you decide it's time to finally respond to the user.

4. Response: then break out of the loop and write <|START_RESPONSE|> followed by a piece of text which serves as a response to the user's last request. Use all previous tool calls and results to help you when formulating your response. When you finish, close it out with <|END_RESPONSE|>.
{% if enable_citations %}

## Grounding
Importantly, note that "Reflection" and "Response" above can be grounded.
Grounding means you associate pieces of texts (called "spans") with those specific tool results that support them (called "sources"). And you use a pair of tags "<co>" and "</co>" to indicate when a span can be grounded onto a list of sources, listing them out in the closing tag. Sources from the same tool call are grouped together and listed as "{tool_call_id}:[{list of result indices}]", before they are joined together by ",". E.g., "<co>span</co: 0:[1,2],1:[0]>" means that "span" is supported by result 1 and 2 from "tool_call_id=0" as well as result 0 from "tool_call_id=1".
{% endif %}

## Available Tools
Here is the list of tools that you have available to you.
You can ONLY use the tools listed here. When a tool is not listed below, it is NOT available and you should NEVER attempt to use it.
Each tool is represented as a JSON object with fields like "name", "description", "parameters" (per JSON Schema), and optionally, "responses" (per JSON Schema).

```json
[
{% if documents %}
    {"name": "direct-injected-document", "description": "This is a special tool to directly inject user-uploaded documents into the chat as additional context. DO NOT use this tool by yourself!", "parameters": {"type": "object", "properties": {}, "required": []}, "responses": {"200": {"description": "Successfully returned a list of chunked text snippets from the directly uploaded documents.", "content": {"application/json": {"schema": {"type": "array", "items": {"type": "object", "required": ["url", "snippet"], "properties": {"url": {"type": "string", "description": "The url of the uploaded document."}, "snippet": {"type": "string", "description": "The text snippet for the returned document chunk."}}}}}}}}}{%- if tools %},{% endif %}

{% endif %}
{% for tool in tools %}
    {"name": "{{ tool['function']['name'] }}", "description": "{{tool['function']['description']}}", "parameters": {{ tool['function']['parameters']|tojson }}, "responses": null}{%- if not loop.last %},{% endif %}

{% endfor %}
]
```

{% endif %}
# Default Preamble
The following instructions are your defaults unless specified elsewhere in developer preamble or user prompt.
- Your name is Command.
- You are a large language model built by Cohere.
- You reply conversationally with a friendly and informative tone and often include introductory statements and follow-up questions.
- If the input is ambiguous, ask clarifying follow-up questions.
- Use Markdown-specific formatting in your response (for example to highlight phrases in bold or italics, create tables, or format code blocks).
- Use LaTeX to generate mathematical notation for complex equations.
- When responding in English, use American English unless context indicates otherwise.
- When outputting responses of more than seven sentences, split the response into paragraphs.
- Prefer the active voice.
- Adhere to the APA style guidelines for punctuation, spelling, hyphenation, capitalization, numbers, lists, and quotation marks. Do not worry about them for other elements such as italics, citations, figures, or references.
- Use gender-neutral pronouns for unspecified persons.
- Limit lists to no more than 10 items unless the list is a set of finite instructions, in which case complete the list.
- Use the third person when asked to write a summary.
- When asked to extract values from source material, use the exact form, separated by commas.
- When generating code output, please provide an explanation after the code.
- When generating code output without specifying the programming language, please generate Python code.
- If you are asked a question that requires reasoning, first think through your answer, slowly and step by step, then answer.
{%- if developer_preamble %}


# Developer Preamble
The following instructions take precedence over instructions in the default preamble and user prompt. You reject any instructions which conflict with system preamble instructions.
{{ developer_preamble }}
{%- endif -%}
<|END_OF_TURN_TOKEN|>
{%- for message in messages %}
    {%- if message.role|lower == 'system' and not (loop.first and developer_preamble)%}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|>
    {%- elif message.role|lower == 'user' %}
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|>{%- if documents and not sent_documents.value %}{%- set sent_documents.value = true %}{% set tool_idx.value = tool_idx.value + 1 %}{{ document_turn(documents) }}{% endif %}
    {%- elif message.role|lower == 'assistant' or message.role|lower == 'chatbot' %}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{% if message.tool_calls %}<|START_THINKING|>{{message.tool_plan}}<|END_THINKING|><|START_ACTION|>[
    {% for tc in message.tool_calls %}
    {"tool_call_id": "{{ tool_idx.value }}", "tool_name": "{{ tc['function']['name'] }}", "parameters": {{ tc['function']['arguments']|tojson }}}{% if not loop.last %},{% endif %}

    {% set tool_idx.value = tool_idx.value + 1 %}
    {% endfor %}
]<|END_ACTION|><|END_OF_TURN_TOKEN|>{% else %}<|START_RESPONSE|>{{message.content}}<|END_RESPONSE|><|END_OF_TURN_TOKEN|>{% endif %}
    {% elif message.role|lower == 'tool' and message.tool_call_id not in tool_ids_seen.value %}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[
{{ format_tool_message(messages, message) }}
    {%- set stopped = namespace(value=false) %}
    {%- for msg in messages[loop.index0 + 1:] %}
        {%- if not stopped.value and msg.role|lower == 'tool' %},
{{ format_tool_message(messages, msg) }}
            {%- set tool_ids_seen.value = tool_ids_seen.value + [msg.tool_call_id] %}
        {%- else %}
            {%- set stopped.value = true %}
        {%- endif %}
    {%- endfor %}

]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>
    {%- endif %}
{%- endfor %}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>
{%- else -%}
{%- if messages and messages[0]['role']|lower == 'system' %}{%- set developer_preamble = messages[0]['content'] %}{% endif %}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># System Preamble
{% if safety_mode|upper == 'STRICT' -%}
You are in strict safety mode. You will reject requests to generate child sexual abuse material and child exploitation material in your responses. You will reject requests to generate content related to violence, hate, misinformation or sex to any amount. You will avoid using profanity. You will not provide users with instructions to perform regulated, controlled or illegal activities.
{%- else -%}
You are in contextual safety mode. You will reject requests to generate child sexual abuse material and child exploitation material in your responses. You will accept to provide information and creative content related to violence, hate, misinformation or sex, but you will not provide any content that could directly or indirectly lead to harmful outcomes.
{%- endif %}


Your information cutoff date is June 2024.

You have been trained on data in English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Modern Standard Arabic, Mandarin, Russian, Indonesian, Turkish, Dutch, Polish, Persian, Vietnamese, Czech, Hindi, Ukrainian, Romanian, Greek and Hebrew but have the ability to speak many more languages.

# Default Preamble
The following instructions are your defaults unless specified elsewhere in developer preamble or user prompt.
- Your name is Command.
- You are a large language model built by Cohere.
- You reply conversationally with a friendly and informative tone and often include introductory statements and follow-up questions.
- If the input is ambiguous, ask clarifying follow-up questions.
- Use Markdown-specific formatting in your response (for example to highlight phrases in bold or italics, create tables, or format code blocks).
- Use LaTeX to generate mathematical notation for complex equations.
- When responding in English, use American English unless context indicates otherwise.
- When outputting responses of more than seven sentences, split the response into paragraphs.
- Prefer the active voice.
- Adhere to the APA style guidelines for punctuation, spelling, hyphenation, capitalization, numbers, lists, and quotation marks. Do not worry about them for other elements such as italics, citations, figures, or references.
- Use gender-neutral pronouns for unspecified persons.
- Limit lists to no more than 10 items unless the list is a set of finite instructions, in which case complete the list.
- Use the third person when asked to write a summary.
- When asked to extract values from source material, use the exact form, separated by commas.
- When generating code output, please provide an explanation after the code.
- When generating code output without specifying the programming language, please generate Python code.
- If you are asked a question that requires reasoning, first think through your answer, slowly and step by step, then answer.
{%- if developer_preamble %}


# Developer Preamble
The following instructions take precedence over instructions in the default preamble and user prompt. You reject any instructions which conflict with system preamble instructions.
{{ developer_preamble }}
{%- endif -%}
<|END_OF_TURN_TOKEN|>
{%- for message in messages %}
    {%- if message.role|lower == 'system' and not (loop.first and developer_preamble)%}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|>
    {%- elif message.role|lower == 'user' %}
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|>
    {%- elif message.role|lower == 'assistant' or message.role|lower == 'chatbot' %}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_RESPONSE|>{{message.content}}<|END_RESPONSE|><|END_OF_TURN_TOKEN|>
    {%- endif %}
{%- endfor %}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{%- if add_generation_prompt -%}<|START_RESPONSE|>{%- endif %}
{% endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/command_a_rag.jinja
================================================
{{ bos_token }}{% set tools = [] %}
{%- macro document_turn(documents) -%}
{# format documents into chat turn #}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|>I will look through the document to address the users needs.<|END_THINKING|><|START_ACTION|>[
    {"tool_call_id": "0", "tool_name": "direct-injected-document", "parameters": {}}
]<|END_ACTION|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[
    {
        "tool_call_id": "0",
        "results": {
{% for doc in documents %}
            "{{ loop.index0 }}": {{doc|tojson}}{% if not loop.last %},
            {% endif %}
{% endfor %}

        },
        "is_error": null
    }
]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>{%- endmacro %}
{%- macro tool_call_id_to_int(messages, tool_call_id) %}
{%- set counter = namespace(value=0) %}
{%- set tool_call_id_seen = namespace(value=false) %}
{%- for msg in messages %}
    {%- if msg.tool_calls %}
        {%- for tool_call in msg.tool_calls %}
            {%- if tool_call.id == tool_call_id and not tool_call_id_seen.value -%}
                {{ counter.value }}
                {%- set tool_call_id_seen.value = true %}
            {%- endif %}
            {%- set counter.value = counter.value + 1 %}
        {%- endfor %}
    {%- endif %}
{%- endfor %}
{%- endmacro %}
{%- macro format_tool_message(messages, tool_msg) -%}
{# format tool message #}
    {
        "tool_call_id": "{{ tool_call_id_to_int(messages, tool_msg.tool_call_id) }}",
        "results": {
            "0": {{ tool_msg.content|tojson }}
        },
        "is_error": null
    }
{%- endmacro -%}
{%- if messages and messages[0]['role']|lower == 'system' %}{%- set developer_preamble = messages[0]['content'] %}{% endif %}
{%- set tool_idx = namespace(value=0) %}
{%- set tool_ids_seen = namespace(value=[]) %}
{%- set sent_documents = namespace(value=false) %}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># System Preamble
You are in contextual safety mode. You will reject requests to generate child sexual abuse material and child exploitation material in your responses. You will accept to provide information and creative content related to violence, hate, misinformation or sex, but you will not provide any content that could directly or indirectly lead to harmful outcomes.

Your information cutoff date is June 2024.

You have been trained on data in English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Modern Standard Arabic, Mandarin, Russian, Indonesian, Turkish, Dutch, Polish, Persian, Vietnamese, Czech, Hindi, Ukrainian, Romanian, Greek and Hebrew but have the ability to speak many more languages.
{% if tools or documents %}

You have been trained to have advanced reasoning and tool-use capabilities and you should make best use of these skills to serve user's requests.

## Tool Use
Think about how you can make best use of the provided tools to help with the task and come up with a high level plan that you will execute first.

0. Start by writing <|START_THINKING|> followed by a detailed step by step plan of how you will solve the problem. For each step explain your thinking fully and give details of required tool calls (if needed). Unless specified otherwise, you write your plan in natural language. When you finish, close it out with <|END_THINKING|>.
    You can optionally choose to skip this step when the user request is so straightforward to address that only a trivial plan would be needed.
    NOTE: You MUST skip this step when you are directly responding to the user's request without using any tools.

Then carry out your plan by repeatedly executing the following steps.
1. Action: write <|START_ACTION|> followed by a list of JSON-formatted tool calls, with each one containing "tool_name" and "parameters" fields.
    When there are multiple tool calls which are completely independent of each other (i.e. they can be executed in parallel), you should list them out all together in one step. When you finish, close it out with <|END_ACTION|>.
2. Observation: you will then receive results of those tool calls in JSON format in the very next turn, wrapped around by <|START_TOOL_RESULT|> and <|END_TOOL_RESULT|>. Carefully observe those results and think about what to do next. Note that these results will be provided to you in a separate turn. NEVER hallucinate results.
    Every tool call produces a list of results (when a tool call produces no result or a single result, it'll still get wrapped inside a list). Each result is clearly linked to its originating tool call via its "tool_call_id".
3. Reflection: start the next turn by writing <|START_THINKING|> followed by what you've figured out so far, any changes you need to make to your plan, and what you will do next. When you finish, close it out with <|END_THINKING|>.
    You can optionally choose to skip this step when everything is going according to plan and no special pieces of information or reasoning chains need to be recorded.
    NOTE: You MUST skip this step when you are done with tool-use actions and are ready to respond to the user.

You can repeat the above 3 steps multiple times (could be 0 times too if no suitable tool calls are available or needed), until you decide it's time to finally respond to the user.

4. Response: then break out of the loop and write <|START_RESPONSE|> followed by a piece of text which serves as a response to the user's last request. Use all previous tool calls and results to help you when formulating your response. When you finish, close it out with <|END_RESPONSE|>.
{% if enable_citations %}

## Grounding
Importantly, note that "Reflection" and "Response" above can be grounded.
Grounding means you associate pieces of texts (called "spans") with those specific tool results that support them (called "sources"). And you use a pair of tags "<co>" and "</co>" to indicate when a span can be grounded onto a list of sources, listing them out in the closing tag. Sources from the same tool call are grouped together and listed as "{tool_call_id}:[{list of result indices}]", before they are joined together by ",". E.g., "<co>span</co: 0:[1,2],1:[0]>" means that "span" is supported by result 1 and 2 from "tool_call_id=0" as well as result 0 from "tool_call_id=1".
{% endif %}

## Available Tools
Here is the list of tools that you have available to you.
You can ONLY use the tools listed here. When a tool is not listed below, it is NOT available and you should NEVER attempt to use it.
Each tool is represented as a JSON object with fields like "name", "description", "parameters" (per JSON Schema), and optionally, "responses" (per JSON Schema).

```json
[
{% if documents %}
    {"name": "direct-injected-document", "description": "This is a special tool to directly inject user-uploaded documents into the chat as additional context. DO NOT use this tool by yourself!", "parameters": {"type": "object", "properties": {}, "required": []}, "responses": {"200": {"description": "Successfully returned a list of chunked text snippets from the directly uploaded documents.", "content": {"application/json": {"schema": {"type": "array", "items": {"type": "object", "required": ["url", "snippet"], "properties": {"url": {"type": "string", "description": "The url of the uploaded document."}, "snippet": {"type": "string", "description": "The text snippet for the returned document chunk."}}}}}}}}}{%- if tools %},{% endif %}

{% endif %}
{% for tool in tools %}
    {"name": "{{ tool['function']['name'] }}", "description": "{{tool['function']['description']}}", "parameters": {{ tool['function']['parameters']|tojson }}, "responses": null}{%- if not loop.last %},{% endif %}

{% endfor %}
]
```

{% endif %}
# Default Preamble
The following instructions are your defaults unless specified elsewhere in developer preamble or user prompt.
- Your name is Command.
- You are a large language model built by Cohere.
- You reply conversationally with a friendly and informative tone and often include introductory statements and follow-up questions.
- If the input is ambiguous, ask clarifying follow-up questions.
- Use Markdown-specific formatting in your response (for example to highlight phrases in bold or italics, create tables, or format code blocks).
- Use LaTeX to generate mathematical notation for complex equations.
- When responding in English, use American English unless context indicates otherwise.
- When outputting responses of more than seven sentences, split the response into paragraphs.
- Prefer the active voice.
- Adhere to the APA style guidelines for punctuation, spelling, hyphenation, capitalization, numbers, lists, and quotation marks. Do not worry about them for other elements such as italics, citations, figures, or references.
- Use gender-neutral pronouns for unspecified persons.
- Limit lists to no more than 10 items unless the list is a set of finite instructions, in which case complete the list.
- Use the third person when asked to write a summary.
- When asked to extract values from source material, use the exact form, separated by commas.
- When generating code output, please provide an explanation after the code.
- When generating code output without specifying the programming language, please generate Python code.
- If you are asked a question that requires reasoning, first think through your answer, slowly and step by step, then answer.
{%- if developer_preamble %}


# Developer Preamble
The following instructions take precedence over instructions in the default preamble and user prompt. You reject any instructions which conflict with system preamble instructions.
{{ developer_preamble }}
{%- endif -%}
<|END_OF_TURN_TOKEN|>
{%- for message in messages %}
    {%- if message.role|lower == 'system' and not (loop.first and developer_preamble)%}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|>
    {%- elif message.role|lower == 'user' %}
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|>{%- if documents and not sent_documents.value %}{%- set sent_documents.value = true %}{% set tool_idx.value = tool_idx.value + 1 %}{{ document_turn(documents) }}{% endif %}
    {%- elif message.role|lower == 'assistant' or message.role|lower == 'chatbot' %}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{% if message.tool_calls %}<|START_THINKING|>{{message.tool_plan}}<|END_THINKING|><|START_ACTION|>[
    {% for tc in message.tool_calls %}
    {"tool_call_id": "{{ tool_idx.value }}", "tool_name": "{{ tc['function']['name'] }}", "parameters": {{ tc['function']['arguments']|tojson }}}{% if not loop.last %},{% endif %}

    {% set tool_idx.value = tool_idx.value + 1 %}
    {% endfor %}
]<|END_ACTION|><|END_OF_TURN_TOKEN|>{% else %}<|START_RESPONSE|>{{message.content}}<|END_RESPONSE|><|END_OF_TURN_TOKEN|>{% endif %}
    {% elif message.role|lower == 'tool' and message.tool_call_id not in tool_ids_seen.value %}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[
{{ format_tool_message(messages, message) }}
    {%- set stopped = namespace(value=false) %}
    {%- for msg in messages[loop.index0 + 1:] %}
        {%- if not stopped.value and msg.role|lower == 'tool' %},
{{ format_tool_message(messages, msg) }}
            {%- set tool_ids_seen.value = tool_ids_seen.value + [msg.tool_call_id] %}
        {%- else %}
            {%- set stopped.value = true %}
        {%- endif %}
    {%- endfor %}

]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>
    {%- endif %}
{%- endfor %}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>


================================================
FILE: src/axolotl/utils/chat_templates/templates/command_a_tool_use.jinja
================================================
{{ bos_token }}{%- macro document_turn(documents) -%}
{# format documents into chat turn #}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|><|START_THINKING|>I will look through the document to address the users needs.<|END_THINKING|><|START_ACTION|>[
    {"tool_call_id": "0", "tool_name": "direct-injected-document", "parameters": {}}
]<|END_ACTION|><|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[
    {
        "tool_call_id": "0",
        "results": {
{% for doc in documents %}
            "{{ loop.index0 }}": {{doc|tojson}}{% if not loop.last %},
            {% endif %}
{% endfor %}

        },
        "is_error": null
    }
]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>{%- endmacro %}
{%- macro tool_call_id_to_int(messages, tool_call_id) %}
{%- set counter = namespace(value=0) %}
{%- set tool_call_id_seen = namespace(value=false) %}
{%- for msg in messages %}
    {%- if msg.tool_calls %}
        {%- for tool_call in msg.tool_calls %}
            {%- if tool_call.id == tool_call_id and not tool_call_id_seen.value -%}
                {{ counter.value }}
                {%- set tool_call_id_seen.value = true %}
            {%- endif %}
            {%- set counter.value = counter.value + 1 %}
        {%- endfor %}
    {%- endif %}
{%- endfor %}
{%- endmacro %}
{%- macro format_tool_message(messages, tool_msg) -%}
{# format tool message #}
    {
        "tool_call_id": "{{ tool_call_id_to_int(messages, tool_msg.tool_call_id) }}",
        "results": {
            "0": {{ tool_msg.content|tojson }}
        },
        "is_error": null
    }
{%- endmacro -%}
{%- if messages and messages[0]['role']|lower == 'system' %}{%- set developer_preamble = messages[0]['content'] %}{% endif %}
{%- set tool_idx = namespace(value=0) %}
{%- set tool_ids_seen = namespace(value=[]) %}
{%- set sent_documents = namespace(value=false) %}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|># System Preamble
You are in contextual safety mode. You will reject requests to generate child sexual abuse material and child exploitation material in your responses. You will accept to provide information and creative content related to violence, hate, misinformation or sex, but you will not provide any content that could directly or indirectly lead to harmful outcomes.

Your information cutoff date is June 2024.

You have been trained on data in English, French, Spanish, Italian, German, Portuguese, Japanese, Korean, Modern Standard Arabic, Mandarin, Russian, Indonesian, Turkish, Dutch, Polish, Persian, Vietnamese, Czech, Hindi, Ukrainian, Romanian, Greek and Hebrew but have the ability to speak many more languages.
{% if tools or documents %}

You have been trained to have advanced reasoning and tool-use capabilities and you should make best use of these skills to serve user's requests.

## Tool Use
Think about how you can make best use of the provided tools to help with the task and come up with a high level plan that you will execute first.

0. Start by writing <|START_THINKING|> followed by a detailed step by step plan of how you will solve the problem. For each step explain your thinking fully and give details of required tool calls (if needed). Unless specified otherwise, you write your plan in natural language. When you finish, close it out with <|END_THINKING|>.
    You can optionally choose to skip this step when the user request is so straightforward to address that only a trivial plan would be needed.
    NOTE: You MUST skip this step when you are directly responding to the user's request without using any tools.

Then carry out your plan by repeatedly executing the following steps.
1. Action: write <|START_ACTION|> followed by a list of JSON-formatted tool calls, with each one containing "tool_name" and "parameters" fields.
    When there are multiple tool calls which are completely independent of each other (i.e. they can be executed in parallel), you should list them out all together in one step. When you finish, close it out with <|END_ACTION|>.
2. Observation: you will then receive results of those tool calls in JSON format in the very next turn, wrapped around by <|START_TOOL_RESULT|> and <|END_TOOL_RESULT|>. Carefully observe those results and think about what to do next. Note that these results will be provided to you in a separate turn. NEVER hallucinate results.
    Every tool call produces a list of results (when a tool call produces no result or a single result, it'll still get wrapped inside a list). Each result is clearly linked to its originating tool call via its "tool_call_id".
3. Reflection: start the next turn by writing <|START_THINKING|> followed by what you've figured out so far, any changes you need to make to your plan, and what you will do next. When you finish, close it out with <|END_THINKING|>.
    You can optionally choose to skip this step when everything is going according to plan and no special pieces of information or reasoning chains need to be recorded.
    NOTE: You MUST skip this step when you are done with tool-use actions and are ready to respond to the user.

You can repeat the above 3 steps multiple times (could be 0 times too if no suitable tool calls are available or needed), until you decide it's time to finally respond to the user.

4. Response: then break out of the loop and write <|START_RESPONSE|> followed by a piece of text which serves as a response to the user's last request. Use all previous tool calls and results to help you when formulating your response. When you finish, close it out with <|END_RESPONSE|>.
{% if enable_citations %}

## Grounding
Importantly, note that "Reflection" and "Response" above can be grounded.
Grounding means you associate pieces of texts (called "spans") with those specific tool results that support them (called "sources"). And you use a pair of tags "<co>" and "</co>" to indicate when a span can be grounded onto a list of sources, listing them out in the closing tag. Sources from the same tool call are grouped together and listed as "{tool_call_id}:[{list of result indices}]", before they are joined together by ",". E.g., "<co>span</co: 0:[1,2],1:[0]>" means that "span" is supported by result 1 and 2 from "tool_call_id=0" as well as result 0 from "tool_call_id=1".
{% endif %}

## Available Tools
Here is the list of tools that you have available to you.
You can ONLY use the tools listed here. When a tool is not listed below, it is NOT available and you should NEVER attempt to use it.
Each tool is represented as a JSON object with fields like "name", "description", "parameters" (per JSON Schema), and optionally, "responses" (per JSON Schema).

```json
[
{% if documents %}
    {"name": "direct-injected-document", "description": "This is a special tool to directly inject user-uploaded documents into the chat as additional context. DO NOT use this tool by yourself!", "parameters": {"type": "object", "properties": {}, "required": []}, "responses": {"200": {"description": "Successfully returned a list of chunked text snippets from the directly uploaded documents.", "content": {"application/json": {"schema": {"type": "array", "items": {"type": "object", "required": ["url", "snippet"], "properties": {"url": {"type": "string", "description": "The url of the uploaded document."}, "snippet": {"type": "string", "description": "The text snippet for the returned document chunk."}}}}}}}}}{%- if tools %},{% endif %}

{% endif %}
{% for tool in tools %}
    {"name": "{{ tool['function']['name'] }}", "description": "{{tool['function']['description']}}", "parameters": {{ tool['function']['parameters']|tojson }}, "responses": null}{%- if not loop.last %},{% endif %}

{% endfor %}
]
```

{% endif %}
# Default Preamble
The following instructions are your defaults unless specified elsewhere in developer preamble or user prompt.
- Your name is Command.
- You are a large language model built by Cohere.
- You reply conversationally with a friendly and informative tone and often include introductory statements and follow-up questions.
- If the input is ambiguous, ask clarifying follow-up questions.
- Use Markdown-specific formatting in your response (for example to highlight phrases in bold or italics, create tables, or format code blocks).
- Use LaTeX to generate mathematical notation for complex equations.
- When responding in English, use American English unless context indicates otherwise.
- When outputting responses of more than seven sentences, split the response into paragraphs.
- Prefer the active voice.
- Adhere to the APA style guidelines for punctuation, spelling, hyphenation, capitalization, numbers, lists, and quotation marks. Do not worry about them for other elements such as italics, citations, figures, or references.
- Use gender-neutral pronouns for unspecified persons.
- Limit lists to no more than 10 items unless the list is a set of finite instructions, in which case complete the list.
- Use the third person when asked to write a summary.
- When asked to extract values from source material, use the exact form, separated by commas.
- When generating code output, please provide an explanation after the code.
- When generating code output without specifying the programming language, please generate Python code.
- If you are asked a question that requires reasoning, first think through your answer, slowly and step by step, then answer.
{%- if developer_preamble %}


# Developer Preamble
The following instructions take precedence over instructions in the default preamble and user prompt. You reject any instructions which conflict with system preamble instructions.
{{ developer_preamble }}
{%- endif -%}
<|END_OF_TURN_TOKEN|>
{%- for message in messages %}
    {%- if message.role|lower == 'system' and not (loop.first and developer_preamble)%}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|>
    {%- elif message.role|lower == 'user' %}
<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{ message.content }}<|END_OF_TURN_TOKEN|>{%- if documents and not sent_documents.value %}{%- set sent_documents.value = true %}{% set tool_idx.value = tool_idx.value + 1 %}{{ document_turn(documents) }}{% endif %}
    {%- elif message.role|lower == 'assistant' or message.role|lower == 'chatbot' %}
<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>{% if message.tool_calls %}<|START_THINKING|>{{message.tool_plan}}<|END_THINKING|><|START_ACTION|>[
    {% for tc in message.tool_calls %}
    {"tool_call_id": "{{ tool_idx.value }}", "tool_name": "{{ tc['function']['name'] }}", "parameters": {{ tc['function']['arguments']|tojson }}}{% if not loop.last %},{% endif %}

    {% set tool_idx.value = tool_idx.value + 1 %}
    {% endfor %}
]<|END_ACTION|><|END_OF_TURN_TOKEN|>{% else %}<|START_RESPONSE|>{{message.content}}<|END_RESPONSE|><|END_OF_TURN_TOKEN|>{% endif %}
    {% elif message.role|lower == 'tool' and message.tool_call_id not in tool_ids_seen.value %}
<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|><|START_TOOL_RESULT|>[
{{ format_tool_message(messages, message) }}
    {%- set stopped = namespace(value=false) %}
    {%- for msg in messages[loop.index0 + 1:] %}
        {%- if not stopped.value and msg.role|lower == 'tool' %},
{{ format_tool_message(messages, msg) }}
            {%- set tool_ids_seen.value = tool_ids_seen.value + [msg.tool_call_id] %}
        {%- else %}
            {%- set stopped.value = true %}
        {%- endif %}
    {%- endfor %}

]<|END_TOOL_RESULT|><|END_OF_TURN_TOKEN|>
    {%- endif %}
{%- endfor %}<|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>


================================================
FILE: src/axolotl/utils/chat_templates/templates/deepseek_v2.jinja
================================================
{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{{ bos_token }}{% for message in messages %}{% if message['role'] == 'user' %}{{ '<｜User｜>' + message['content'] }}{% elif message['role'] == 'assistant' %}{{ '<｜Assistant｜>' + message['content'] + eos_token }}{% elif message['role'] == 'system' %}{{ message['content'] + '

' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<｜Assistant｜>' }}{% endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/deepseek_v3.jinja
================================================
{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set ns = namespace(is_first=false, is_tool=false, is_output_first=true, system_prompt='', is_first_sp=true) %}{%- for message in messages %}{%- if message['role'] == 'system' %}{%- if ns.is_first_sp %}{% set ns.system_prompt = ns.system_prompt + message['content'] %}{% set ns.is_first_sp = false %}{%- else %}{% set ns.system_prompt = ns.system_prompt + '\n\n' + message['content'] %}{%- endif %}{%- endif %}{%- endfor %}{{ bos_token }}{{ ns.system_prompt }}{%- for message in messages %}{%- if message['role'] == 'user' %}{%- set ns.is_tool = false -%}{{'<｜User｜>' + message['content']}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' in message %}{%- set ns.is_tool = false -%}{%- for tool in message['tool_calls'] %}{%- if not ns.is_first %}{%- if message['content'] is none %}{{'<｜Assistant｜><｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- else %}{{'<｜Assistant｜>' + message['content'] + '<｜tool▁calls▁begin｜><｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- endif %}{%- set ns.is_first = true -%}{%- else %}{{'\n' + '<｜tool▁call▁begin｜>' + tool['type'] + '<｜tool▁sep｜>' + tool['function']['name'] + '\n' + '```json' + '\n' + tool['function']['arguments'] + '\n' + '```' + '<｜tool▁call▁end｜>'}}{%- endif %}{%- endfor %}{{'<｜tool▁calls▁end｜><｜end▁of▁sentence｜>'}}{%- endif %}{%- if message['role'] == 'assistant' and 'tool_calls' not in message %}{%- if ns.is_tool %}{{'<｜tool▁outputs▁end｜>' + message['content'] + '<｜end▁of▁sentence｜>'}}{%- set ns.is_tool = false -%}{%- else %}{% set content = message['content'] %}{% if '</think>' in content %}{% set content = content.split('</think>')[-1] %}{% endif %}{{'<｜Assistant｜>' + content + '<｜end▁of▁sentence｜>'}}{%- endif %}{%- endif %}{%- if message['role'] == 'tool' %}{%- set ns.is_tool = true -%}{%- if ns.is_output_first %}{{'<｜tool▁outputs▁begin｜><｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- set ns.is_output_first = false %}{%- else %}{{'<｜tool▁output▁begin｜>' + message['content'] + '<｜tool▁output▁end｜>'}}{%- endif %}{%- endif %}{%- endfor -%}{% if ns.is_tool %}{{'<｜tool▁outputs▁end｜>'}}{% endif %}{% if add_generation_prompt and not ns.is_tool %}{{'<｜Assistant｜>'}}{% endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/exaone.jinja
================================================
{% for message in messages %}{% if loop.first and message['role'] != 'system' %}{{ '[|system|][|endofturn|]
' }}{% endif %}{{ '[|' + message['role'] + '|]' + message['content'] }}{% if message['role'] == 'user' %}{{ '
' }}{% else %}{{ '[|endofturn|]
' }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '[|assistant|]' }}{% endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/exaone4.jinja
================================================
{%- if not skip_think is defined %}
  {%- set skip_think = true %}
{%- endif %}
{%- set role_indicators = {
    'user': '[|user|]\n',
    'assistant': '[|assistant|]\n',
    'system': '[|system|]\n',
    'tool': '[|tool|]\n'
} %}
{%- set end_of_turn = '[|endofturn|]\n' %}
{%- macro available_tools(tools) %}
    {{- "# Available Tools" }}
    {{- "\nYou can use none, one, or multiple of the following tools by calling them as functions to help with the user’s query." }}
    {{- "\nHere are the tools available to you in JSON format within <tool> and </tool> tags:\n" }}
    {%- for tool in tools %}
        {{- "<tool>" }}
        {{- tool | tojson(ensure_ascii=False) | safe }}
        {{- "</tool>\n" }}
    {%- endfor %}
    {{- "\nFor each function call you want to make, return a JSON object with function name and arguments within <tool_call> and </tool_call> tags, like:" }}
    {{- "\n<tool_call>{\"name\": function_1_name, \"arguments\": {argument_1_name: argument_1_value, argument_2_name: argument_2_value}}</tool_call>" }}
    {{- "\n<tool_call>{\"name\": function_2_name, \"arguments\": {...}}</tool_call>\n..." }}
    {{- "\nNote that if no argument name is specified for a tool, you can just print the argument value directly, without the argument name or JSON formatting." }}
{%- endmacro %}
{%- set ns = namespace(last_query_index = messages|length - 1) %}
{%- for message in messages %}
    {%- if message.role == "user" and message.content is string %}
        {%- set ns.last_query_index = loop.index0 -%}
    {%- endif %}
{%- endfor %}
{%- for i in range(messages | length) %}
    {%- set msg = messages[i] %}
    {%- set role = msg.role %}
    {%- if role not in role_indicators %}
        {{- raise_exception('Unknown role: ' ~ role) }}
    {%- endif %}
    {# ---- Case A: If the first message is "system", handle it here alone (without continue) ---- #}
    {%- if i == 0 and role == 'system' %}
            {{- role_indicators['system'] }}
            {{- msg.content }}
            {%- if tools is defined and tools %}
                {{- "\n\n" }}{{- available_tools(tools) }}
            {%- endif %}
            {{- end_of_turn -}}
    {%- else %}
    {# ---- Case B: If the first message is tools instead of system, inject the system tools preamble ---- #}
        {%- if i == 0 and tools is defined and tools %}
            {{- role_indicators['system'] }}
            {{- available_tools(tools) }}
            {{- end_of_turn -}}
        {%- endif %}
    {%- endif %}
    {%- if role == 'assistant' %}
        {{- role_indicators['assistant'] }}
        {%- if msg.content %}
            {%- if "</think>" in msg.content %}
                {%- set content = msg.content.split('</think>')[-1].strip() %}
                {%- set reasoning_content = msg.content.split('</think>')[0].strip() %}
                {%- if reasoning_content.startswith("<think>") %}
                    {%- set reasoning_content = reasoning_content[7:].strip() %}
                {%- endif %}
            {%- else %}
                {%- set content = msg.content %}
            {%- endif %}
            {%- if msg.reasoning_content %}
                {%- set reasoning_content = msg.reasoning_content %}
            {%- endif %}
            {%- if (not skip_think and loop.last) and reasoning_content is defined %}
                {{- "<think>\n" }}
                {{- reasoning_content}}
                {{- "\n</think>\n\n" }}
            {%- else %}
                {{- "<think>\n\n</think>\n\n" }}
            {%- endif %}
            {{- content }}
        {%- endif %}
        {%- if msg.tool_calls %}
            {%- if msg.content %}
                {{- "\n" }}
            {%- else %}
                {{- "<think>\n\n</think>\n\n" }}
            {%- endif %}
            {%- for tool_call in msg.tool_calls %}
                {%- if tool_call.function is defined %}
                    {%- set tool_call = tool_call.function %}
                {%- endif %}
                {%- if tool_call.arguments is defined %}
                    {%- set arguments = tool_call.arguments %}
                {%- elif tool_call.parameters is defined %}
                    {%- set arguments = tool_call.parameters %}
                {%- else %}
                    {{- raise_exception('arguments or parameters are mandatory: ' ~ tool_call) }}
                {%- endif %}
                {{- "<tool_call>" }}{"name": "{{- tool_call.name }}", "arguments": {{ arguments | tojson(ensure_ascii=False) | safe }}}{{- "</tool_call>" }}
                {%- if not loop.last %}
                    {{- "\n" }}
                {%- endif %}
            {%- endfor %}
        {%- endif %}
        {{- end_of_turn -}}
    {%- elif role == "tool" %}
        {%- if i == 0 or messages[i - 1].role != "tool" %}
            {{- role_indicators['tool'] }}
        {%- endif %}
        {%- if msg.content is defined %}
            {{- "<tool_result>" }}{"result": {{ msg.content | tojson(ensure_ascii=False) | safe }}}{{- "</tool_result>" }}
        {%- endif %}
        {%- if loop.last or messages[i + 1].role != "tool" %}
            {{- end_of_turn -}}
        {%- else %}
            {{- "\n" }}
        {%- endif %}
    {%- else %}
        {{- role_indicators[role] }}
        {{- msg.content }}
        {{- end_of_turn -}}
    {%- endif %}
{% endfor %}
{%- if add_generation_prompt %}
    {{- role_indicators['assistant'] }}
    {%- if enable_thinking is defined and enable_thinking is true %}
        {{- "<think>\n" }}
    {%- else %}
        {{- "<think>\n\n</think>\n\n" }}
    {%- endif %}
{%- endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/falcon_h1.jinja
================================================
'{{bos_token}}
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "You are a function calling AI model. You are provided with function signature within <tools> </tools> XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.\n<tools>\n" }}
    {%- for tool in tools %}[{{- tool | tojson }}]{%- endfor %}
    {{- "\n</tools>\nFor each function call, return a json object with function name and arguments within <tool_call> </tool_call> tags with the following schema:\n<tool_call>\n{'arguments': <args-dict>, 'name': <function-name>}\n</tool_call>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}{% for message in messages %}{%- if message.role != 'system' %}{{'<|im_start|>' + message['role'] + '
' + message['content'] + '<|im_end|>' + '
'}}{%- endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant
' }}{% endif %}'


================================================
FILE: src/axolotl/utils/chat_templates/templates/gemma.jinja
================================================
{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '
' + message['content'] | trim + '<end_of_turn>
' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model
'}}{% endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/gemma3.jinja
================================================
{{ bos_token }}
{%- if messages[0]['role'] == 'system' -%}
    {%- if messages[0]['content'] is string -%}
        {%- set first_user_prefix = messages[0]['content'] + '

' -%}
    {%- else -%}
        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '

' -%}
    {%- endif -%}
    {%- set loop_messages = messages[1:] -%}
{%- else -%}
    {%- set first_user_prefix = "" -%}
    {%- set loop_messages = messages -%}
{%- endif -%}
{%- for message in loop_messages -%}
    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
    {%- endif -%}
    {%- if (message['role'] == 'assistant') -%}
        {%- set role = "model" -%}
    {%- else -%}
        {%- set role = message['role'] -%}
    {%- endif -%}
    {{ '<start_of_turn>' + role + '
' + (first_user_prefix if loop.first else "") }}
    {%- if message['content'] is string -%}
        {{ message['content'] | trim }}
    {%- elif message['content'] is iterable -%}
        {%- for item in message['content'] -%}
            {%- if item['type'] == 'image' -%}
                {{ '<start_of_image>' }}
            {%- elif item['type'] == 'text' -%}
                {{ item['text'] | trim }}
            {%- endif -%}
        {%- endfor -%}
    {%- else -%}
        {{ raise_exception("Invalid content type") }}
    {%- endif -%}
    {{ '<end_of_turn>
' }}
{%- endfor -%}
{%- if add_generation_prompt -%}
    {{'<start_of_turn>model
'}}
{%- endif -%}


================================================
FILE: src/axolotl/utils/chat_templates/templates/gemma3n.jinja
================================================
{{ bos_token }}
{%- if messages[0]['role'] == 'system' -%}
    {%- if messages[0]['content'] is string -%}
        {%- set first_user_prefix = messages[0]['content'] + '

' -%}
    {%- else -%}
        {%- set first_user_prefix = messages[0]['content'][0]['text'] + '

' -%}
    {%- endif -%}
    {%- set loop_messages = messages[1:] -%}
{%- else -%}
    {%- set first_user_prefix = "" -%}
    {%- set loop_messages = messages -%}
{%- endif -%}
{%- for message in loop_messages -%}
    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) -%}
        {{ raise_exception("Conversation roles must alternate user/assistant/user/assistant/...") }}
    {%- endif -%}
    {%- if (message['role'] == 'assistant') -%}
        {%- set role = "model" -%}
    {%- else -%}
        {%- set role = message['role'] -%}
    {%- endif -%}
    {{ '<start_of_turn>' + role + '
' + (first_user_prefix if loop.first else "") }}
    {%- if message['content'] is string -%}
        {{ message['content'] | trim }}
    {%- elif message['content'] is iterable -%}
        {%- for item in message['content'] -%}
            {%- if item['type'] == 'audio' -%}
                {{ '<audio_soft_token>' }}
            {%- elif item['type'] == 'image' -%}
                {{ '<image_soft_token>' }}
            {%- elif item['type'] == 'text' -%}
                {{ item['text'] | trim }}
            {%- endif -%}
        {%- endfor -%}
    {%- else -%}
        {{ raise_exception("Invalid content type") }}
    {%- endif -%}
    {{ '<end_of_turn>
' }}
{%- endfor -%}
{%- if add_generation_prompt -%}
    {{'<start_of_turn>model
'}}
{%- endif -%}


================================================
FILE: src/axolotl/utils/chat_templates/templates/jamba.jinja
================================================
{# Variables #}
{% set ns = namespace(message_count=0, is_last_checked_defined=False) %}
{##}
{% set bom_str = bom_str or "<|bom|>" %}
{% set eom_str = eom_str or "<|eom|>" %}
{% set default_system_message = "" %}
{##}
{% set documents_prefix = "<documents>" %}
{% set documents_suffix = "</documents>" %}
{% set tool_definitions_prefix = "<tool_definitions>" %}
{% set tool_definitions_suffix = "</tool_definitions>" %}
{% set active_modes_prefix = "<active_output_modes>" %}
{% set active_modes_suffix = "</active_output_modes>" %}
{##}
{% set tool_calls_prefix = "<tool_calls>" %}
{% set tool_calls_suffix = "</tool_calls>" %}
{% set citations_prefix = "<citations>" %}
{% set citations_suffix = "</citations>" %}
{##}
{% if add_generation_prompt is not defined %}
  {% set add_generation_prompt = True %}
{% endif %}
{% set role_to_predict = role_to_predict or "assistant" %}
{% if messages|length > 0 and messages[0].role == "system" %}
  {% set system_message = messages[0].content %}
  {% set loop_messages = messages[1:] %}
{% else %}
  {% set system_message = default_system_message %}
  {% set loop_messages = messages %}
{% endif %}
{##}
{##}
{# Macros #}
{% macro handle_tool_definitions(tools) %}
  {{- tool_definitions_prefix -}}
  {{- "\n# Tools" -}}
  {{- "\n\n## Functions" -}}
  {% for tool in tools %}
    {% set _ = is_param_set(tool, field="type") %}
    {% set is_tool_type_set = ns.is_last_checked_defined %}
    {% if is_tool_type_set %}
      {% if tool.type == "function" %}
        {% set tool = tool.function %}
      {% else %}
        {{ raise_exception("Currently, the only supported tool type is `function`") }}
      {% endif %}
    {% endif %}
    {{- "\n\n" + (tool|tojson(indent=2)) -}}
  {% endfor %}
  {{- "\n" + tool_definitions_suffix -}}
{% endmacro %}
{##}
{% macro handle_first_system_message(system_message, tools) %}
  {{- bom_str + handle_role("system") -}}
  {% set _ = is_param_set(system_message) %}
  {% set is_system_message_set = ns.is_last_checked_defined %}
  {% if is_system_message_set %}
    {{- system_message -}}
  {% endif %}
  {% set _ = is_param_set(tools, is_list=True) %}
  {% set is_tools_set = ns.is_last_checked_defined %}
  {% if is_tools_set %}
    {% if system_message %}
      {{- "\n\n" -}}
    {% endif %}
    {{- handle_tool_definitions(tools) -}}
  {% endif %}
  {% set ns.message_count = ns.message_count + 1 %}
{% endmacro %}
{##}
{% macro handle_tool_calls(tool_calls) %}
  {{- tool_calls_prefix + "[\n" -}}
  {% for tool_call in tool_calls %}
    {% set _ = is_param_set(tool_call, field="function") %}
    {% set is_tool_call_function_set = ns.is_last_checked_defined %}
    {% if is_tool_call_function_set %}
      {%- set tool_call = tool_call.function %}
    {%- endif %}
    {% set arguments = tool_call.arguments %}
    {% if arguments is not string %}
      {%- set arguments = arguments|tojson -%}
    {%- endif %}
    {{ "{\"name\": \"" + tool_call.name + "\", \"arguments\": " + arguments + "}" -}}
    {% if not loop.last %}
      {{- "," }}
    {% endif %}
  {% endfor %}
  {{- "\n]" + tool_calls_suffix -}}
{% endmacro %}
{##}
{% macro handle_documents(documents) %}
  {{- documents_prefix -}}
  {{- "\n# Documents" -}}
  {{- "\n\nYou can use the following documents for reference:" -}}
  {% for doc in documents %}
    {{- "\n\n## Document ID: " + loop.index0|string -}}
    {% set _ = is_param_set(doc, field="title") %}
    {% set is_doc_title_set = ns.is_last_checked_defined %}
    {% if is_doc_title_set %}
      {{- "\nTitle: " + doc.title -}}
    {% endif %}
    {% for key, value in doc.items() %}
      {% if key not in ["title", "text"] %}
        {{- "\n" + key|title + ": " + value|string -}}
      {% endif %}
    {% endfor %}
    {{- "\nText: " + doc.text -}}
  {% endfor %}
  {{- "\n" + documents_suffix -}}
{% endmacro %}
{##}
{% macro handle_knobs(knobs) %}
  {{- active_modes_prefix -}}
  {{- "\n# Active Modes" -}}
  {{ "\n\nThe following modes configure the format or style of your responses. You should adhere to all currently" -}}
  {{ " active modes simultaneously." -}}
  {% if knobs.citation_mode == "fast" %}
    {{- "\n\n## Citation Mode" -}}
    {{- "\n\nProvide a list of references only for the documents you base your response on. Format your response" -}}
    {{ " with the original answer followed by a citation section. Use this template:" -}}
    {{ " `{answer}" + citations_prefix + "DOCUMENT_IDS" + citations_suffix + "`, where DOCUMENT_IDS are the relevant document numbers" -}}
    {{ " (e.g. [2, 5, 9]), or [] if the answer cannot be supported by the provided documents." -}}
  {% endif %}
  {% if knobs.response_format == "json_object" %}
    {{- "\n\n## JSON Mode" -}}
    {{ "\n\nProvide your response in JSON format. Adhere strictly to any schema given by the user." -}}
    {{ " If an appropriate JSON format exists, use it without modification." -}}
  {% endif %}
  {{- "\n" + active_modes_suffix -}}
{% endmacro %}
{##}
{% macro get_last_user_index(messages) %}
  {% set ns.last_user_index = 0 %}
  {% for message in messages %}
    {% if message.role == 'user' %}
      {% set ns.last_user_index = loop.index0 %}
    {% endif %}
  {% endfor %}
  {{- ns.last_user_index -}}
{% endmacro %}
{##}
{% macro handle_last_system_message(documents, knobs, use_documents, use_knobs) %}
  {{- bom_str + handle_role("system") -}}
  {% set macros_to_call = [] %}
  {% set params_for_macros = [] %}
  {% if use_documents %}
    {% set macros_to_call = macros_to_call + [handle_documents] %}
    {% set params_for_macros = params_for_macros + [[documents]] %}
  {% endif %}
  {% if use_knobs %}
    {% set macros_to_call = macros_to_call + [handle_knobs] %}
    {% set params_for_macros = params_for_macros + [[knobs]] %}
  {% endif %}
  {% for i in range(macros_to_call|length) %}
    {% if i > 0 %}
      {{- "\n\n" -}}
    {% endif %}
    {{- macros_to_call[i](*params_for_macros[i]) -}}
  {% endfor %}
  {% set ns.message_count = ns.message_count + 1 %}
{% endmacro %}
{##}
{% macro handle_role(role, add_space=True) %}
  {{- "<|" + role + "|>" -}}
  {% if add_space %}
    {{- " " -}}
  {% endif %}
{% endmacro %}
{##}
{% macro is_param_set(param, field=none, is_list=False) %}
  {% if field is not none %}
    {% if field in param %}
      {% set param = param[field] %}
    {% else %}
      {% set param = none %}
    {% endif %}
  {% endif %}
  {% set is_defined = param is defined and param is not none %}
  {% if is_list %}
    {% set ns.is_last_checked_defined = is_defined and param|length > 0 %}
  {% else %}
    {% set ns.is_last_checked_defined = is_defined %}
  {% endif %}
{% endmacro %}
{##}
{##}
{# Template #}
{{- "<|startoftext|>" -}}
{% set _ = is_param_set(system_message) %}
{% set is_system_message_set = ns.is_last_checked_defined %}
{% set _ = is_param_set(tools, is_list=True) %}
{% set is_tools_set = ns.is_last_checked_defined %}
{% set has_system_message = (is_system_message_set or is_tools_set) %}
{% if has_system_message %}
  {{- handle_first_system_message(system_message, tools) -}}
{% endif %}
{% set last_user_index = get_last_user_index(loop_messages)|int %}
{% for message in loop_messages %}
  {% if loop.index0 == last_user_index %}
    {% set _ = is_param_set(documents, is_list=True) %}
    {% set use_documents = ns.is_last_checked_defined %}
    {% set _ = is_param_set(knobs) %}
    {% set use_knobs = ns.is_last_checked_defined and knobs.is_set %}
    {% set add_last_system_message = use_documents or use_knobs %}
    {% if add_last_system_message %}
      {% if ns.message_count > 0 %}
        {{- eom_str -}}
      {% endif %}
      {{- handle_last_system_message(documents, knobs, use_documents, use_knobs) -}}
    {% endif %}
  {% endif %}
  {% set role = message.role %}
  {% set _ = is_param_set(message, field="name") %}
  {% set is_message_name_set = ns.is_last_checked_defined %}
  {% if is_message_name_set %}
    {% set message_prefix = handle_role(role) + "(" + message.name + ")" %}
  {% else %}
    {% set message_prefix = handle_role(role) %}
  {% endif %}
  {% set content = (message.content or "") %}
  {% if content is not string %}
    {% set content = content|tojson %}
  {% endif %}
  {% if ns.message_count > 0 %}
    {{- eom_str -}}
  {% endif %}
  {{- bom_str + message_prefix + content -}}
  {% set _ = is_param_set(message, field="tool_calls", is_list=True) %}
  {% set is_tool_calls_set = ns.is_last_checked_defined %}
  {% if role == "assistant" and is_tool_calls_set %}
    {{- handle_tool_calls(message.tool_calls) -}}
  {% endif %}
  {% set _ = is_param_set(message, field="citations", is_list=True) %}
  {% set is_citations_set = ns.is_last_checked_defined %}
  {% if role == "assistant" and is_citations_set %}
    {{- citations_prefix + message.citations|map(attribute="document_id")|list|string + citations_suffix -}}
  {% endif %}
  {% set ns.message_count = ns.message_count + 1 %}
{% endfor %}
{% if add_generation_prompt %}
  {% if ns.message_count > 0 %}
    {{- eom_str -}}
  {% endif %}
  {{- bom_str + handle_role(role_to_predict, add_space=False) -}}
  {% set _ = is_param_set(generation_preamble) %}
  {% set is_generation_preamble_set = ns.is_last_checked_defined %}
  {% if is_generation_preamble_set and generation_preamble.strip() != "" %}
    {{- " " + generation_preamble -}}
  {% endif %}
  {% set ns.message_count = ns.message_count + 1 %}
{% else %}
  {% if ns.message_count > 0 %}
    {{- eom_str -}}
  {% endif %}
{% endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/llama3.jinja
================================================
{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% set loop_messages = messages %}{% for message in loop_messages %}{% set content = '<|start_header_id|>' + message['role'] + '<|end_header_id|>

'+ message['content'] | trim + '<|eot_id|>' %}{% if loop.index0 == 0 %}{% set content = bos_token + content %}{% endif %}{{ content }}{% endfor %}{% if add_generation_prompt %}{{ '<|start_header_id|>assistant<|end_header_id|>

' }}{% endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/llama3_2_vision.jinja
================================================
{{- bos_token }}
{%- if custom_tools is defined %}
    {%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
    {%- set tools_in_user_message = true %}
{%- endif %}
{%- if not date_string is defined %}
    {%- if strftime_now is defined %}
        {%- set date_string = strftime_now("%d %b %Y") %}
    {%- else %}
        {%- set date_string = "26 Jul 2024" %}
    {%- endif %}
{%- endif %}
{%- if not tools is defined %}
    {%- set tools = none %}
{%- endif %}

{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
    {%- set system_message = messages[0]['content']|trim %}
    {%- set messages = messages[1:] %}
{%- else %}
    {%- set system_message = "" %}
{%- endif %}

{#- Find out if there are any images #}
{% set image_ns = namespace(has_images=false) %}
{%- for message in messages %}
    {%- for content in message['content'] %}
        {%- if content['type'] == 'image' %}
            {%- set image_ns.has_images = true %}
        {%- endif %}
    {%- endfor %}
{%- endfor %}

{#- Error out if there are images and system message #}
{%- if image_ns.has_images and not system_message == "" %}
    {{- raise_exception("Prompting with images is incompatible with system messages.") }}
{%- endif %}

{#- System message if there are no images #}
{%- if not image_ns.has_images %}
    {{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
    {%- if tools is not none %}
        {{- "Environment: ipython\n" }}
    {%- endif %}
    {{- "Cutting Knowledge Date: December 2023\n" }}
    {{- "Today Date: " + date_string + "\n\n" }}
    {%- if tools is not none and not tools_in_user_message %}
        {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
        {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
        {{- "Do not use variables.\n\n" }}
        {%- for t in tools %}
            {{- t | tojson(indent=4) }}
            {{- "\n\n" }}
        {%- endfor %}
    {%- endif %}
    {{- system_message }}
    {{- "<|eot_id|>" }}
{%- endif %}

{#- Custom tools are passed in a user message with some extra guidance #}
{%- if tools_in_user_message and not tools is none %}
    {#- Extract the first user message so we can plug it in here #}
    {%- if messages | length != 0 %}
        {%- set first_user_message = messages[0]['content']|trim %}
        {%- set messages = messages[1:] %}
    {%- else %}
        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
{%- endif %}
    {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
    {{- "Given the following functions, please respond with a JSON for a function call " }}
    {{- "with its proper arguments that best answers the given prompt.\n\n" }}
    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
    {{- "Do not use variables.\n\n" }}
    {%- for t in tools %}
        {{- t | tojson(indent=4) }}
        {{- "\n\n" }}
    {%- endfor %}
    {{- first_user_message + "<|eot_id|>"}}
{%- endif %}

{%- for message in messages %}
    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
    {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n' }}
        {%- if message['content'] is string %}
            {{- message['content'] }}
        {%- else %}
            {%- for content in message['content'] %}
                {%- if content['type'] == 'image' %}
                    {{- '<|image|>' }}
                {%- elif content['type'] == 'text' %}
                    {{- content['text'] }}
                {%- endif %}
            {%- endfor %}
        {%- endif %}
        {{- '<|eot_id|>' }}
    {%- elif 'tool_calls' in message %}
        {%- if not message.tool_calls|length == 1 %}
            {{- raise_exception("This model only supports single tool-calls at once!") }}
        {%- endif %}
        {%- set tool_call = message.tool_calls[0].function %}
        {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
        {{- '{"name": "' + tool_call.name + '", ' }}
        {{- '"parameters": ' }}
        {{- tool_call.arguments | tojson }}
        {{- "}" }}
        {{- "<|eot_id|>" }}
    {%- elif message.role == "tool" or message.role == "ipython" %}
        {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
        {%- if message.content is mapping or message.content is iterable %}
            {{- message.content | tojson }}
        {%- else %}
            {{- message.content }}
        {%- endif %}
        {{- "<|eot_id|>" }}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
{%- endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/llama4.jinja
================================================
{{- bos_token }}
{%- if custom_tools is defined %}
    {%- set tools = custom_tools %}
{%- endif %}
{%- if not tools_in_user_message is defined %}
    {%- set tools_in_user_message = true %}
{%- endif %}
{%- if not date_string is defined %}
    {%- if strftime_now is defined %}
        {%- set date_string = strftime_now("%d %b %Y") %}
    {%- else %}
        {%- set date_string = "26 Jul 2024" %}
    {%- endif %}
{%- endif %}
{%- if not tools is defined %}
    {%- set tools = none %}
{%- endif %}

{#- This block extracts the system message, so we can slot it into the right place. #}
{%- if messages[0]['role'] == 'system' %}
    {%- if messages[0]['content'] is string %}
        {%- set system_message = messages[0]['content']|trim %}
    {%- else %}
        {#- FIXME: The processor requires an array, always. #}
        {%- set system_message = messages[0]['content'][0]['text']|trim %}
    {%- endif %}
    {%- set messages = messages[1:] %}
    {%- set user_supplied_system_message = true %}
{%- else %}
    {%- set system_message = "" %}
    {%- set user_supplied_system_message = false %}
{%- endif %}

{#- System message if the user supplied one #}
{%- if user_supplied_system_message %}
    {{- "<|header_start|>system<|header_end|>\n\n" }}
    {%- if tools is not none %}
        {{- "Environment: ipython\n" }}
    {%- endif %}
    {%- if tools is not none and not tools_in_user_message %}
        {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
        {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
        {{- "Do not use variables.\n\n" }}
        {%- for t in tools %}
            {{- t | tojson(indent=4) }}
            {{- "\n\n" }}
        {%- endfor %}
    {%- endif %}
    {{- system_message }}
    {{- "<|eot|>" }}
{%- endif %}

{#- Custom tools are passed in a user message with some extra guidance #}
{%- if tools_in_user_message and not tools is none %}
    {#- Extract the first user message so we can plug it in here #}
    {%- if messages | length != 0 %}
        {%- set first_user_message = messages[0]['content']|trim %}
        {%- set messages = messages[1:] %}
    {%- else %}
        {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
{%- endif %}
    {{- '<|header_start|>user<|header_end|>\n\n' -}}
    {{- "Given the following functions, please respond with a JSON for a function call " }}
    {{- "with its proper arguments that best answers the given prompt.\n\n" }}
    {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
    {{- "Do not use variables.\n\n" }}
    {%- for t in tools %}
        {{- t | tojson(indent=4) }}
        {{- "\n\n" }}
    {%- endfor %}
    {{- first_user_message + "<|eot|>"}}
{%- endif %}

{%- for message in messages %}
    {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
    {{- '<|header_start|>' + message['role'] + '<|header_end|>\n\n' }}
        {%- if message['content'] is string %}
            {{- message['content'] }}
        {%- else %}
            {%- for content in message['content'] %}
                {%- if content['type'] == 'image' %}
                    {{- '<|image|>' }}
                {%- elif content['type'] == 'text' %}
                    {{- content['text'] }}
                {%- endif %}
            {%- endfor %}
        {%- endif %}
        {{- "<|eot|>" }}
    {%- elif 'tool_calls' in message and message.tool_calls|length > 0 %}
       {{- '<|header_start|>assistant<|header_end|>\n\n' -}}
       {{- '<|python_start|>' }}
        {%- if message['content'] is string %}
            {{- message['content'] }}
        {%- else %}
            {%- for content in message['content'] %}
                {%- if content['type'] == 'image' %}
                    {{- '<|image|>' }}
                {%- elif content['type'] == 'text' %}
                    {{- content['text'] }}
                {%- endif %}
            {%- endfor %}
        {%- endif %}
       {{- '<|python_end|>' }}
        {%- for tool_call in message.tool_calls %}
           {{- '{"name": "' + tool_call.function.name + '", ' }}
           {{- '"parameters": ' }}
           {{- tool_call.function.arguments | tojson }}
           {{- "}" }}
        {%- endfor %}
       {{- "<|eot|>" }}
    {%- elif message.role == "tool" or message.role == "ipython" %}
        {{- "<|header_start|>ipython<|header_end|>\n\n" }}
        {%- if message.content is mapping or message.content is iterable %}
            {{- message.content | tojson }}
        {%- else %}
            {{- message.content }}
        {%- endif %}
        {{- "<|eot|>" }}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|header_start|>assistant<|header_end|>\n\n' }}
{%- endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/llava.jinja
================================================
{% for message in messages %}{% if message['role'] != 'system' %}{{ message['role'].upper() + ': '}}{% endif %}{# Render all images first #}{% for content in message['content'] | selectattr('type', 'equalto', 'image') %}{{ '<image>
' }}{% endfor %}{# Render all text next #}{% if message['role'] != 'assistant' %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{{ content['text'] + ' '}}{% endfor %}{% else %}{% for content in message['content'] | selectattr('type', 'equalto', 'text') %}{% generation %}{{ content['text'] + ' '}}{% endgeneration %}{% endfor %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ 'ASSISTANT:' }}{% endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/metharme.jinja
================================================
{{ bos_token }}{% if messages[0]['role'] == 'system' %}{% set loop_messages = messages[1:] %}{% set system_message = messages[0]['content'] %}{% else %}{% set loop_messages = messages %}{% set system_message = 'Enter RP mode. You shall reply to the user while staying in character. Your responses must be detailed, creative, immersive, and drive the scenario forward.' %}{% endif %}{{ '<|system|>' + system_message }}{% for message in loop_messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|user|>' + content.strip() }}{% elif message['role'] == 'assistant' %}{{ '<|model|>'  + content.strip() }}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|model|>' }}{% else %}{{ eos_token }}{% endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/mistral_v1.jinja
================================================
{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ ' [INST] ' + message['content'] + ' [/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/mistral_v2v3.jinja
================================================
{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST] ' + message['content'] + '[/INST]' }}{% elif message['role'] == 'assistant' %}{{ ' ' + message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/mistral_v3_tekken.jinja
================================================
{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if message['role'] == 'user' %}{{ '[INST]' + message['content'] + '[/INST]' }}{% elif message['role'] == 'assistant' %}{{ message['content'] + eos_token}}{% else %}{{ raise_exception('Only user and assistant roles are supported!') }}{% endif %}{% endfor %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/mistral_v7_tekken.jinja
================================================
{%- set today = strftime_now("%Y-%m-%d") %}
{%- set default_system_message = "You are Mistral Small 3, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\nYour knowledge base was last updated on 2023-10-01. The current date is " + today + ".\n\nWhen you're not sure about some information, you say that you don't have the information and don't make up anything.\nIf the user's question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \"What are some good restaurants around me?\" => \"Where are you?\" or \"When is the next flight to Tokyo\" => \"Where do you travel from?\")" %}

{{- bos_token }}

{%- if messages[0]['role'] == 'system' %}
    {%- if messages[0]['content'] is string %}
        {%- set system_message = messages[0]['content'] %}
    {%- else %}
        {%- set system_message = messages[0]['content'][0]['text'] %}
    {%- endif %}
    {%- set loop_messages = messages[1:] %}
{%- else %}
    {%- set system_message = default_system_message %}
    {%- set loop_messages = messages %}
{%- endif %}
{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}

{%- for message in loop_messages %}
    {%- if message['role'] == 'user' %}
        {%- if message['content'] is string %}
            {{- '[INST]' + message['content'] + '[/INST]' }}
        {%- else %}
            {{- '[INST]' }}
            {%- for block in message['content'] %}
                {%- if block['type'] == 'text' %}
                    {{- block['text'] }}
                {%- elif block['type'] in ['image', 'image_url'] %}
                    {{- '[IMG]' }}
                {%- else %}
                    {{- raise_exception('Only text and image blocks are supported in message content!') }}
                {%- endif %}
            {%- endfor %}
            {{- '[/INST]' }}
        {%- endif %}
    {%- elif message['role'] == 'system' %}
        {%- if message['content'] is string %}
            {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}
        {%- else %}
            {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }}
        {%- endif %}
    {%- elif message['role'] == 'assistant' %}
        {%- if message['content'] is string %}
            {{- message['content'] + eos_token }}
        {%- else %}
            {{- message['content'][0]['text'] + eos_token }}
        {%- endif %}
    {%- else %}
        {{- raise_exception('Only user, system and assistant roles are supported!') }}
    {%- endif %}
{%- endfor %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/phi_3.jinja
================================================
{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '
' + message['content'] + '<|end|>' + '
'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '
' + message['content'] + '<|end|>' + '
' + '<|assistant|>' + '
'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '
'}}{% endif %}{% endfor %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/phi_35.jinja
================================================
{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>
' + message['content'] + '<|end|>
'}}{% elif message['role'] == 'user' %}{{'<|user|>
' + message['content'] + '<|end|>
'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>
' + message['content'] + '<|end|>
'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>
' }}{% endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/phi_4.jinja
================================================
{% set system_message = 'You are Phi, a language model trained by Microsoft to help users. Your role as an assistant involves thoroughly exploring questions through a systematic thinking process before providing the final precise and accurate solutions. This requires engaging in a comprehensive cycle of analysis, summarizing, exploration, reassessment, reflection, backtracing, and iteration to develop well-considered thinking process. Please structure your response into two main sections: Thought and Solution using the specified format: <think> {Thought section} </think> {Solution section}. In the Thought section, detail your reasoning process in steps. Each step should include detailed considerations such as analysing questions, summarizing relevant findings, brainstorming new ideas, verifying the accuracy of the current steps, refining any errors, and revisiting previous steps. In the Solution section, based on various attempts, explorations, and reflections from the Thought section, systematically present the final solution that you deem correct. The Solution section should be logical, accurate, and concise and detail necessary steps needed to reach the conclusion. Now, try to solve the following question through the above guidelines:' -%}{%- if messages and messages[0]['role'] == 'system' -%}{%- set system_message = messages[0]['content'] -%}{%- set messages = messages[1:] -%}{%- endif -%}<|im_start|>system<|im_sep|>{{ system_message }}<|im_end|>{% for message in messages %}{% if (message['role'] == 'user') %}{{'<|im_start|>user<|im_sep|>' + message['content'] + '<|im_end|>'}}{% elif (message['role'] == 'assistant') %}{{'<|im_start|>assistant<|im_sep|>'}}{% generation %}{{message['content'] + '<|im_end|>'}}{% endgeneration %}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant<|im_sep|>' }}{% endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/pixtral.jinja
================================================
{%- if messages[0]["role"] == "system" %}
    {%- set system_message = messages[0]["content"] %}
    {%- set loop_messages = messages[1:] %}
{%- else %}
    {%- set loop_messages = messages %}
{%- endif %}

{{- bos_token }}
{%- for message in loop_messages %}
    {%- if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}
        {{- raise_exception('After the optional system message, conversation roles must alternate user/assistant/user/assistant/...') }}
    {%- endif %}
    {%- if message["role"] == "user" %}
        {%- if loop.last and system_message is defined %}
            {{- "[INST]" + system_message + "

" }}
        {%- else %}
            {{- "[INST]" }}
        {%- endif %}
        {%- if message["content"] is not string %}
            {%- for chunk in message["content"] %}
                {%- if chunk["type"] == "text" %}
                    {{- chunk["text"] }}
                {%- elif chunk["type"] == "image" %}
                    {{- "[IMG]" }}
                {%- else %}
                    {{- raise_exception("Unrecognized content type!") }}
                {%- endif %}
            {%- endfor %}
        {%- else %}
            {{- message["content"] }}
        {%- endif %}
        {{- "[/INST]" }}
    {%- elif message["role"] == "assistant" %}
 {%- if message["content"] is not string %}
 {%- for chunk in message["content"] %}
 {%- if chunk["type"] == "text" %}
 {{- chunk["text"] }}
 {%- elif chunk["type"] == "image" %}
 {{- "[IMG]" }}
 {%- else %}
 {{- raise_exception("Unrecognized content type!") }}
{%- endif %}
{%- endfor %}
{{- eos_token }}
{%- else %}
{{- message["content"] + eos_token }}
{%- endif %}
    {%- else %}
        {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }}
    {%- endif %}
{%- endfor %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/qwen2_vl.jinja
================================================
{% set image_count = namespace(value=0) %}{% set video_count = namespace(value=0) %}{% for message in messages %}{% if loop.first and message['role'] != 'system' %}<|im_start|>system
You are a helpful assistant.<|im_end|>
{% endif %}<|im_start|>{{ message['role'] }}
{% if message['content'] is string %}{{ message['content'] }}<|im_end|>
{% else %}{% for content in message['content'] %}{% if content['type'] == 'image' or 'image' in content or 'image_url' in content %}{% set image_count.value = image_count.value + 1 %}{% if add_vision_id %}Picture {{ image_count.value }}: {% endif %}<|vision_start|><|image_pad|><|vision_end|>{% elif content['type'] == 'video' or 'video' in content %}{% set video_count.value = video_count.value + 1 %}{% if add_vision_id %}Video {{ video_count.value }}: {% endif %}<|vision_start|><|video_pad|><|vision_end|>{% elif 'text' in content %}{{ content['text'] }}{% endif %}{% endfor %}<|im_end|>
{% endif %}{% endfor %}{% if add_generation_prompt %}<|im_start|>assistant
{% endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/qwen3.jinja
================================================
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{#- Determine the real last index: use provided value or default to messages length - 1 #}
{%- if real_last_index is defined and real_last_index is not none %}
    {%- set ns.real_last_index = real_last_index %}
{%- else %}
    {%- set ns.real_last_index = messages|length - 1 %}
{%- endif %}
{%- for message in messages[::-1] %}
    {%- set index = (messages|length - 1) - loop.index0 %}
    {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
        {%- set ns.multi_step_tool = false %}
        {%- set ns.last_query_index = index %}
    {%- endif %}
{%- endfor %}
{%- for message in messages %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {%- set content = message.content %}
        {%- set reasoning_content = '' %}
        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
            {%- set reasoning_content = message.reasoning_content %}
        {%- else %}
            {%- if '</think>' in message.content %}
                {%- set content = message.content.split('</think>')[-1].lstrip('\n') %}
                {%- set reasoning_content = message.content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
            {%- endif %}
        {%- endif %}
        {%- if loop.index0 > ns.last_query_index %}
            {%- if loop.index0 == ns.real_last_index or (loop.index0 != ns.real_last_index and reasoning_content) %}
                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
            {%- else %}
                {{- '<|im_start|>' + message.role + '\n' + content }}
            {%- endif %}
        {%- else %}
            {{- '<|im_start|>' + message.role + '\n' + content }}
        {%- endif %}
        {%- if message.tool_calls %}
            {%- for tool_call in message.tool_calls %}
                {%- if (loop.first and content) or (not loop.first) %}
                    {{- '\n' }}
                {%- endif %}
                {%- if tool_call.function %}
                    {%- set tool_call = tool_call.function %}
                {%- endif %}
                {{- '<tool_call>\n{"name": "' }}
                {{- tool_call.name }}
                {{- '", "arguments": ' }}
                {%- if tool_call.arguments is string %}
                    {{- tool_call.arguments }}
                {%- else %}
                    {{- tool_call.arguments | tojson }}
                {%- endif %}
                {{- '}\n</tool_call>' }}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- message.content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
    {%- if enable_thinking is defined and enable_thinking is false %}
        {{- '<think>\n\n</think>\n\n' }}
    {%- else %}
        {{- '<think>\n\n' }}
    {%- endif %}
{%- endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/qwen3_5.jinja
================================================
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0].role == 'system' %}
        {{- messages[0].content + '\n\n' }}
    {%- endif %}
    {{- "# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0].role == 'system' %}
        {{- '<|im_start|>system\n' + messages[0].content + '<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{%- set ns = namespace(multi_step_tool=true, last_query_index=messages|length - 1) %}
{#- Determine the real last index: use provided value or default to messages length - 1 #}
{%- if real_last_index is defined and real_last_index is not none %}
    {%- set ns.real_last_index = real_last_index %}
{%- else %}
    {%- set ns.real_last_index = messages|length - 1 %}
{%- endif %}
{%- for message in messages[::-1] %}
    {%- set index = (messages|length - 1) - loop.index0 %}
    {%- if message['content'] is string %}
        {%- if ns.multi_step_tool and message.role == "user" and not(message.content.startswith('<tool_response>') and message.content.endswith('</tool_response>')) %}
            {%- set ns.multi_step_tool = false %}
            {%- set ns.last_query_index = index %}
        {%- endif %}
    {%- else %}
        {%- if ns.multi_step_tool and message.role == "user" %}
            {%- set ns.multi_step_tool = false %}
            {%- set ns.last_query_index = index %}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- for message in messages %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) %}
        {{- '<|im_start|>' + message.role + '\n' }}
        {%- if message['content'] is string %}
            {{- message.content }}
        {%- else %}
            {%- for content in message['content'] %}
                {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content %}
                    {{- '<|vision_start|><|image_pad|><|vision_end|>' }}
                {%- elif content['type'] == 'video' or 'video' in content %}
                    {{- '<|vision_start|><|video_pad|><|vision_end|>' }}
                {%- elif 'text' in content %}
                    {{- content['text'] }}
                {%- endif %}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "assistant" %}
        {%- if message['content'] is string %}
            {%- set content = message.content %}
        {%- else %}
            {%- set content = '' %}
            {%- for item in message['content'] %}
                {%- if 'text' in item %}
                    {%- set content = content + item['text'] %}
                {%- endif %}
            {%- endfor %}
        {%- endif %}
        {%- set reasoning_content = '' %}
        {%- if message.reasoning_content is defined and message.reasoning_content is not none %}
            {%- set reasoning_content = message.reasoning_content %}
        {%- else %}
            {%- if '</think>' in content %}
                {%- set content = content.split('</think>')[-1].lstrip('\n') %}
                {%- set reasoning_content = content.split('</think>')[0].rstrip('\n').split('<think>')[-1].lstrip('\n') %}
            {%- endif %}
        {%- endif %}
        {%- if loop.index0 > ns.last_query_index %}
            {%- if loop.index0 == ns.real_last_index or (loop.index0 != ns.real_last_index and reasoning_content) %}
                {{- '<|im_start|>' + message.role + '\n<think>\n' + reasoning_content.strip('\n') + '\n</think>\n\n' + content.lstrip('\n') }}
            {%- else %}
                {{- '<|im_start|>' + message.role + '\n' + content }}
            {%- endif %}
        {%- else %}
            {{- '<|im_start|>' + message.role + '\n' + content }}
        {%- endif %}
        {%- if message.tool_calls %}
            {%- for tool_call in message.tool_calls %}
                {%- if (loop.first and content) or (not loop.first) %}
                    {{- '\n' }}
                {%- endif %}
                {%- if tool_call.function %}
                    {%- set tool_call = tool_call.function %}
                {%- endif %}
                {{- '<tool_call>\n{"name": "' }}
                {{- tool_call.name }}
                {{- '", "arguments": ' }}
                {%- if tool_call.arguments is string %}
                    {{- tool_call.arguments }}
                {%- else %}
                    {{- tool_call.arguments | tojson }}
                {%- endif %}
                {{- '}\n</tool_call>' }}
            {%- endfor %}
        {%- endif %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {%- if loop.first or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- message.content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
    {%- if enable_thinking is defined and enable_thinking is false %}
        {{- '<think>\n\n</think>\n\n' }}
    {%- else %}
        {{- '<think>\n\n' }}
    {%- endif %}
{%- endif %}


================================================
FILE: src/axolotl/utils/chat_templates/templates/qwen_25.jinja
================================================
{%- if tools %}
    {{- '<|im_start|>system\n' }}
    {%- if messages[0]['role'] == 'system' %}
        {{- messages[0]['content'] }}
    {%- else %}
        {{- 'You are Qwen, created by Alibaba Cloud. You are a helpful assistant.' }}
    {%- endif %}
    {{- "\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within <tools></tools> XML tags:\n<tools>" }}
    {%- for tool in tools %}
        {{- "\n" }}
        {{- tool | tojson }}
    {%- endfor %}
    {{- "\n</tools>\n\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\n<tool_call>\n{\"name\": <function-name>, \"arguments\": <args-json-object>}\n</tool_call><|im_end|>\n" }}
{%- else %}
    {%- if messages[0]['role'] == 'system' %}
        {{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}
    {%- else %}
        {{- '<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n' }}
    {%- endif %}
{%- endif %}
{%- for message in messages %}
    {%- if (message.role == "user") or (message.role == "system" and not loop.first) or (message.role == "assistant" and not message.tool_calls) %}
        {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}
    {%- elif message.role == "assistant" %}
        {{- '<|im_start|>' + message.role }}
        {%- if message.content %}
            {{- '\n' + message.content }}
        {%- endif %}
        {%- for tool_call in message.tool_calls %}
            {%- if tool_call.function is defined %}
                {%- set tool_call = tool_call.function %}
            {%- endif %}
            {{- '\n<tool_call>\n{"name": "' }}
            {{- tool_call.name }}
            {{- '", "arguments": ' }}
            {{- tool_call.arguments | tojson }}
            {{- '}\n</tool_call>' }}
        {%- endfor %}
        {{- '<|im_end|>\n' }}
    {%- elif message.role == "tool" %}
        {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != "tool") %}
            {{- '<|im_start|>user' }}
        {%- endif %}
        {{- '\n<tool_response>\n' }}
        {{- message.content }}
        {{- '\n</tool_response>' }}
        {%- if loop.last or (messages[loop.index0 + 1].role != "tool") %}
            {{- '<|im_end|>\n' }}
        {%- endif %}
    {%- endif %}
{%- endfor %}
{%- if add_generation_prompt %}
    {{- '<|im_start|>assistant\n' }}
{%- endif %}


================================================
FILE: src/axolotl/utils/collators/__init__.py
================================================
"""Shared axolotl collators for multipacking, mamba, multimodal."""

from .batching import (
    BatchSamplerDataCollatorForSeq2Seq,
    DataCollatorForSeq2Seq,
    PretrainingBatchSamplerDataCollatorForSeq2Seq,
    V2BatchSamplerDataCollatorForSeq2Seq,
)
from .mamba import MambaDataCollator

__all__ = [
    "DataCollatorForSeq2Seq",
    "BatchSamplerDataCollatorForSeq2Seq",
    "V2BatchSamplerDataCollatorForSeq2Seq",
    "PretrainingBatchSamplerDataCollatorForSeq2Seq",
    "MambaDataCollator",
]


================================================
FILE: src/axolotl/utils/collators/batching.py
================================================
"""Data collators for axolotl to pad labels and position_ids for packed sequences"""

from dataclasses import dataclass
from typing import Any, List

import numpy as np
from transformers import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy


@dataclass
class DataCollatorForSeq2Seq:
    """
    Data collator that will dynamically pad the inputs received, as well as the labels and position_ids

    Args:
        tokenizer ([`PreTrainedTokenizer`] or [`PreTrainedTokenizerFast`]):
            The tokenizer used for encoding the data.
        model ([`PreTrainedModel`]):
            The model that is being trained. If set and has the *prepare_decoder_input_ids_from_labels*, use it to
            prepare the *decoder_input_ids*

            This is useful when using *label_smoothing* to avoid calculating loss twice.
        padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
            Select a strategy to pad the returned sequences (according to the model's padding side and padding index)
            among:

            - `True` or `'longest'` (default): Pad to the longest sequence in the batch (or no padding if only a single
              sequence is provided).
            - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
              acceptable input length for the model if that argument is not provided.
            - `False` or `'do_not_pad'`: No padding (i.e., can output a batch with sequences of different lengths).
        max_length (`int`, *optional*):
            Maximum length of the returned list and optionally padding length (see above).
        pad_to_multiple_of (`int`, *optional*):
            If set will pad the sequence to a multiple of the provided value.

            This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability >=
            7.5 (Volta).
        label_pad_token_id (`int`, *optional*, defaults to -100):
            The id to use when padding the labels (-100 will be automatically ignored by PyTorch loss functions).
        return_tensors (`str`):
            The type of Tensor to return. Allowable values are "np", "pt" and "tf".
    """

    tokenizer: PreTrainedTokenizerBase
    model: Any | None = None
    padding: bool | str | PaddingStrategy = True
    max_length: int | None = None
    pad_to_multiple_of: int | None = None
    label_pad_token_id: int = -100
    position_pad_token_id: int = 0
    return_tensors: str = "pt"

    def __call__(self, features, return_tensors=None):
        has_attn_mask = "attention_mask" in features[0].keys()
        labels = None
        if return_tensors is None:
            return_tensors = self.return_tensors

        for feature_name, pad_token_id in [
            ("labels", self.label_pad_token_id),
            ("position_ids", self.position_pad_token_id),
        ]:
            feat = (
                [feature[feature_name] for feature in features]
                if feature_name in features[0].keys()
                else None
            )
            labels = feat if feat and feature_name == "labels" else labels
            # We have to pad the labels before calling `tokenizer.pad` as this method won't pad them and needs them of the
            # same length to return tensors.
            if feat is not None:
                max_feature_length = max(len(l) for l in feat)  # noqa: E741
                if self.pad_to_multiple_of is not None:
                    max_feature_length = (
                        (max_feature_length + self.pad_to_multiple_of - 1)
                        // self.pad_to_multiple_of
                        * self.pad_to_multiple_of
                    )

                padding_side = self.tokenizer.padding_side
                for feature in features:
                    remainder_len = max_feature_length - len(feature[feature_name])
                    if feature_name == "position_ids":
                        remainder = list(range(remainder_len))
                    else:
                        remainder = [pad_token_id] * remainder_len
                    if isinstance(feature[feature_name], list):
                        feature[feature_name] = (
                            feature[feature_name] + remainder
                            if padding_side == "right"
                            else remainder + feature[feature_name]
                        )
                    elif padding_side == "right":
                        feature[feature_name] = np.concatenate(
                            [feature[feature_name], remainder]
                        ).astype(np.int64)
                    else:
                        feature[feature_name] = np.concatenate(
                            [remainder, feature[feature_name]]
                        ).astype(np.int64)

        features = self.tokenizer.pad(
            features,
            padding=self.padding,
            max_length=self.max_length,
            pad_to_multiple_of=self.pad_to_multiple_of,
            return_tensors=return_tensors,
        )
        if not has_attn_mask and "attention_mask" in features:
            del features["attention_mask"]

        # prepare decoder_input_ids
        if (
            labels is not None
            and self.model is not None
            and hasattr(self.model, "prepare_decoder_input_ids_from_labels")
        ):
            decoder_input_ids = self.model.prepare_decoder_input_ids_from_labels(
                labels=features["labels"]
            )
            features["decoder_input_ids"] = decoder_input_ids

        return features


@dataclass
class BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
    """
    Collator for multipack specific to the using the BatchSampler
    """

    def __call__(self, features, return_tensors=None):
        if not isinstance(features[0], list):
            features = [features]
        out_features = [{} for _ in features]
        for i, features_ in enumerate(features):
            for feature in features_[0].keys():
                if feature == "length":
                    continue
                if feature == "attention_mask":
                    arrays = [
                        (1) * np.array(item[feature])
                        for i, item in enumerate(features_)
                        if feature in item
                    ]
                    out_features[i][feature] = np.concatenate(arrays)
                else:
                    arrays = [
                        np.array(item[feature]) for item in features_ if feature in item
                    ]
                    out_features[i][feature] = np.concatenate(arrays)

        return super().__call__(out_features, return_tensors=return_tensors)


@dataclass
class V2BatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
    """
    Collator for multipack specific to the using the BatchSampler
    """

    squash_position_ids: bool = False

    def __call__(self, features, return_tensors=None):
        if not isinstance(features[0], list):
            features: List[List[dict]] = [features]
        out_features = [{} for _ in features]
        for i, features_ in enumerate(features):
            for feature in features_[0].keys():
                if feature == "length":
                    continue
                if feature == "attention_mask":
                    arrays = [
                        (i + 1) * np.array(item[feature])
                        for i, item in enumerate(features_)
                        if feature in item
                    ]
                    out_features[i][feature] = np.concatenate(arrays)
                elif feature == "position_ids" and self.squash_position_ids:
                    arrays = [
                        np.array(item[feature]) for item in features_ if feature in item
                    ]
                    # concatenate, get total length and create arange of new total position ids
                    position_ids = np.concatenate(arrays)
                    total_length = position_ids.shape[0]
                    position_ids = np.arange(total_length)
                    out_features[i][feature] = position_ids
                else:
                    arrays = [
                        np.array(item[feature]) for item in features_ if feature in item
                    ]
                    out_features[i][feature] = np.concatenate(arrays)

        return super().__call__(out_features, return_tensors=return_tensors)


@dataclass
class PretrainingBatchSamplerDataCollatorForSeq2Seq(DataCollatorForSeq2Seq):
    """
    Collator for multipack specific to the using the BatchSampler
    """

    def __init__(self, *args, multipack_attn=True, **kwargs):
        super().__init__(*args, **kwargs)
        self.multipack_attn = multipack_attn

    def __call__(self, features, return_tensors=None):
        chunked_data = {}
        for feature in features.keys():
            if feature == "length":
                continue
            if feature == "attention_mask":
                if self.multipack_attn:
                    arrays = [
                        (i + 1) * np.array(item)
                        for i, item in enumerate(features[feature])
                    ]
                else:
                    arrays = [(1) * np.array(item) for item in features[feature]]
                chunked_data[feature] = np.concatenate(arrays)
            else:
                arrays = [np.array(item) for item in features[feature]]
                chunked_data[feature] = np.concatenate(arrays)
        features = [chunked_data]
        return super().__call__(features, return_tensors=return_tensors)


================================================
FILE: src/axolotl/utils/collators/core.py
================================================
"""
basic shared collator constants
"""

IGNORE_INDEX = -100


================================================
FILE: src/axolotl/utils/collators/mamba.py
================================================
"""
collators for Mamba
"""

from dataclasses import dataclass
from typing import Dict, Sequence

import torch
import transformers

from axolotl.utils.collators.core import IGNORE_INDEX


@dataclass
class MambaDataCollator:
    """
    Collator for State Space Models (Mamba)
    """

    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        input_ids, labels = tuple(
            [torch.LongTensor(instance[key]) for instance in instances]
            for key in ("input_ids", "labels")
        )
        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids,
            batch_first=True,
            padding_value=self.tokenizer.pad_token_id,
        )
        labels = torch.nn.utils.rnn.pad_sequence(
            labels, batch_first=True, padding_value=IGNORE_INDEX
        )

        return {
            "input_ids": input_ids,
            "labels": labels,
        }


================================================
FILE: src/axolotl/utils/collators/mm_chat.py
================================================
"""
Collators for multi-modal chat messages and packing
"""

from dataclasses import dataclass
from typing import Any, Optional, Union

from torch import Tensor
from transformers import PreTrainedTokenizerBase
from transformers.data.data_collator import DataCollatorMixin
from transformers.utils import PaddingStrategy

from axolotl.processing_strategies import ProcessingStrategy


@dataclass
class MultiModalChatDataCollator(DataCollatorMixin):
    """
    Collator for multi-modal chat messages
    """

    tokenizer: PreTrainedTokenizerBase
    processing_strategy: ProcessingStrategy
    packing: bool = False
    return_tensors: str = "pt"
    padding: Union[bool, str, PaddingStrategy] = True
    pad_to_multiple_of: Optional[int] = None

    def __post_init__(self):
        if self.packing:
            raise ValueError("Packing is currently not supported.")

    def torch_call(self, examples: list[dict]) -> dict[str, Any]:
        return self.process_rows(examples)

    def process_rows(
        self,
        examples: list[dict],
    ) -> dict[str, Tensor]:
        # Preprocess the examples
        examples = self.processing_strategy(examples)

        # Initialize batch
        messages = [ex["messages"] for ex in examples]

        batch = self.processing_strategy.processor.apply_chat_template(
            messages,
            add_generation_prompt=False,
            tokenize=True,
            return_tensors="pt",
            padding=True,
            return_dict=True,
            chat_template=self.processing_strategy.chat_template,
        )

        # Process the labels
        batch["labels"] = self.processing_strategy.process_labels(batch["input_ids"])

        return batch


================================================
FILE: src/axolotl/utils/comet_.py
================================================
"""Module for wandb utilities"""

import os

from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)

COMET_ENV_MAPPING_OVERRIDE = {
    "comet_mode": "COMET_START_MODE",
    "comet_online": "COMET_START_ONLINE",
}
COMET_EXPERIMENT_CONFIG_ENV_MAPPING_OVERRIDE = {
    "auto_histogram_activation_logging": "COMET_AUTO_LOG_HISTOGRAM_ACTIVATIONS",
    "auto_histogram_epoch_rate": "COMET_AUTO_LOG_HISTOGRAM_EPOCH_RATE",
    "auto_histogram_gradient_logging": "COMET_AUTO_LOG_HISTOGRAM_GRADIENTS",
    "auto_histogram_tensorboard_logging": "COMET_AUTO_LOG_HISTOGRAM_TENSORBOARD",
    "auto_histogram_weight_logging": "COMET_AUTO_LOG_HISTOGRAM_WEIGHTS",
    "auto_log_co2": "COMET_AUTO_LOG_CO2",
    "auto_metric_logging": "COMET_AUTO_LOG_METRICS",
    "auto_metric_step_rate": "COMET_AUTO_LOG_METRIC_STEP_RATE",
    "auto_output_logging": "COMET_AUTO_LOG_OUTPUT_LOGGER",
    "auto_param_logging": "COMET_AUTO_LOG_PARAMETERS",
    "comet_disabled": "COMET_AUTO_LOG_DISABLE",
    "display_summary_level": "COMET_DISPLAY_SUMMARY_LEVEL",
    "distributed_node_identifier": "COMET_DISTRIBUTED_NODE_IDENTIFIER",
    "log_code": "COMET_AUTO_LOG_CODE",
    "log_env_cpu": "COMET_AUTO_LOG_ENV_CPU",
    "log_env_details": "COMET_AUTO_LOG_ENV_DETAILS",
    "log_env_disk": "COMET_AUTO_LOG_ENV_DISK",
    "log_env_gpu": "COMET_AUTO_LOG_ENV_GPU",
    "log_env_host": "COMET_AUTO_LOG_ENV_HOST",
    "log_env_network": "COMET_AUTO_LOG_ENV_NETWORK",
    "log_git_metadata": "COMET_AUTO_LOG_GIT_METADATA",
    "log_git_patch": "COMET_AUTO_LOG_GIT_PATCH",
    "log_graph": "COMET_AUTO_LOG_GRAPH",
    "name": "COMET_START_EXPERIMENT_NAME",
    "offline_directory": "COMET_OFFLINE_DIRECTORY",
    "parse_args": "COMET_AUTO_LOG_CLI_ARGUMENTS",
    "tags": "COMET_START_EXPERIMENT_TAGS",
}


def python_value_to_environ_value(python_value):
    if isinstance(python_value, bool):
        if python_value is True:
            return "true"

        return "false"

    if isinstance(python_value, int):
        return str(python_value)

    if isinstance(python_value, list):  # Comet only have one list of string parameter
        return ",".join(map(str, python_value))

    return python_value


def setup_comet_env_vars(cfg: DictDefault):
    # TODO, we need to convert Axolotl configuration to environment variables
    # as Transformers integration are call first and would create an
    # Experiment first

    for key in cfg.keys():
        if key.startswith("comet_") and key != "comet_experiment_config":
            value = cfg.get(key, "")

            if value is not None and value != "":
                env_variable_name = COMET_ENV_MAPPING_OVERRIDE.get(key, key.upper())
                final_value = python_value_to_environ_value(value)
                os.environ[env_variable_name] = final_value

    if cfg.comet_experiment_config:
        for key, value in cfg.comet_experiment_config.items():
            if value is not None and value != "":
                config_env_variable_name = (
                    COMET_EXPERIMENT_CONFIG_ENV_MAPPING_OVERRIDE.get(key)
                )

                if config_env_variable_name is None:
                    LOG.warning(
                        f"Unknown Comet Experiment Config name {key}, ignoring it"
                    )
                    continue

                final_value = python_value_to_environ_value(value)
                os.environ[config_env_variable_name] = final_value

    # Enable comet if project name is present
    if cfg.comet_project_name and len(cfg.comet_project_name) > 0:
        cfg.use_comet = True


================================================
FILE: src/axolotl/utils/config/__init__.py
================================================
"""Module for working with config dicts"""

import json
import os
from typing import Optional

import torch
from transformers.utils import is_torch_bf16_gpu_available
from transformers.utils.import_utils import (
    is_torch_greater_or_equal,
    is_torch_npu_available,
)

from axolotl.integrations.base import PluginManager
from axolotl.integrations.config import merge_input_args
from axolotl.loaders import MULTIMODAL_AUTO_MODEL_MAPPING
from axolotl.loaders.utils import load_model_config
from axolotl.utils.bench import log_gpu_memory_usage
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.config import (
    AxolotlConfigWCapabilities as AxolotlConfigWCapabilitiesBase,
    AxolotlInputConfig as AxolotlInputConfigBase,
)
from axolotl.utils.schemas.datasets import DPODataset, KTODataset, SFTDataset

LOG = get_logger(__name__)


def choose_device(cfg):
    def get_device():
        try:
            if torch.cuda.is_available():
                return f"cuda:{cfg.local_rank}"

            if torch.backends.mps.is_available():
                return "mps"

            if is_torch_npu_available():
                return f"npu:{cfg.local_rank}"

            raise SystemError("No CUDA/mps/npu device found")
        except Exception:
            return "cpu"

    cfg.device = get_device()
    if cfg.world_size == 1:
        cfg.device_map = cfg.device_map or "auto"
    else:
        if cfg.device.startswith("cuda"):
            cfg.device_map = {"": torch.cuda.current_device()}
        elif cfg.device.startswith("npu"):
            cfg.device_map = {"npu": torch.npu.current_device()}
        else:
            cfg.device_map = {"": cfg.device}

    # in `accelerate launch`, we need to not pass through any device map and let
    # accelerate figure out which parts of the model to put on which gpu
    accelerate_vars = [var for var in os.environ if var.startswith("ACCELERATE_USE_")]
    if accelerate_vars:
        cfg.device_map = None


def resolve_dtype(cfg):
    if (
        not cfg.fp16 and cfg.bf16 == "auto" and not cfg.use_ray
    ):  # if we use ray we want to defer this check to the worker node
        if is_torch_bf16_gpu_available():
            LOG.debug("bf16 support detected, enabling for this configuration.")
            cfg.bf16 = True
        else:
            LOG.debug("bf16 support not detected, disabling for this configuration.")
            cfg.bf16 = False
            if cfg.fp16 is None and not cfg.float16:
                cfg.fp16 = True

    if cfg.fp16 and cfg.bf16 == "auto":
        cfg.bf16 = False

    if cfg.device == "mps":
        cfg.load_in_8bit = False
        cfg.tf32 = False
        if cfg.bf16 and cfg.fp16 is not False:
            cfg.fp16 = True
        cfg.bf16 = False
    else:
        if cfg.tf32 is True:
            torch.set_float32_matmul_precision("high")
            if is_torch_greater_or_equal("2.9.0"):
                torch.backends.fp32_precision = "tf32"
                torch.backends.cuda.matmul.fp32_precision = "tf32"
                torch.backends.cudnn.fp32_precision = "tf32"
            else:
                torch.backends.cuda.matmul.allow_tf32 = True
                torch.backends.cudnn.allow_tf32 = True
        if cfg.bf16:
            cfg.fp16 = False

    if cfg.bf16 or cfg.bfloat16:
        cfg.torch_dtype = torch.bfloat16
    elif cfg.load_in_8bit or cfg.fp16 or cfg.float16:
        cfg.torch_dtype = torch.float16
    else:
        cfg.torch_dtype = torch.float32


def normalize_config(cfg):
    # setup some derived config / hyperparams
    cfg.gradient_accumulation_steps = cfg.gradient_accumulation_steps or (
        cfg.batch_size // cfg.micro_batch_size
    )
    cfg.batch_size = (
        cfg.batch_size or cfg.micro_batch_size * cfg.gradient_accumulation_steps
    )
    if cfg.eval_batch_size is None:
        cfg.eval_batch_size = cfg.micro_batch_size
    cfg.world_size = int(os.environ.get("WORLD_SIZE", 1))
    cfg.local_rank = int(os.environ.get("LOCAL_RANK", 0))
    cfg.eval_table_size = cfg.eval_table_size or 0
    cfg.eval_max_new_tokens = cfg.eval_max_new_tokens or 128
    cfg.eval_causal_lm_metrics = cfg.eval_causal_lm_metrics or [
        "sacrebleu",
        "comet",
        "ter",
        "chrf",
    ]
    choose_device(cfg)
    cfg.ddp = cfg.ddp if cfg.ddp is not None else cfg.world_size != 1
    if cfg.world_size != 1:
        cfg.device_map = {"": int(os.environ.get("LOCAL_RANK", 0))}
        if cfg.fsdp or cfg.fsdp_config or cfg.ddp:
            effective_world_size = (
                cfg.world_size
                // (cfg.context_parallel_size or 1)
                // (cfg.tensor_parallel_size or 1)
            )
            cfg.batch_size = cfg.batch_size * effective_world_size

    if not cfg.use_ray:
        # delay resolving dtype until on worker node when launching with ray
        resolve_dtype(cfg)

    if cfg.deepspeed:
        if isinstance(cfg.deepspeed, str) and os.path.exists(cfg.deepspeed):
            ds_config_path = cfg.deepspeed
            with open(ds_config_path, encoding="utf-8") as f:
                cfg.deepspeed = json.load(f)

    if cfg.saves_per_epoch:
        save_steps = 1.0 / (cfg.saves_per_epoch * cfg.num_epochs)
        if save_steps < 1.0:  # prevent saves on every step
            cfg.save_steps = save_steps
        elif save_steps > 1:
            LOG.warning(
                f"Invalid value for save_steps ({save_steps}) from saves_per_epoch and/or num_epochs. Saving at training end only."
            )
    if (cfg.val_set_size or cfg.test_datasets) and cfg.evals_per_epoch:
        eval_steps = 1.0 / (cfg.evals_per_epoch * cfg.num_epochs)
        if eval_steps < 1.0:  # prevent evals on every step
            cfg.eval_steps = eval_steps
        elif eval_steps > 1:
            LOG.warning(
                f"Invalid value for eval_steps ({eval_steps}) from evals_per_epoch and/or num_epochs. Skipping evaluations."
            )

    if not cfg.base_model_config:
        cfg.base_model_config = cfg.base_model

    # Apply pre-config load patches (e.g., for Kimi Linear remote code patching)
    from axolotl.loaders.patch_manager import PatchManager

    PatchManager.apply_pre_config_load_patches(cfg)

    model_config = load_model_config(cfg)

    cfg.tokenizer_config = (
        cfg.tokenizer_config or cfg.base_model_config or cfg.base_model
    )

    cfg.is_multimodal = (
        hasattr(model_config, "model_type")
        and model_config.model_type in MULTIMODAL_AUTO_MODEL_MAPPING
        or any(
            multimodal_name in cfg.base_model.lower()
            for multimodal_name in [
                "pixtral",
            ]
        )
        or cfg.is_multimodal
    )
    if cfg.is_multimodal:
        cfg.processor_config = (
            cfg.processor_config or cfg.base_model_config or cfg.base_model
        )

    cfg.model_config_type = model_config.model_type

    # Resolve inner text backbone type for VLM wrappers (e.g. mistral3 -> mistral4)
    if callable(getattr(model_config, "get_text_config", None)):
        text_config = model_config.get_text_config()
        if (
            hasattr(text_config, "model_type")
            and text_config.model_type != model_config.model_type
        ):
            cfg.model_config_type_text = text_config.model_type

    # figure out if the model is llama
    cfg.is_llama_derived_model = (
        (
            hasattr(model_config, "model_type")
            and model_config.model_type in ["llama", "mllama_text_model"]
        )
        or cfg.is_llama_derived_model
        or "llama" in cfg.base_model.lower()
        or (cfg.type_of_model and "llama" in cfg.type_of_model.lower())
    )

    # figure out if the model is falcon
    cfg.is_falcon_derived_model = (
        (
            hasattr(model_config, "model_type")
            and model_config.model_type
            in [
                "falcon",
                "RefinedWebModel",
                "RefinedWeb",
            ]
        )
        or cfg.is_falcon_derived_model
        or "falcon" in cfg.base_model.lower()
        or (cfg.type_of_model and "rwforcausallm" in cfg.type_of_model.lower())
    )

    cfg.is_mistral_derived_model = (
        (
            hasattr(model_config, "model_type")
            and model_config.model_type
            in [
                "mistral",
            ]
        )
        or cfg.is_mistral_derived_model
        or "mistral" in cfg.base_model.lower().split("/")[-1]
        or (cfg.type_of_model and "mistral" in cfg.type_of_model.lower())
    )

    cfg.is_qwen_derived_model = (
        hasattr(model_config, "model_type")
        and model_config.model_type
        in [
            "qwen",
        ]
    ) or cfg.is_qwen_derived_model

    if isinstance(cfg.pretraining_dataset, dict):
        cfg.pretraining_dataset = [cfg.pretraining_dataset]

    if (
        cfg.gradient_checkpointing
        and cfg.unfrozen_parameters is None
        and cfg.gradient_checkpointing_kwargs is None
        and cfg.rl is None
    ):
        cfg.gradient_checkpointing_kwargs = {"use_reentrant": True}

    log_gpu_memory_usage(LOG, "baseline", cfg.device)


def normalize_cfg_datasets(cfg):
    """
    helpers for mapping chat_template to various dataset configurations as necessary
    """

    if cfg.chat_template:
        if cfg.datasets:
            for idx, ds_cfg in enumerate(cfg.datasets):
                if (
                    ds_cfg.type in ["orpo.chat_template", "chat_template"]
                    and not ds_cfg.chat_template
                ):
                    LOG.info(
                        f"updating dataset {ds_cfg.path} with `chat_template: {cfg.chat_template}` to match your chat_template"
                    )
                    cfg.datasets[idx].chat_template = cfg.chat_template
                    cfg.datasets[idx].chat_template_jinja = cfg.chat_template_jinja


def validate_config(
    cfg: DictDefault,
    capabilities: Optional[dict] = None,
    env_capabilities: Optional[dict] = None,
) -> DictDefault:
    AxolotlConfigWCapabilities = AxolotlConfigWCapabilitiesBase
    AxolotlInputConfig = AxolotlInputConfigBase

    if cfg.plugins:
        (
            AxolotlConfigWCapabilities,
            AxolotlInputConfig,
        ) = merge_input_args()

    # Convert datasets to proper format if needed
    if cfg.get("datasets"):
        for idx, ds_cfg in enumerate(cfg["datasets"]):
            if cfg.get("rl") in ["dpo", "ipo", "simpo"] and not isinstance(
                ds_cfg, DPODataset
            ):
                cfg["datasets"][idx] = DPODataset(**ds_cfg)
            elif cfg.get("rl") == "kto" and not isinstance(ds_cfg, KTODataset):
                cfg["datasets"][idx] = KTODataset(**dict(ds_cfg))
            elif not isinstance(ds_cfg, SFTDataset):
                cfg["datasets"][idx] = SFTDataset(**dict(ds_cfg))

    if capabilities or env_capabilities:
        if (capabilities and env_capabilities is None) or (
            env_capabilities and capabilities is None
        ):
            raise ValueError(
                "Both capabilities and env_capabilities must be provided or not provided."
            )

        return DictDefault(
            dict(
                AxolotlConfigWCapabilities(
                    **cfg.to_dict(),
                    capabilities=capabilities,
                    env_capabilities=env_capabilities,
                ).model_dump(exclude_none=True)
            )
        )

    return DictDefault(
        dict(AxolotlInputConfig(**cfg.to_dict()).model_dump(exclude_none=True))
    )


def prepare_plugins(cfg):
    """
    Prepare the plugins for the configuration
    """

    if cfg.get("plugins"):
        plugin_manager = PluginManager.get_instance()
        for plugin_name in cfg["plugins"]:
            plugin_manager.register(plugin_name)


================================================
FILE: src/axolotl/utils/config/models/__init__.py
================================================


================================================
FILE: src/axolotl/utils/ctx_managers/__init__.py
================================================
"""Init for context manager submodule"""

# flake8: noqa

from .sequence_parallel import SequenceParallelContextManager


================================================
FILE: src/axolotl/utils/ctx_managers/sequence_parallel.py
================================================
"""Module for Axolotl trainer sequence parallelism manager and utilities"""

import functools
import inspect

import torch
import torch.distributed as dist
from torch import nn
from torch.distributed import DeviceMesh
from torch.utils.hooks import RemovableHandle
from transformers.modeling_outputs import CausalLMOutputWithPast
from transformers.utils import ModelOutput

from axolotl.monkeypatch.ring_attn import (
    get_ring_attn_group,
    register_ring_attn_from_device_mesh,
    update_ring_attn_params,
)
from axolotl.utils.schemas.enums import RingAttnFunc


# TODO(djsaunde): implement zigzag, stripe patterns here (and elsewhere) in this
# module. Currently, we just focus on batch ring and varlen llama3 for simplicity.
def apply_sequence_parallelism(
    batch: dict[str, torch.Tensor],
    local_rank: int,
    local_world_size: int,
    gradient_accumulation_steps: int,
    ring_attn_func: RingAttnFunc,
) -> tuple[dict[str, torch.Tensor], int, int]:
    """
    Apply sequence parallelism slicing to a batch.

    Special handling is implemented for integer logits_to_keep, which indicates
    to only keep the last N tokens in the sequence during generation.

    Args:
        batch: Batch dictionary (e.g., input_ids, attention_mask, etc.).
        local_rank: Local rank in the sequence parallel group.
        local_world_size: World size of the sequence parallel group.
        gradient_accumulation_steps: Number of steps to accumulate gradients over.
        ring_attn_func: Which ring attention function to use. Currently unused, but
            related to above TODO.

    Returns:
        tuple of:
            - Batch dictionary with sliced tensors.
            - The original sequence length before padding.
            - The number of padding tokens added.
    """
    batch_size, original_seq_len = batch["input_ids"].shape

    # Update ring attention params if needed
    if batch.get("position_ids") is not None and batch_size == 1:
        update_ring_attn_params(position_ids=batch["position_ids"])
    else:
        # If position_ids aren't already in the batch, create them
        batch["position_ids"] = torch.arange(
            0,
            original_seq_len,
            dtype=torch.long,
            device=batch["input_ids"].device,
        ).expand(batch["input_ids"].size(0), -1)

    if "logits_to_keep" in batch and isinstance(batch["logits_to_keep"], int):
        logits_to_keep = batch["logits_to_keep"]

        # Calculate which positions in the full sequence contain the last N tokens
        start_position = max(0, original_seq_len - logits_to_keep)
        chunk_size = original_seq_len // local_world_size
        rank_start = local_rank * chunk_size
        rank_end = rank_start + chunk_size

        # Create a boolean mask tensor for this rank's chunk
        mask = torch.zeros(
            chunk_size,
            dtype=torch.bool,
            device=batch["input_ids"].device,
        )

        if rank_end > start_position:
            # Calculate how many of the last N tokens fall within this rank's range
            tokens_in_rank = min(rank_end, original_seq_len) - max(
                rank_start, start_position
            )

            # Calculate where these tokens start in the local chunk
            local_start_idx = max(0, start_position - rank_start)

            # Set the appropriate positions in the mask to True
            mask[local_start_idx : local_start_idx + tokens_in_rank] = True

        # Replace the integer with the boolean mask
        batch["logits_to_keep"] = mask

    # Add padding to make sequence length divisible by local_world_size
    total_seq_len = original_seq_len
    pad_len = 0
    divisor = min(local_world_size, 64)
    if total_seq_len % divisor != 0:
        pad_len = divisor - (total_seq_len % divisor)

        # Apply padding to all relevant tensors
        for key in batch:
            if (
                isinstance(batch[key], torch.Tensor)
                and batch[key].dim() > 1
                and batch[key].size(1) == total_seq_len
            ):
                # Create padding tensor
                pad_value = -100 if key == "labels" else 0
                padding = torch.full(
                    (batch[key].size(0), pad_len, *batch[key].shape[2:]),
                    pad_value,
                    dtype=batch[key].dtype,
                    device=batch[key].device,
                )

                # Concatenate padding to the right side of the tensor
                batch[key] = torch.cat([batch[key], padding], dim=1)
            if key == "logits_to_keep":
                # Create padding tensor
                padding = torch.ones(
                    1,
                    dtype=batch[key].dtype,
                    device=batch[key].device,
                )

                # Concatenate padding to the right side of the tensor
                batch[key] = torch.cat([batch[key], padding], dim=0)

        # Update the total sequence length after padding
        total_seq_len = batch["input_ids"].size(1)

    # Slice batch for sequence parallel
    for key in batch:
        if not isinstance(batch[key], torch.Tensor) or batch[key].dim() <= 1:
            continue

        # Split in sequential fashion and grab this rank's chunk
        if batch[key].size(1) == total_seq_len:
            batch[key] = (
                batch[key].chunk(local_world_size, dim=1)[local_rank].contiguous()
            )
        elif key == "logits_to_keep":
            batch[key] = (
                batch[key].chunk(local_world_size, dim=0)[local_rank].contiguous()
            )

        # Handle num_items_in_batch
        if "num_items_in_batch" in batch:
            # Approximation; this needed since num_items_in_batch may be counted across
            # all samples in a gradient accumulated batch, not on a per-step basis.
            local_valid_tokens = (batch["labels"] != -100).sum()

            # All-reduce across sequence parallel ranks to get global token count
            cp_group = get_ring_attn_group()
            global_valid_tokens = local_valid_tokens.clone()
            # we use AVG instead of SUM as using sum seems to scale down the loss by over-accounting the number of tokens
            dist.all_reduce(global_valid_tokens, op=dist.ReduceOp.AVG, group=cp_group)
            global_valid_tokens = int(global_valid_tokens.item())

            batch["num_items_in_batch"] = (
                global_valid_tokens * gradient_accumulation_steps
            )

    return batch, original_seq_len, pad_len


class SequenceParallelContextManager:
    """Context manager for sequence parallelism operations.

    This class provides a context that will automatically apply sequence parallelism
    during model forward passes using a pre-forward hook, and gather outputs from
    across the sequence parallelism group using a post-forward hook.

    Args:
        models: List of models to apply sequence parallelism to pre- and post- forward
            hooks.
        context_parallel_size: Number of processes to split sequences over.
        gradient_accumulation_steps: Number of steps to accumulate gradients over.
        ring_attn_func: Which ring attention function to use. Currently unused.
        heads_k_stride: Sequence parallelism K head stride size. Passed through to
            `varlen_llama3` `ring_flash_attn` implementation.
        gather_outputs: Whether to gather outputs after model forward pass across the
            sequence parallel group.
    """

    def __init__(
        self,
        models: list[nn.Module],
        context_parallel_size: int,
        gradient_accumulation_steps: int,
        ring_attn_func: RingAttnFunc,
        heads_k_stride: int | None,
        gather_outputs: bool,
        device_mesh: DeviceMesh | None = None,
    ):
        self.models = models
        self.context_parallel_size = context_parallel_size
        self.gradient_accumulation_steps = gradient_accumulation_steps
        self.ring_attn_func = ring_attn_func
        self.heads_k_stride = heads_k_stride
        self.gather_outputs = gather_outputs
        self.device_mesh = device_mesh

        self._register_ring_attn()

        # Set distributed info for local rank
        self.process_group = get_ring_attn_group()
        self.local_rank = dist.get_rank(self.process_group)
        self.local_world_size = dist.get_world_size(self.process_group)

        # Will store hook handles for removal
        self.hook_handles: list[RemovableHandle] = []

        # Store original sequence length and padding information
        self.original_seq_len = 0
        self.pad_len = 0

        # Track local valid token count for eval loss correction across CP ranks
        self._local_valid_tokens: torch.Tensor | None = None

        # Create a partially applied version of the apply_sequence_parallelism function
        self.apply_sequence_parallelism = functools.partial(
            apply_sequence_parallelism,
            local_rank=self.local_rank,
            local_world_size=self.local_world_size,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            ring_attn_func=self.ring_attn_func,
        )

    def __enter__(self):
        self._register_model_hooks()

        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Remove all hooks
        for handle in self.hook_handles:
            handle.remove()
        self.hook_handles = []

        # TODO(djsaunde): Un-patch attention and accelerate functions (low priority)

    def _register_ring_attn(self):
        # Initialize ring attn for sequence parallelism
        register_ring_attn_from_device_mesh(
            device_mesh=self.device_mesh,
            context_parallel_dim=("cp",),
            heads_k_stride=self.heads_k_stride,
            ring_attn_func=self.ring_attn_func,
        )

    def _register_model_hooks(self):
        # Forward pre-hook to apply sequence parallelism
        def sequence_parallel_pre_hook(_, args, kwargs):
            # Get parameter names from the model's forward function
            forward_params = list(
                inspect.signature(self.models[0].forward).parameters.keys()
            )

            updated_kwargs = kwargs.copy()
            for i, arg in enumerate(args):
                if i < len(forward_params):
                    updated_kwargs[forward_params[i]] = arg

            # Any excess positional arguments are kept as-is
            remaining_args = args[len(forward_params) :]

            # Apply sequence parallelism to updated kwargs
            updated_kwargs, self.original_seq_len, self.pad_len = (
                self.apply_sequence_parallelism(updated_kwargs)
            )

            # Track local valid tokens for eval loss correction
            if "labels" in updated_kwargs and not self.models[0].training:
                self._local_valid_tokens = (
                    (updated_kwargs["labels"] != -100).sum().float()
                )
                # Strip num_items_in_batch during eval so the model uses
                # reduction='mean', allowing the post-hook weighted all-reduce
                # formula (loss * local_valid) to correctly recover the loss sum
                updated_kwargs.pop("num_items_in_batch", None)
            else:
                self._local_valid_tokens = None

            return remaining_args, updated_kwargs

        # Forward post-hook to gather outputs
        def sequence_parallel_post_hook(_, __, output: ModelOutput) -> ModelOutput:
            # Gather the sharded outputs
            output = self._gather_outputs(output)

            # Remove padding if it was added
            if self.pad_len > 0:
                for key, value in output.items():
                    if isinstance(value, torch.Tensor) and value.dim() > 1:
                        if value.size(1) == self.original_seq_len + self.pad_len:
                            # Slice to remove padding
                            output[key] = value[:, : self.original_seq_len].contiguous()

            return output

        # Post-hook to correct eval loss via weighted all-reduce across CP ranks
        def eval_loss_correction_post_hook(_, __, output: ModelOutput) -> ModelOutput:
            if self._local_valid_tokens is None:
                return output
            if not hasattr(output, "loss") or output.loss is None:
                return output

            local_valid = self._local_valid_tokens.to(output.loss.device)
            loss = output.loss.detach().clone()

            # Handle rank with zero valid tokens (loss is NaN)
            if local_valid.item() == 0:
                weighted_loss = torch.zeros(1, device=loss.device, dtype=loss.dtype)
            else:
                weighted_loss = loss * local_valid

            total_valid = local_valid.clone()
            dist.all_reduce(
                weighted_loss,
                op=dist.ReduceOp.SUM,
                group=self.process_group,
            )
            dist.all_reduce(
                total_valid,
                op=dist.ReduceOp.SUM,
                group=self.process_group,
            )

            if total_valid.item() > 0:
                output["loss"] = (weighted_loss / total_valid).squeeze()
            else:
                output["loss"] = torch.tensor(
                    float("nan"), device=loss.device, dtype=loss.dtype
                )

            self._local_valid_tokens = None
            return output

        # Register hooks
        for model in self.models:
            self.hook_handles.append(
                model.register_forward_pre_hook(
                    sequence_parallel_pre_hook, with_kwargs=True
                )
            )
            if self.gather_outputs:
                self.hook_handles.append(
                    model.register_forward_hook(sequence_parallel_post_hook)
                )
            # Always register eval loss correction hook
            self.hook_handles.append(
                model.register_forward_hook(eval_loss_correction_post_hook)
            )

    def _gather_outputs(self, output: CausalLMOutputWithPast) -> CausalLMOutputWithPast:
        """Gather sharded outputs from all ranks and reconstruct the full tensor."""
        for key, value in output.items():
            if isinstance(value, torch.Tensor) and value.dim() > 1:
                output[key] = AllGatherWithGrad.apply(value, self.process_group)

        return output


class AllGatherWithGrad(torch.autograd.Function):
    """Custom autograd function for all-gather to preserve gradients."""

    @staticmethod
    def forward(
        ctx: torch.autograd.function.FunctionCtx,
        input_tensor: torch.Tensor,
        group: dist.ProcessGroup,
    ) -> torch.Tensor:
        """
        Forward pass of all-gather of data with sequence dimension.

        Args:
            ctx: `torch.autograd` function context.
            input_tensor: Tensor from model output with sequence dimension.
            group: `torch.distributed` process group.

        Returns:
            Tensor from gathering the `input_tensor` from across the process group and
                concatenating along the sequence dimension.
        """
        ctx.group = group
        ctx.rank = dist.get_rank(group)
        world_size = dist.get_world_size(group)

        # Gather shape metadata
        local_shape = torch.tensor(list(input_tensor.shape), device=input_tensor.device)
        all_shapes = [torch.zeros_like(local_shape) for _ in range(world_size)]
        dist.all_gather(all_shapes, local_shape, group=group)

        # Store sequence lengths for backward pass
        seq_lens = [int(shape[1].item()) for shape in all_shapes]
        ctx.seq_lens = seq_lens

        # Perform all_gather operation
        gathered = [
            torch.zeros(
                tuple(shape.tolist()),
                dtype=input_tensor.dtype,
                device=input_tensor.device,
            )
            for shape in all_shapes
        ]
        dist.all_gather(gathered, input_tensor, group=group)

        # Concatenate tensors along sequence dimension
        result = torch.cat(gathered, dim=1)

        return result

    @staticmethod
    def backward(
        ctx: torch.autograd.function.FunctionCtx, grad_output: torch.Tensor
    ) -> tuple[torch.Tensor, None]:
        """
        Backward pass for all-gather operation.

        Extracts the gradient slice corresponding to this rank's original input
        from the full gradient tensor.

        Args:
            ctx: `torch.autograd` function context.
            grad_output: Gradient from subsequent layers with respect to the
                concatenated output tensor.

        Returns:
            Tuple containing the gradient slice for this rank's input tensor and `None`
                for the process group parameter which doesn't require gradients.
        """
        rank = ctx.rank
        seq_lens = ctx.seq_lens

        # Extract gradient for this rank's chunk
        offset = sum(seq_lens[:rank])
        grad_slice = grad_output[:, offset : offset + seq_lens[rank]].contiguous()

        return grad_slice, None


================================================
FILE: src/axolotl/utils/data/__init__.py
================================================
"""Init for `axolotl.utils.data` module."""

from axolotl.utils.data.rl import prepare_preference_datasets
from axolotl.utils.data.sft import (
    get_dataset_wrapper,
    prepare_datasets,
)
from axolotl.utils.data.streaming import (
    encode_streaming,
    wrap_streaming_dataset,
)
from axolotl.utils.data.utils import md5

__all__ = [
    "encode_streaming",
    "wrap_streaming_dataset",
    "prepare_preference_datasets",
    "get_dataset_wrapper",
    "prepare_datasets",
    "md5",
]


================================================
FILE: src/axolotl/utils/data/lock.py
================================================
"""Logic for loading / preparing a dataset once over all processes."""

import time
from pathlib import Path
from typing import Any, Callable

from filelock import FileLock

from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
from axolotl.utils.dict import DictDefault

LOCK_FILE_NAME = "datasets_prep.lock"
READY_FILE_NAME = "datasets_ready.flag"
PROCESS_COUNTER_FILE_NAME = "process_counter.txt"


class FileLockLoader:
    """
    Simple class for abstracting single process data loading / processing. The first
    process that creates a lock file does the work; the remaining procesees simply load
    the preprocessed dataset once the first process is done.
    """

    def __init__(self, cfg: DictDefault):
        self.cfg = cfg
        self.dataset_prepared_path = (
            cfg.dataset_prepared_path or DEFAULT_DATASET_PREPARED_PATH
        )
        self.lock_file_path = Path(self.dataset_prepared_path) / LOCK_FILE_NAME
        self.ready_flag_path = Path(self.dataset_prepared_path) / READY_FILE_NAME
        self.counter_path = Path(self.dataset_prepared_path) / PROCESS_COUNTER_FILE_NAME

    def load(self, load_fn: Callable[[], Any]) -> Any:
        with FileLock(str(self.lock_file_path)):
            self._increment_counter()

            if not self.ready_flag_path.exists():
                result = load_fn()
                self.ready_flag_path.touch()
                return result

            while not self.ready_flag_path.exists():
                time.sleep(1)
            return load_fn()

    def _increment_counter(self):
        """Safely increment the process counter."""
        if self.counter_path.exists():
            counter_content = self.counter_path.read_text().strip()
            count = int(counter_content) if counter_content else 0
        else:
            count = 0
        self.counter_path.write_text(str(count + 1))

    def cleanup(self):
        """Clean up ready flag when last process is done."""
        try:
            with FileLock(str(self.lock_file_path)):
                counter_content = self.counter_path.read_text().strip()
                count = int(counter_content) if counter_content else 0
                count -= 1

                if count <= 0:
                    # Last process cleans everything up
                    self.ready_flag_path.unlink(missing_ok=True)
                    self.counter_path.unlink(missing_ok=True)
                else:
                    # Still have active processes
                    self.counter_path.write_text(str(count))
        except FileNotFoundError:
            # Lock file might have already been deleted by another process
            pass


================================================
FILE: src/axolotl/utils/data/rl.py
================================================
"""Data handling specific to RL trainers."""

import inspect
from functools import partial
from typing import Any, Callable, Literal

from datasets import Dataset, DatasetDict
from transformers import PreTrainedTokenizer

from axolotl.loaders import load_tokenizer
from axolotl.prompt_strategies.dpo import load as load_dpo
from axolotl.prompt_strategies.kto import load as load_kto
from axolotl.prompt_strategies.orpo import load as load_orpo
from axolotl.utils.data.lock import FileLockLoader
from axolotl.utils.data.shared import (
    create_train_validation_split,
    datasets_with_name_generator,
    generate_dataset_hash_from_config,
    load_dataset_with_config,
    load_preprocessed_dataset,
    merge_datasets,
    save_preprocessed_dataset,
    try_load_from_hub,
)
from axolotl.utils.data.utils import (
    deduplicate_and_log_datasets,
    retry_on_request_exceptions,
)
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.enums import RLType

LOG = get_logger(__name__)


@retry_on_request_exceptions(max_retries=3, delay=5)
def prepare_preference_datasets(
    cfg: DictDefault, tokenizer: PreTrainedTokenizer
) -> tuple[Dataset, Dataset | None]:
    """Load and prepare preference datasets for RL training.

    Loads training and evaluation datasets, handling preprocessing, caching, and
    deduplication as configured. Uses FileLock for distributed coordination.

    Args:
        cfg: Configuration object containing dataset and training settings.
        tokenizer: Tokenizer to use for processing text.

    Returns:
        Tuple of (train_dataset, eval_dataset). eval_dataset may be None
            if no evaluation dataset is configured.
    """

    def _load_datasets():
        # Load training dataset
        train_dataset = _load_or_create_dataset_split(cfg, tokenizer, split="train")

        # Load or create evaluation dataset
        eval_dataset: Dataset | None = None
        if cfg.test_datasets:
            eval_dataset = _load_or_create_dataset_split(cfg, tokenizer, split="test")
        elif cfg.val_set_size:
            # Create validation split from training data
            train_dataset, eval_dataset = create_train_validation_split(
                train_dataset, cfg, cfg.val_set_size
            )

        return train_dataset, eval_dataset

    # Prepare datasets (with file locking logic for multiple ranks)
    loader = FileLockLoader(cfg)
    try:
        train_dataset, eval_dataset = loader.load(_load_datasets)
    finally:
        loader.cleanup()

    # Apply deduplication if configured
    if cfg.dataset_exact_deduplication:
        train_dataset, eval_dataset = deduplicate_and_log_datasets(
            dataset=train_dataset, other_dataset=eval_dataset
        )

    return train_dataset, eval_dataset


def _map_dataset(
    cfg: DictDefault,
    dataset: Dataset | DatasetDict,
    ds_transform_fn: Callable[..., Any],
    tokenizer: Any | None = None,
    **map_kwargs: Any,
) -> Dataset:
    """Apply transformation function to dataset.

    Args:
        cfg: Configuration object.
        dataset: Dataset to transform.
        ds_transform_fn: Transformation function to apply.
        tokenizer: Optional tokenizer for transformation.
        **map_kwargs: Additional arguments for dataset mapping.

    Returns:
        Transformed dataset.
    """
    sig = inspect.signature(ds_transform_fn)
    if "tokenizer" in sig.parameters:
        if not tokenizer:
            tokenizer = load_tokenizer(cfg)
        ds_transform_fn = partial(ds_transform_fn, tokenizer=tokenizer)

    if isinstance(dataset, DatasetDict):
        dataset = dataset["train"]

    dataset = dataset.map(
        ds_transform_fn,
        num_proc=cfg.dataset_num_proc,
        load_from_cache_file=not cfg.is_preprocess,
        desc="Mapping RL Dataset",
        **map_kwargs,
    )

    return dataset


def _drop_long_sequences(
    sample: dict[str, Any], rl: RLType, tokenizer: Any, sequence_len: int
) -> bool:
    """Filter out samples that exceed maximum sequence length.

    Args:
        sample: Dataset sample to check.
        rl: Reinforcement learning type.
        tokenizer: Tokenizer for length calculation.
        sequence_len: Maximum allowed sequence length.

    Returns:
        True if sample should be kept, False if it should be dropped.

    Raises:
        ValueError: If required keys are missing or RL type is unknown.
    """
    if rl in {RLType.DPO, RLType.IPO, RLType.ORPO, RLType.SIMPO}:
        if not (
            sample.get("prompt") and sample.get("chosen") and sample.get("rejected")
        ):
            raise ValueError(
                "Prompt, chosen and rejected keys are required for DPO/ORPO datasets"
            )

        prompt = sample["prompt"]
        chosen = sample["chosen"]
        rejected = sample["rejected"]

        len_prompt = len(tokenizer(prompt, add_special_tokens=False)["input_ids"])
        len_chosen = len(tokenizer(chosen, add_special_tokens=False)["input_ids"])
        len_rejected = len(tokenizer(rejected, add_special_tokens=False)["input_ids"])

        return (len_prompt + len_chosen) <= sequence_len and (
            len_prompt + len_rejected
        ) <= sequence_len

    if rl is RLType.KTO:
        if not (sample.get("prompt") and sample.get("completion")):
            raise ValueError("Prompt and completion keys are required for KTO datasets")

        prompt = sample["prompt"]
        completion = sample["completion"]

        len_prompt = len(tokenizer(prompt, add_special_tokens=False)["input_ids"])
        len_completion = len(
            tokenizer(completion, add_special_tokens=False)["input_ids"]
        )

        return (len_prompt + len_completion) <= sequence_len

    if rl in {RLType.GRPO, RLType.GDPO}:
        return True

    raise ValueError("Unknown RL type")


def _load_split(cfg: DictDefault, split: Literal["train", "test"]) -> Dataset:
    """Load and process dataset split for RL training.

    Args:
        cfg: Configuration object containing dataset settings.
        split: Dataset split to load ("train" or "test").

    Returns:
        Combined and processed dataset for the specified split.
    """
    datasets_configs = cfg.datasets if split == "train" else cfg.test_datasets
    split_datasets: list[Dataset | DatasetDict] = []

    for dataset_config in datasets_with_name_generator(datasets_configs):
        dataset: Dataset | DatasetDict = load_dataset_with_config(
            dataset_config, cfg.hf_use_auth_token, streaming=False
        )
        split_datasets.append(dataset)

    tokenizer = load_tokenizer(cfg)

    for i, dataset in enumerate(split_datasets):
        _type = datasets_configs[i]["type"]
        if _type:
            if isinstance(_type, DictDefault):
                _type = "user_defined.default"
            if cfg.rl is RLType.ORPO:
                ds_transform_fn = load_orpo(_type, cfg, dataset_idx=i)
            elif cfg.rl is RLType.KTO:
                ds_transform_fn = load_kto(_type, cfg, dataset_idx=i)
            else:
                ds_transform_fn = load_dpo(_type, cfg, dataset_idx=i)

            map_kwargs: dict[str, Any] = {}
            if isinstance(ds_transform_fn, tuple):
                ds_transform_fn, map_kwargs = ds_transform_fn
            split_datasets[i] = _map_dataset(
                cfg, dataset, ds_transform_fn, tokenizer, **map_kwargs
            )
        else:
            # If no `type` is provided, assume the dataset is already in the expected format with
            # "prompt", "chosen", and "rejected" already preprocessed
            split_datasets[i] = dataset

        if not cfg.skip_prepare_dataset:
            drop_long = partial(
                _drop_long_sequences,
                rl=cfg.rl,
                tokenizer=tokenizer,
                sequence_len=cfg.sequence_len,
            )

            prior_len = len(split_datasets[i])
            split_datasets[i] = split_datasets[i].filter(
                drop_long,
                num_proc=cfg.dataset_num_proc,
                load_from_cache_file=not cfg.is_preprocess,
                desc="Dropping Long Sequences",
            )
            dropped = prior_len - len(split_datasets[i])
            if dropped:
                LOG.warning(f"Dropped {dropped} long samples from dataset index {i}")

    # Merge datasets
    dataset = merge_datasets(split_datasets, cfg)

    if not cfg.skip_prepare_dataset:
        # Deduplicate before saving so the saved dataset is already de-duplicated
        if cfg.dataset_exact_deduplication:
            dataset, _ = deduplicate_and_log_datasets(dataset=dataset)

        # Save preprocessed dataset
        dataset_hash = generate_dataset_hash_from_config(
            cfg, datasets_configs, tokenizer.name_or_path
        )
        save_preprocessed_dataset(cfg, dataset, dataset_hash, split)

    return dataset


def _load_or_create_dataset_split(
    cfg: DictDefault, tokenizer: PreTrainedTokenizer, split: Literal["train", "test"]
) -> Dataset:
    """Load preprocessed dataset or create new one for given split.

    Args:
        cfg: Configuration object.
        tokenizer: Tokenizer to use for processing text.
        split: Dataset split to load.

    Returns:
        Tuple of (dataset, is_preprocessed).
    """
    # Select correct dataset configuration based on split
    datasets_config = cfg.datasets if split == "train" else cfg.test_datasets

    # Generate dataset hash for caching
    dataset_hash = generate_dataset_hash_from_config(
        cfg, datasets_config, tokenizer.name_or_path
    )

    # Try loading from hub if push_dataset_to_hub is configured
    dataset = None
    if cfg.push_dataset_to_hub:
        dataset = try_load_from_hub(cfg, dataset_hash, split)

    # Attempt to load preprocessed dataset
    if dataset is None:
        dataset = load_preprocessed_dataset(cfg, dataset_hash)

    # Otherwise, load it
    if dataset is None:
        dataset = _load_split(cfg, split=split)

    return dataset


================================================
FILE: src/axolotl/utils/data/sft.py
================================================
"""Data handling specific to SFT."""

import functools
import os
import tempfile
from typing import Literal

from datasets import (
    Dataset,
    DatasetDict,
    IterableDataset,
    IterableDatasetDict,
    load_dataset,
)
from transformers import PreTrainedTokenizer, ProcessorMixin

from axolotl.prompters import Prompter
from axolotl.utils.data.lock import FileLockLoader
from axolotl.utils.data.shared import (
    create_train_validation_split,
    datasets_with_name_generator,
    generate_dataset_hash_from_config,
    load_dataset_with_config,
    load_preprocessed_dataset,
    merge_datasets,
    save_preprocessed_dataset,
    try_load_from_hub,
)
from axolotl.utils.data.streaming import wrap_streaming_dataset
from axolotl.utils.data.utils import (
    deduplicate_and_log_datasets,
    handle_long_seq_in_dataset,
    retry_on_request_exceptions,
)
from axolotl.utils.data.wrappers import get_dataset_wrapper
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import is_local_main_process
from axolotl.utils.logging import get_logger
from axolotl.utils.trainer import (
    calculate_total_num_steps,
    process_datasets_for_packing,
)

LOG = get_logger(__name__)


@retry_on_request_exceptions(max_retries=3, delay=5)
def prepare_datasets(
    cfg: DictDefault,
    tokenizer: PreTrainedTokenizer,
    processor: ProcessorMixin | None = None,
) -> tuple[IterableDataset | Dataset, Dataset | None, int, list[Prompter | None]]:
    """Prepare training and evaluation datasets based on configuration.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        tokenizer: Tokenizer to use for processing text.
        processor: Optional processor for multimodal datasets.

    Returns:
        Tuple of (train_dataset, eval_dataset, total_steps, prompters).
    """
    if cfg.streaming or cfg.pretraining_dataset:
        return _prepare_streaming_dataset(cfg, tokenizer, processor)
    return _prepare_standard_dataset(cfg, tokenizer, processor)


def _prepare_standard_dataset(
    cfg: DictDefault,
    tokenizer: PreTrainedTokenizer,
    processor: ProcessorMixin | None,
) -> tuple[Dataset, Dataset | None, int, list[Prompter | None]]:
    """Prepare standard (non-pretraining) datasets."""

    def _load_datasets():
        # Always load training dataset
        train_dataset, eval_dataset, prompters = _load_and_prepare_datasets(
            tokenizer,
            cfg,
            split="train",
            processor=processor,
        )

        # Overwrite eval_dataset if test data exists
        if cfg.test_datasets:
            _, eval_dataset, _ = _load_and_prepare_datasets(
                tokenizer,
                cfg,
                split="test",
                processor=processor,
            )

        return train_dataset, eval_dataset, prompters

    # Prepare datasets (with file locking logic for multiple ranks)
    loader = FileLockLoader(cfg)
    try:
        train_dataset, eval_dataset, prompters = loader.load(_load_datasets)
    finally:
        loader.cleanup()

    if os.environ.get("AXOLOTL_IS_PREPROCESS") == "1":
        return train_dataset, eval_dataset, -1, prompters

    # Validate sample packing configuration for evaluation
    if eval_dataset and cfg.sample_packing and cfg.eval_sample_packing is not False:
        total_eval_steps = calculate_total_num_steps(cfg, eval_dataset, update=False)
        if total_eval_steps == 0:
            raise ValueError(
                "eval dataset split is too small for sample_packing. "
                "You should set `eval_sample_packing: False` in your config."
            )

    # Calculate total number of training steps
    if cfg.max_steps:
        total_num_steps = min(
            calculate_total_num_steps(cfg, train_dataset), cfg.max_steps
        )
    else:
        total_num_steps = calculate_total_num_steps(cfg, train_dataset)
    LOG.info(f"Maximum number of steps set at {total_num_steps}")
    return train_dataset, eval_dataset, total_num_steps, prompters


def _prepare_streaming_dataset(
    cfg: DictDefault,
    tokenizer: PreTrainedTokenizer,
    processor: ProcessorMixin | None,
) -> tuple[IterableDataset, Dataset | None, int, list[Prompter | None]]:
    """
    Prepare dataset for streaming mode.

    Note: Streaming datasets are loaded incrementally from the source.
    """
    if cfg.pretraining_dataset:
        dataset_config = _extract_pretraining_config(cfg)
        train_dataset = _load_streaming_dataset(dataset_config, cfg, tokenizer)
    elif cfg.sample_packing:
        # TODO(djsaunde): Implement for multiple datasets
        dataset_config = DictDefault(cfg.datasets[0])

        # Ensure we have a split set - default to 'train' if not specified
        if not hasattr(dataset_config, "split") or not dataset_config.split:
            dataset_config.split = "train"
        train_dataset = _load_streaming_dataset(dataset_config, cfg, tokenizer)
    else:
        # Use legacy loading function for non-packed streaming datasets
        train_dataset, eval_dataset, prompters = _load_and_prepare_datasets(
            tokenizer,
            cfg,
            split="train",
            processor=processor,
            streaming=True,
        )

        # Return early for non-packed streaming datasets
        total_num_steps = cfg.max_steps if cfg.max_steps else -1
        return train_dataset, eval_dataset, total_num_steps, prompters

    # Load evaluation dataset if specified
    eval_dataset = None
    if cfg.test_datasets:
        _, eval_dataset, _ = _load_and_prepare_datasets(
            tokenizer,
            cfg,
            split="test",
            processor=processor,
            streaming=False,
        )

    # For streaming, we return max_steps directly from config or -1 if not set
    total_num_steps = cfg.max_steps if cfg.max_steps else -1
    return train_dataset, eval_dataset, total_num_steps, []


def _extract_pretraining_config(cfg: DictDefault) -> DictDefault:
    """Extract pretraining configuration from the main config."""
    if isinstance(cfg.pretraining_dataset, list) and isinstance(
        cfg.pretraining_dataset[0], dict
    ):
        config = cfg.pretraining_dataset[0]
        return DictDefault(
            {
                "path": config["path"],
                "name": config["name"],
                "skip": config["skip"],
                "split": config.get("split", "train"),
                "data_files": config.get("data_files"),
                "type": config.get("type", "pretrain"),
            }
        )
    # Simple string path case
    return DictDefault(
        {
            "path": cfg.pretraining_dataset,
            "name": None,
            "skip": 0,
            "split": "train",
            "data_files": None,
            "type": "pretrain",
        }
    )


def _load_streaming_dataset(
    pretraining_config: DictDefault, cfg: DictDefault, tokenizer: PreTrainedTokenizer
) -> IterableDataset:
    """Load and prepare a streaming dataset for pretraining."""
    # Create dataset wrapper partial function
    dataset_wrapper_partial = functools.partial(
        get_dataset_wrapper,
        dataset_config=pretraining_config,
        tokenizer=tokenizer,
        cfg=cfg,
        dataset_base_type=pretraining_config["type"],
    )

    # Load the actual dataset
    if (
        cfg.accelerator_config
        and cfg.accelerator_config.dispatch_batches
        and not is_local_main_process()
    ):
        iter_dataset = _create_placeholder_dataset()
    else:
        iter_dataset = load_dataset(
            pretraining_config["path"],
            streaming=True,
            split=pretraining_config["split"],
            name=pretraining_config["name"],
            data_files=pretraining_config["data_files"],
        )

    # Apply skip if specified
    if pretraining_config["skip"]:
        LOG.info(f"Skipping {pretraining_config['skip']} samples from the dataset")
        iter_dataset = iter_dataset.skip(pretraining_config["skip"])

    # Wrap the dataset for pretraining
    train_dataset = wrap_streaming_dataset(
        iter_dataset,
        tokenizer,
        cfg,
        dataset_wrapper_partial,
    )

    # Format for PyTorch
    return train_dataset.with_format("torch")


def _create_placeholder_dataset() -> IterableDataset:
    """Create a minimal placeholder dataset for non-main processes."""
    with tempfile.NamedTemporaryFile(mode="w+", delete=False) as f:
        f.write("text\n")
        f.write("lorem ipsum dolor sit amet\n")
        f.seek(0)
        return load_dataset("csv", data_files=f.name, split="train", streaming=True)


def _load_tokenized_prepared_datasets(
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    split: Literal["train", "test"] = "train",
    processor: ProcessorMixin | None = None,
    streaming: bool = False,
) -> tuple[Dataset | DatasetDict, list[Prompter | None]]:
    """Load or create tokenized and prepared datasets for training or testing.

    Args:
        tokenizer: Tokenizer for processing text.
        cfg: Configuration object.
        split: Dataset split to load ('train' or 'test').
        processor: Optional processor for multimodal datasets.
        streaming: Whether to use iterable preprocessing.

    Returns:
        Tuple of (dataset, prompters list).
    """
    # Select correct dataset configuration based on split
    datasets_configs = cfg.datasets if split == "train" else cfg.test_datasets

    # Generate dataset hash for caching
    dataset_hash = generate_dataset_hash_from_config(
        cfg, datasets_configs, tokenizer.name_or_path
    )

    # Try loading from hub if push_dataset_to_hub is configured
    dataset = None
    if cfg.push_dataset_to_hub:
        dataset = try_load_from_hub(cfg, dataset_hash, split)

    # If not found on hub, try loading from disk
    if dataset is None:
        dataset = load_preprocessed_dataset(cfg, dataset_hash)

    # If not found on disk or skipping prepared dataset, load and process raw datasets
    prompters: list[Prompter | None] = []
    if dataset is None:
        dataset, prompters = _load_raw_datasets(
            cfg,
            datasets_configs,
            tokenizer,
            split,
            processor,
            streaming,
        )

    return dataset, prompters


def _load_raw_datasets(
    cfg: DictDefault,
    datasets_configs: list,
    tokenizer: PreTrainedTokenizer,
    split: str,
    processor: ProcessorMixin | None = None,
    streaming: bool = False,
) -> tuple[Dataset, list[Prompter | None]]:
    """Load, process, merge, and save raw datasets."""
    LOG.info("Loading raw datasets...", main_process_only=False)
    if not cfg.is_preprocess and not cfg.skip_prepare_dataset:
        LOG.warning(
            "Processing datasets during training can lead to VRAM instability. Please "
            "pre-process your dataset using `axolotl preprocess path/to/config.yml`."
        )

    # Load and process individual datasets
    datasets = []
    prompters = []
    for dataset_config in datasets_with_name_generator(datasets_configs):
        dataset_wrapper, dataset_prompter = _load_and_process_single_dataset(
            dataset_config=dataset_config,
            cfg=cfg,
            tokenizer=tokenizer,
            split=split,
            seed=cfg.seed,
            processor=processor,
            streaming=streaming,
        )
        datasets.append(dataset_wrapper)
        prompters.append(dataset_prompter)

    # Merge datasets
    dataset = merge_datasets(datasets, cfg)

    if not cfg.skip_prepare_dataset and not streaming:
        if split == "test" and cfg.eval_sequence_len:
            dataset = handle_long_seq_in_dataset(dataset, cfg.eval_sequence_len, cfg)
        else:
            dataset = handle_long_seq_in_dataset(dataset, cfg.sequence_len, cfg)
        if (split == "train" and cfg.sample_packing) or (
            split == "test" and cfg.eval_sample_packing
        ):
            dataset, _ = process_datasets_for_packing(cfg, dataset, None)

        # Deduplicate before saving so the saved dataset is already de-duplicated
        if cfg.dataset_exact_deduplication:
            dataset, _ = deduplicate_and_log_datasets(dataset=dataset)

        # Save the prepared dataset
        dataset_hash = generate_dataset_hash_from_config(
            cfg, datasets_configs, tokenizer.name_or_path
        )
        save_preprocessed_dataset(cfg, dataset, dataset_hash, split)

    return dataset, prompters


def _load_and_process_single_dataset(
    dataset_config: DictDefault,
    cfg: DictDefault,
    tokenizer: PreTrainedTokenizer,
    split: str,
    seed: int,
    processor: ProcessorMixin | None = None,
    streaming: bool = False,
) -> tuple[Dataset | IterableDataset, Prompter | None]:
    """Load and process a single dataset based on the passed config."""
    # Load the dataset
    dataset = load_dataset_with_config(
        dataset_config, cfg.hf_use_auth_token, streaming=streaming
    )

    # Parse dataset type
    d_base_type, d_prompt_style = _parse_dataset_type(dataset_config.type)

    # Select the appropriate split
    if isinstance(dataset, (DatasetDict, IterableDatasetDict)):
        if dataset_config.split and dataset_config.split in dataset:
            dataset = dataset[dataset_config.split]
        elif split in dataset:
            dataset = dataset[split]
        else:
            raise ValueError(
                f"no {split} split found for dataset {dataset_config.path}, you may "
                "specify a split with 'split: ...'"
            )

    # Apply sharding if configured
    if dataset_config.shards:
        shards_idx = dataset_config.get("shards_idx", 0)
        dataset = dataset.shuffle(seed=seed).shard(
            num_shards=dataset_config.shards, index=shards_idx
        )

    # Apply dataset wrapper
    dataset_wrapper, dataset_prompter = get_dataset_wrapper(
        dataset_config=dataset_config,
        tokenizer=tokenizer,
        cfg=cfg,
        dataset_base_type=d_base_type,
        dataset=dataset,
        dataset_prompt_style=d_prompt_style,
        processor=processor,
    )

    return dataset_wrapper, dataset_prompter


def _parse_dataset_type(d_type: str) -> tuple[str | None, str | None]:
    """Parse the dataset type string into base type and prompt style."""
    if not isinstance(d_type, str):
        return None, None

    d_type_split = d_type.split(":")
    d_base_type = d_type_split[0]
    d_prompt_style = d_type_split[1] if len(d_type_split) > 1 else None

    return d_base_type, d_prompt_style


def _handle_train_dataset_split(
    dataset: Dataset, cfg: DictDefault
) -> tuple[Dataset, Dataset | None]:
    """Handle processing for train split, including validation set creation."""
    val_set_size = (
        int(cfg.val_set_size) if cfg.val_set_size > 1 else float(cfg.val_set_size)
    )

    if val_set_size:
        # Create train/validation split
        train_dataset, eval_dataset = create_train_validation_split(
            dataset, cfg, val_set_size
        )
        return train_dataset, eval_dataset

    # No validation split - deduplication already applied during preprocessing
    return dataset, None


def _apply_dataset_sharding(dataset: Dataset, cfg: DictDefault) -> Dataset:
    """Apply dataset sharding if configured.

    Args:
        dataset: Dataset to shard.
        cfg: Configuration object containing shard settings.

    Returns:
        Sharded dataset or original dataset if no sharding configured.
    """
    if cfg.dataset_shard_num and cfg.dataset_shard_idx is not None:
        LOG.info(
            f"Using index #{cfg.dataset_shard_idx} of {cfg.dataset_shard_num} shards"
        )
        dataset = dataset.shard(
            num_shards=cfg.dataset_shard_num,
            index=cfg.dataset_shard_idx,
        )
    return dataset


def _load_and_prepare_datasets(
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    split: Literal["train", "test"] = "train",
    processor: ProcessorMixin | None = None,
    streaming: bool = False,
) -> tuple[Dataset | None, Dataset | None, list[Prompter | None]]:
    """Load and prepare datasets with optional validation split and sharding.

    Args:
        tokenizer: Tokenizer for processing text.
        cfg: Configuration object.
        split: Dataset split to load ('train' or 'test').
        processor: Optional processor for multimodal datasets.
        streaming: Whether to use iterable preprocessing.

    Returns:
        Tuple of (train_dataset, eval_dataset, prompters).
    """
    # Load the base dataset
    dataset, prompters = _load_tokenized_prepared_datasets(
        tokenizer,
        cfg,
        split=split,
        processor=processor,
        streaming=streaming,
    )

    # Apply dataset sharding if configured using shared function
    dataset = _apply_dataset_sharding(dataset, cfg)

    # Apply deduplication and create train / validation splits based on the split type
    if split == "train":
        train_dataset, eval_dataset = _handle_train_dataset_split(dataset, cfg)
    else:
        # Deduplication already applied during preprocessing
        train_dataset, eval_dataset = None, dataset

    return train_dataset, eval_dataset, prompters


================================================
FILE: src/axolotl/utils/data/shared.py
================================================
"""Dataset loading shared utils."""

from __future__ import annotations

import functools
import os
from pathlib import Path
from typing import TYPE_CHECKING, Any, Generator

from datasets import (
    Dataset,
    DatasetDict,
    IterableDataset,
    IterableDatasetDict,
    concatenate_datasets,
    load_dataset,
    load_from_disk,
)
from huggingface_hub import hf_hub_download, snapshot_download
from huggingface_hub.errors import (
    HFValidationError,
    RepositoryNotFoundError,
    RevisionNotFoundError,
)

from axolotl.common.const import DEFAULT_DATASET_PREPARED_PATH
from axolotl.utils.data.utils import deduplicate_and_log_datasets, md5
from axolotl.utils.datasets import get_default_process_count
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

if TYPE_CHECKING:
    from adlfs import AzureBlobFileSystem
    from gcsfs import GCSFileSystem
    from ocifs import OCIFileSystem
    from s3fs import S3FileSystem

LOG = get_logger(__name__)

EXTENSIONS_TO_DATASET_TYPES = {
    ".parquet": "parquet",
    ".arrow": "arrow",
    ".csv": "csv",
    ".txt": "text",
}


def get_dataset_type(dataset_config: DictDefault) -> str:
    """Get the dataset type from the path if it's not specified."""
    if dataset_config.ds_type:
        return dataset_config.ds_type

    for extension, dataset_type in EXTENSIONS_TO_DATASET_TYPES.items():
        if extension in dataset_config.path:
            return dataset_type

    return "json"


def datasets_with_name_generator(
    dataset_configs: list[DictDefault],
) -> Generator[DictDefault, None, None]:
    """Yields expanded dataset configurations based on multiple names or preprocessing
    shards.

    When a dataset config has a list of names, it yields separate configs for each
    name. When a dataset config specifies preprocessing shards, it yields configs for
    each shard.

    Args:
        dataset_configs: List of dataset configuration objects.

    Yields:
        Individual dataset configurations, expanded as needed for names or shards.
    """
    for config in dataset_configs:
        if config.name and isinstance(config.name, list):
            for name in config.name:
                yield DictDefault({**config, "name": name})
        elif config.preprocess_shards and not config.shards:
            for shard_idx in range(config.preprocess_shards):
                yield DictDefault(
                    {
                        **config,
                        "shards": config.preprocess_shards,
                        "shards_idx": shard_idx,
                    }
                )
        else:
            yield config


def load_dataset_with_config(
    dataset_config: DictDefault, use_auth_token: bool, streaming=False
) -> Dataset | IterableDataset:
    """Load a dataset from a config. Handles datasets that are stored locally, in the
    HuggingFace Hub, in a remote filesystem (S3, GCS, Azure, OCI), a URL, or
    `data_files`.

    Args:
        dataset_config: Single dataset config.
        use_auth_token: Whether to use HF auth token.
        streaming: Whether to stream the dataset.

    Returns:
        Loaded dataset.
    """
    # Set up common kwargs for dataset loading
    load_dataset_kwargs = {
        "split": dataset_config.split if dataset_config.split else None,
        "name": dataset_config.name,
        "streaming": streaming,
        "trust_remote_code": dataset_config.trust_remote_code,
    }

    # First check if it's a local path
    if Path(dataset_config.path).exists():
        return _load_from_local_path(dataset_config, load_dataset_kwargs)

    # Check if it's a HuggingFace dataset
    is_hub_dataset = _check_if_hub_dataset(dataset_config, use_auth_token)

    # Check if it's a cloud storage path and get appropriate filesystem
    remote_fs, storage_options = _get_remote_filesystem(dataset_config.path)
    is_cloud_dataset = False
    if remote_fs:
        try:
            is_cloud_dataset = remote_fs.exists(dataset_config.path)
        except (FileNotFoundError, ConnectionError):
            pass

    # Load from appropriate source
    if is_hub_dataset:
        return _load_from_hub(dataset_config, use_auth_token, load_dataset_kwargs)
    if is_cloud_dataset:
        return _load_from_cloud(
            dataset_config, remote_fs, storage_options, load_dataset_kwargs
        )
    if dataset_config.path.startswith("https://"):
        return _load_from_url(dataset_config, load_dataset_kwargs)
    if dataset_config.data_files:
        return _load_from_data_files(dataset_config, load_dataset_kwargs)

    raise ValueError(
        f"The dataset could not be loaded. This could be due to a misconfigured dataset path "
        f"({dataset_config.path}). Try double-check your path / name / data_files. "
        f"This is not caused by the dataset type."
    )


def _check_if_hub_dataset(dataset_config: DictDefault, use_auth_token: bool) -> bool:
    """Check if a dataset exists on the HuggingFace Hub."""
    try:
        snapshot_download(
            repo_id=dataset_config.path,
            repo_type="dataset",
            token=use_auth_token,
            revision=dataset_config.revision,
            ignore_patterns=["*"],
        )
        return True
    except (
        RepositoryNotFoundError,
        RevisionNotFoundError,
        FileNotFoundError,
        ConnectionError,
        HFValidationError,
        ValueError,
    ):
        return False


def _get_remote_filesystem(
    path: str,
) -> tuple[
    S3FileSystem | GCSFileSystem | AzureBlobFileSystem | OCIFileSystem | None, dict
]:
    """Get the appropriate filesystem for a remote path."""
    if path.startswith("s3://"):
        try:
            import s3fs

            storage_options = {"anon": False}
            return s3fs.S3FileSystem(**storage_options), storage_options
        except ImportError as exc:
            raise ImportError("s3:// paths require s3fs to be installed") from exc

    elif path.startswith(("gs://", "gcs://")):
        try:
            import gcsfs

            storage_options = {"token": None}  # type: ignore  # nosec B105
            return gcsfs.GCSFileSystem(**storage_options), storage_options
        except ImportError as exc:
            raise ImportError(
                "gs:// or gcs:// paths require gcsfs to be installed"
            ) from exc

    elif path.startswith(("adl://", "abfs://", "az://")):
        try:
            import adlfs

            storage_options = {"anon": False}
            return adlfs.AzureBlobFileSystem(**storage_options), storage_options
        except ImportError as exc:
            raise ImportError(
                "adl:// or abfs:// paths require adlfs to be installed"
            ) from exc

    elif path.startswith("oci://"):
        try:
            import ocifs

            storage_options = {}
            return ocifs.OCIFileSystem(**storage_options), storage_options
        except ImportError as exc:
            raise ImportError("oci:// paths require ocifs to be installed") from exc

    return None, {}


def _load_from_local_path(
    dataset_config: DictDefault, load_dataset_kwargs: dict
) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
    """Load a dataset from a local path."""
    local_path = Path(dataset_config.path)

    if local_path.is_dir():
        if dataset_config.data_files:
            dataset_type = get_dataset_type(dataset_config)
            return load_dataset(
                dataset_type,
                data_files=dataset_config.data_files,
                **load_dataset_kwargs,
            )
        try:
            return load_from_disk(dataset_config.path)
        except FileNotFoundError:
            return load_dataset(dataset_config.path, **load_dataset_kwargs)
    elif local_path.is_file():
        dataset_type = get_dataset_type(dataset_config)

        # For single file datasets, HF always creates only a "train" split
        if dataset_type in ("json", "csv", "text"):
            load_dataset_kwargs["split"] = "train"

        return load_dataset(
            dataset_type,
            data_files=dataset_config.path,
            **load_dataset_kwargs,
        )
    else:
        raise ValueError(
            "Unhandled dataset load: local path exists, but is neither a directory or a file"
        )


def _load_from_hub(
    dataset_config: DictDefault, use_auth_token: bool, load_dataset_kwargs: dict
) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
    """Load a dataset from the HuggingFace Hub."""
    return load_dataset(
        dataset_config.path,
        data_files=dataset_config.data_files,
        token=use_auth_token,
        revision=dataset_config.revision,
        **load_dataset_kwargs,
    )


def _load_from_cloud(
    dataset_config: DictDefault,
    remote_fs: S3FileSystem | GCSFileSystem | AzureBlobFileSystem | OCIFileSystem,
    storage_options: dict,
    load_dataset_kwargs: dict,
) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
    """Load a dataset from cloud storage."""
    if remote_fs.isdir(dataset_config.path):
        return load_from_disk(
            dataset_config.path,
            storage_options=storage_options,
        )

    if remote_fs.isfile(dataset_config.path):
        dataset_type = get_dataset_type(dataset_config)
        return load_dataset(
            dataset_type,
            data_files=dataset_config.path,
            storage_options=storage_options,
            **load_dataset_kwargs,
        )

    raise ValueError(
        f"Cloud path {dataset_config.path} is neither a directory nor a file"
    )


def _load_from_url(
    dataset_config: DictDefault, load_dataset_kwargs: dict
) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
    """Load a dataset from a URL."""
    dataset_type = get_dataset_type(dataset_config)
    return load_dataset(
        dataset_type,
        data_files=dataset_config.path,
        **load_dataset_kwargs,
    )


def _load_from_data_files(
    dataset_config: DictDefault, load_dataset_kwargs: dict
) -> Dataset | IterableDataset | DatasetDict | IterableDatasetDict:
    """Load a dataset from data files."""
    file_path = None

    if isinstance(dataset_config.data_files, str):
        file_path = hf_hub_download(
            repo_id=dataset_config.path,
            repo_type="dataset",
            filename=dataset_config.data_files,
            revision=dataset_config.revision,
        )
    elif isinstance(dataset_config.data_files, list):
        file_path = [
            hf_hub_download(
                repo_id=dataset_config.path,
                repo_type="dataset",
                filename=file,
                revision=dataset_config.revision,
            )
            for file in dataset_config.data_files
        ]
    else:
        raise ValueError("data_files must be either a string or list of strings")

    return load_dataset("json", data_files=file_path, **load_dataset_kwargs)


def generate_split_fingerprints(
    dataset: Dataset, val_set_size: int | float, seed: int
) -> tuple[str, str]:
    """Generate consistent fingerprints for train/test splits."""
    fingerprint = dataset._fingerprint

    train_hash_input = f"{fingerprint}|{val_set_size}|train|{seed}"
    test_hash_input = f"{fingerprint}|{val_set_size}|test|{seed}"

    train_fingerprint = md5(train_hash_input)
    test_fingerprint = md5(test_hash_input)

    return train_fingerprint, test_fingerprint


def get_prepared_dataset_path(cfg: DictDefault, dataset_hash: str) -> Path:
    """Get standardized path for prepared datasets.

    Args:
        cfg: Configuration object.
        dataset_hash: Hash identifying the specific dataset configuration.

    Returns:
        Path where the prepared dataset should be stored.
    """
    base_path = cfg.dataset_prepared_path or DEFAULT_DATASET_PREPARED_PATH
    return Path(base_path) / dataset_hash


def create_train_validation_split(
    dataset: Dataset, cfg: DictDefault, val_set_size: int | float
) -> tuple[Dataset, Dataset]:
    """Create train/validation split with consistent fingerprinting.

    Args:
        dataset: Dataset to split.
        cfg: Configuration object containing seed and other settings.
        val_set_size: Size of validation set (absolute number or fraction).

    Returns:
        Tuple of (train_dataset, eval_dataset).
    """
    train_fingerprint, test_fingerprint = generate_split_fingerprints(
        dataset, val_set_size, cfg.seed
    )

    # Apply deduplication before splitting if configured
    if cfg.dataset_exact_deduplication:
        dataset, _ = deduplicate_and_log_datasets(dataset=dataset)

    split_dataset = dataset.train_test_split(
        test_size=val_set_size,
        shuffle=False,
        seed=cfg.seed,
        train_new_fingerprint=train_fingerprint,
        test_new_fingerprint=test_fingerprint,
    )

    return split_dataset["train"], split_dataset["test"]


def _generate_from_iterable_dataset(
    dataset: IterableDataset, worker_id: list[int], num_workers: list[int]
) -> Generator[Any, None, None]:
    """Generator function to correctly split the dataset for each worker"""
    for i, item in enumerate(dataset):
        if i % num_workers[0] == worker_id[0]:
            yield item


def save_preprocessed_dataset(
    cfg: DictDefault,
    dataset: Dataset,
    dataset_hash: str,
    split: str,
) -> None:
    """Save preprocessed dataset to disk and optionally push to the HF Hub."""
    prepared_ds_path = get_prepared_dataset_path(cfg, dataset_hash)
    num_workers = cfg.dataset_num_proc or get_default_process_count()
    if isinstance(dataset, IterableDataset):
        ds_from_iter = Dataset.from_generator(
            functools.partial(_generate_from_iterable_dataset, dataset),
            features=dataset.features,
            num_proc=num_workers,
            split=split,
            gen_kwargs={
                "worker_id": list(range(num_workers)),
                "num_workers": [num_workers] * num_workers,
            },
        )
        ds_from_iter.save_to_disk(
            str(prepared_ds_path),
            num_proc=num_workers,
            max_shard_size=None,
            num_shards=cfg.num_dataset_shards_to_save,
        )
    else:
        min_rows_per_proc = 256
        os.makedirs(prepared_ds_path, exist_ok=True)
        dataset.save_to_disk(
            str(prepared_ds_path),
            num_proc=min(max(1, len(dataset) // min_rows_per_proc), num_workers),
            max_shard_size=None,
            num_shards=cfg.num_dataset_shards_to_save,
        )
    if cfg.push_dataset_to_hub:
        LOG.info(
            "Pushing merged prepared dataset to Huggingface hub at "
            f"{cfg.push_dataset_to_hub} (version {dataset_hash})...",
            main_process_only=False,
        )
        dataset.push_to_hub(
            cfg.push_dataset_to_hub,
            dataset_hash,
            private=True,
        )


def load_preprocessed_dataset(cfg: DictDefault, dataset_hash: str) -> Dataset | None:
    """Load preprocessed dataset from disk if available.

    Args:
        cfg: Configuration object.
        dataset_hash: Hash identifying the dataset configuration.

    Returns:
        Loaded dataset if found and conditions are met, None otherwise.
    """
    prepared_ds_path = get_prepared_dataset_path(cfg, dataset_hash)

    if (
        cfg.dataset_prepared_path
        and any(prepared_ds_path.glob("*"))
        and not cfg.skip_prepare_dataset
        and not cfg.is_preprocess
    ):
        LOG.info(
            f"Loading prepared dataset from disk at {prepared_ds_path}...",
        )
        return load_from_disk(str(prepared_ds_path))

    LOG.info(
        f"Unable to find prepared dataset in {prepared_ds_path}",
    )
    return None


def try_load_from_hub(
    cfg: DictDefault, dataset_hash: str, split: str
) -> Dataset | None:
    """Try to load the prepared dataset from HuggingFace Hub."""
    try:
        LOG.info(
            "Attempting to load prepared dataset from HuggingFace Hub at "
            f"{cfg.push_dataset_to_hub} (version {dataset_hash})..."
        )
        dataset = load_dataset(
            cfg.push_dataset_to_hub,
            dataset_hash,
            token=cfg.hf_use_auth_token,
        )
        return dataset[split]
    except Exception:
        LOG.info("Unable to find prepared dataset in HuggingFace Hub")
        return None


def generate_dataset_hash_from_config(
    cfg: DictDefault, cfg_datasets: list, tokenizer_name: str
) -> str:
    """Generate a hash to uniquely identify a dataset configuration for SFT.

    Args:
        cfg: Main configuration object.
        cfg_datasets: List of dataset configurations.
        tokenizer_name: Name of the tokenizer being used.

    Returns:
        MD5 hash string representing the configuration.
    """
    config_str = (
        f"{cfg.sequence_len}@{cfg.sample_packing}@{cfg.eval_sample_packing}@"
        f"{cfg.group_by_length}@{cfg.kd_temperature or 1.0}@"
        f"{cfg.dataset_exact_deduplication or False}|"
        f"{'|'.join(sorted([f'{d.path}:{d.type}:{d.shards}:{d.conversation}:{d.split}:{d.temperature or 1.0}' for d in cfg_datasets]))}"
        f"|{tokenizer_name}"
    )
    return str(md5(config_str))


def merge_datasets(datasets: list[Dataset], cfg: DictDefault) -> Dataset:
    """Merge multiple datasets into one with optional shuffling.

    Args:
        datasets: List of datasets to merge.
        cfg: Configuration object containing shuffle settings.

    Returns:
        Merged dataset.
    """
    if len(datasets) == 1:
        ds = datasets[0]

        # Do not shuffle if curriculum sampling is enabled or
        # shuffle_merged_datasets is disabled
        if cfg.curriculum_sampling or not cfg.shuffle_merged_datasets:
            return ds

        return ds.shuffle(seed=cfg.seed)

    # If enabled, shuffle each dataset independently before merging.
    # This allows curriculum learning strategies to be applied at the dataset level.
    if cfg.shuffle_before_merging_datasets:
        LOG.info("Shuffling each dataset individually before merging...")
        datasets = [ds.shuffle(seed=cfg.seed) for ds in datasets]

    LOG.info("Merging datasets...")
    merged_dataset = concatenate_datasets(datasets)

    if cfg.shuffle_merged_datasets:
        LOG.debug("Shuffling merged datasets...")
        if cfg.curriculum_sampling:
            LOG.warning(
                "Shuffling merged datasets with curriculum sampling is not recommended. "
                "This will randomize the order of samples."
            )
        merged_dataset = merged_dataset.shuffle(seed=cfg.seed)
    else:
        LOG.debug("Not shuffling merged datasets.")

    return merged_dataset


================================================
FILE: src/axolotl/utils/data/streaming.py
================================================
"""Data handling specific to streaming datasets."""

import functools
from collections import defaultdict
from typing import Callable, Dict, List, Optional

import torch
from datasets import Dataset
from torch.utils.data import RandomSampler
from transformers import PreTrainedTokenizerBase

from axolotl.utils.collators import PretrainingBatchSamplerDataCollatorForSeq2Seq
from axolotl.utils.logging import get_logger
from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths
from axolotl.utils.trainer import process_pretraining_datasets_for_packing

LOG = get_logger(__name__)


def encode_streaming(
    examples: Dict[str, List],
    tokenizer: PreTrainedTokenizerBase,
    max_tokens: int,
    text_column: str = "text",
    concatenate: bool = True,
) -> Dict[str, List]:
    res = tokenizer(
        examples[text_column],
        truncation=True,
        max_length=max_tokens - 2,
        add_special_tokens=True,
    )
    # Convert to PyTorch tensors
    input_ids = [torch.tensor(seq) for seq in res["input_ids"]]
    targets = [torch.tensor(seq) for seq in res["input_ids"]]
    attention_mask = [torch.tensor(seq) for seq in res["attention_mask"]]
    if not concatenate:
        return {
            "input_ids": [seq.tolist() for seq in input_ids],
            "labels": [seq.tolist() for seq in targets],
            "attention_mask": [seq.tolist() for seq in attention_mask],
        }

    new_input_ids = []
    new_labels = []
    new_attention_mask = []
    # Append EOS and PAD tokens to input_ids, and correct attention_mask
    for i, _ in enumerate(input_ids):
        input_ids[i] = torch.cat(
            (
                input_ids[i],
                torch.tensor([tokenizer.eos_token_id, tokenizer.pad_token_id]),
            ),
            dim=0,
        )
        targets[i] = torch.cat(
            (
                targets[i],
                torch.tensor([tokenizer.eos_token_id, -100]),
            ),
            dim=0,
        )
        attention_mask[i] = torch.cat((attention_mask[i], torch.tensor([1, 0])), dim=0)

    # Concatenate tokens so that their lengths are less than max_tokens
    buffer_input_ids = torch.tensor([], dtype=torch.long)
    buffer_labels = torch.tensor([], dtype=torch.long)
    buffer_attention_mask = torch.tensor([], dtype=torch.long)

    for ids, labels, mask in zip(input_ids, targets, attention_mask, strict=False):
        if buffer_input_ids.numel() == max_tokens:
            new_input_ids.append(buffer_input_ids)
            new_labels.append(buffer_labels)
            new_attention_mask.append(buffer_attention_mask)
            buffer_input_ids = torch.tensor([], dtype=torch.long)
            buffer_labels = torch.tensor([], dtype=torch.long)
            buffer_attention_mask = torch.tensor([], dtype=torch.long)
            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
            buffer_labels = torch.cat((buffer_labels, labels), dim=0)
            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
        elif buffer_input_ids.numel() + ids.numel() <= max_tokens:
            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
            buffer_labels = torch.cat((buffer_labels, labels), dim=0)
            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)
        else:
            buffer_input_ids = torch.cat(
                (
                    buffer_input_ids,
                    torch.full(
                        (max_tokens - buffer_input_ids.numel(),),
                        tokenizer.pad_token_id,
                        dtype=torch.long,
                    ),
                ),
                dim=0,
            )
            buffer_labels = torch.cat(
                (
                    buffer_labels,
                    torch.full(
                        (max_tokens - buffer_labels.numel(),),
                        -100,
                        dtype=torch.long,
                    ),
                ),
                dim=0,
            )
            buffer_attention_mask = torch.cat(
                (
                    buffer_attention_mask,
                    torch.full(
                        (max_tokens - buffer_attention_mask.numel(),),
                        0,
                        dtype=torch.long,
                    ),
                ),
                dim=0,
            )
            new_input_ids.append(buffer_input_ids)
            new_labels.append(buffer_labels)
            new_attention_mask.append(buffer_attention_mask)
            buffer_input_ids = torch.tensor([], dtype=torch.long)
            buffer_labels = torch.tensor([], dtype=torch.long)
            buffer_attention_mask = torch.tensor([], dtype=torch.long)

            buffer_input_ids = torch.cat((buffer_input_ids, ids), dim=0)
            buffer_labels = torch.cat((buffer_labels, labels), dim=0)
            buffer_attention_mask = torch.cat((buffer_attention_mask, mask), dim=0)

    if buffer_input_ids.numel() > 0:  # for any leftover tokens
        while buffer_input_ids.numel() < max_tokens:  # make all sequences equal in size
            buffer_input_ids = torch.cat(
                (
                    buffer_input_ids,
                    torch.full(
                        (max_tokens - buffer_input_ids.numel(),),
                        tokenizer.pad_token_id,
                        dtype=torch.long,
                    ),
                ),
                dim=0,
            )
            buffer_labels = torch.cat(
                (
                    buffer_labels,
                    torch.full(
                        (max_tokens - buffer_labels.numel(),),
                        -100,
                        dtype=torch.long,
                    ),
                ),
                dim=0,
            )
            buffer_attention_mask = torch.cat(
                (
                    buffer_attention_mask,
                    torch.full(
                        (max_tokens - buffer_attention_mask.numel(),),
                        0,
                        dtype=torch.long,
                    ),
                ),
                dim=0,
            )
        new_input_ids.append(buffer_input_ids)
        new_labels.append(buffer_labels)
        new_attention_mask.append(buffer_attention_mask)

    ret = {
        "input_ids": [seq.tolist() for seq in new_input_ids],
        "labels": [seq.tolist() for seq in new_labels],
        "attention_mask": [seq.tolist() for seq in new_attention_mask],
    }

    LOG.debug(len(ret["input_ids"]))
    return ret


def wrap_streaming_dataset(
    dataset,
    tokenizer,
    cfg,
    ds_wrapper_fn,
):
    if cfg.sample_packing:
        # For SFT (non-pretraining) datasets, always use multipack_attn=True to ensure
        # attention isolation between packed sequences
        multipack_attn = (
            True if not cfg.pretraining_dataset else cfg.pretrain_multipack_attn
        )

        collate_fn = PretrainingBatchSamplerDataCollatorForSeq2Seq(
            tokenizer,
            return_tensors="pt",
            padding=True,
            pad_to_multiple_of=cfg.sequence_len,
            multipack_attn=multipack_attn,
        )
        encode = functools.partial(
            encode_packed_streaming,
            collate_fn,
            ds_wrapper_fn,
            max_seq_length=cfg.sequence_len,
            batch_size=cfg.micro_batch_size,
            multipack_attn=multipack_attn,
            bin_size=cfg.sample_packing_bin_size,
        )

        # Set this to 1 so downstream data_loader doesn't try to increase the batch size
        # again
        cfg.micro_batch_size = 1
    else:
        # NOTE: This is not reachable for SFT datasets since we use the pre-existing
        # loading function for non-packed streaming datasets. Refer to
        # _prepare_streaming_datasets in sft.py for that code path.
        text_column = (
            getattr(cfg.pretraining_dataset[0], "text_column", "text") or "text"
        )
        encode = functools.partial(
            encode_streaming,
            tokenizer=tokenizer,
            max_tokens=cfg.sequence_len,
            text_column=text_column,
            concatenate=cfg.pretraining_sample_concatenation is True,
        )

    if cfg.shuffle_merged_datasets:
        dataset = dataset.shuffle(
            seed=cfg.seed, buffer_size=cfg.streaming_multipack_buffer_size
        )
    else:
        LOG.debug("NOT shuffling merged pretraining datasets")

    # remove all the existing columns after mapping since they end up having
    # a different length than the encoded/tokenized column
    # this is empty during streaming/pretraining
    remove_columns = []
    if dataset.features is None:
        for first_row in dataset:
            remove_columns = list(first_row.keys())
            break
    else:
        remove_columns = list(dataset.features.keys())

    dataset = dataset.map(
        encode,
        batched=True,
        batch_size=cfg.streaming_multipack_buffer_size,
        remove_columns=remove_columns,
    )
    return dataset


def encode_packed_streaming(
    collate_fn,
    ds_wrapper: Callable,
    examples: Dict[str, List],
    bin_size: int,
    max_seq_length: int = 2048,
    batch_size: int = 4,
    multipack_attn: Optional[bool] = True,
) -> Dict[str, List]:
    # tokenize all the examples
    # rows get split with stride (overlap)
    train_dataset = ds_wrapper(dataset=Dataset.from_dict(examples))[0]

    train_dataset = process_pretraining_datasets_for_packing(
        train_dataset,
        max_seq_length,
        skip_position_ids=not multipack_attn,
        # FIXME using attention mask unpad/pad with trainer and packed pretraining is broken atm
        # workaround by using the position id logic for now in trainer
        drop_attention_mask=multipack_attn,
    )

    sampler = MultipackBatchSampler(
        sampler=RandomSampler(train_dataset),
        lengths=get_dataset_lengths(train_dataset),
        batch_size=1,
        batch_max_len=batch_size * max_seq_length,
        drop_last=True,
        num_processes=1,
        bin_size=bin_size,
    )

    chunked_data = defaultdict(list)

    for batch in sampler:
        for data in batch:
            features = train_dataset[data]
            if "num_truncated_tokens" in features:
                del features["num_truncated_tokens"]
            if "overflow_to_sample_mapping" in features:
                del features["overflow_to_sample_mapping"]
            if "labels" not in features:
                features["labels"] = features["input_ids"].copy()
            collated_features = collate_fn(features)

            for feature in features.keys():
                if feature == "length":
                    continue
                chunked_data[feature].append(collated_features[feature].squeeze(0))

    return chunked_data


================================================
FILE: src/axolotl/utils/data/utils.py
================================================
"""Data handling helpers"""

import contextlib
import functools
import hashlib
import time
from enum import Enum
from typing import Callable

import huggingface_hub
import numpy as np
import requests
from datasets import Dataset, IterableDataset

from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger
from axolotl.utils.samplers.utils import get_dataset_lengths
from axolotl.utils.trainer import filter_sequences_by_length

LOG = get_logger(__name__)


class RetryStrategy(Enum):
    """Enum for retry strategies."""

    CONSTANT = 1
    LINEAR = 2
    EXPONENTIAL = 3


def retry_on_request_exceptions(
    max_retries=3, delay=1, retry_strategy: RetryStrategy = RetryStrategy.LINEAR
) -> Callable:
    """Decorator that retries function calls on specific request exceptions.

    Args:
        max_retries: Maximum number of retry attempts.
        delay: Base delay between retries in seconds.
        retry_strategy: Strategy for calculating retry delays.

    Returns:
        Decorated function with retry logic.
    """

    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except (
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.HTTPError,
                    huggingface_hub.errors.HfHubHTTPError,
                ) as exc:
                    if attempt < max_retries - 1:
                        if retry_strategy == RetryStrategy.EXPONENTIAL:
                            step_delay = delay * 2**attempt
                        elif retry_strategy == RetryStrategy.LINEAR:
                            step_delay = delay * (attempt + 1)
                        else:
                            step_delay = delay  # Use constant delay.
                        time.sleep(step_delay)
                    else:
                        raise exc

        return wrapper

    return decorator


def md5(to_hash: str, encoding: str = "utf-8") -> str:
    """Generate MD5 hash of a string."""
    try:
        return hashlib.md5(to_hash.encode(encoding), usedforsecurity=False).hexdigest()
    except TypeError:
        return hashlib.md5(to_hash.encode(encoding)).hexdigest()  # nosec


def sha256(to_hash: str, encoding: str = "utf-8") -> str:
    """Generate SHA256 hash of a string."""
    return hashlib.sha256(to_hash.encode(encoding)).hexdigest()


def _deduplicate_dataset(
    dataset: Dataset,
    seen_hashes: set[str] | None = None,
) -> tuple[Dataset, set[str]]:
    """Remove duplicate rows from a dataset using SHA256 hashes.

    Args:
        dataset: Dataset to deduplicate.
        seen_hashes: Set of previously seen row hashes (for cross-deduplication).

    Returns:
        Tuple of deduplicated dataset and the set of seen hashes.
    """
    if seen_hashes is None:
        seen_hashes = set()

    unique_indices = []
    for idx, row in enumerate(dataset):
        row_hash = sha256(str(row))  # Using SHA256 for collision resistance
        if row_hash not in seen_hashes:
            seen_hashes.add(row_hash)
            unique_indices.append(idx)

    return dataset.select(unique_indices), seen_hashes


def deduplicate_and_log_datasets(
    dataset: Dataset,
    other_dataset: Dataset | None = None,
    dataset_name: str | None = "train",
    other_name: str | None = "eval",
) -> tuple[Dataset, Dataset | None]:
    """Deduplicate datasets, with optional cross-dataset deduplication.

    Args:
        dataset: Primary dataset to deduplicate.
        other_dataset: Optional second dataset to deduplicate against the first.
        dataset_name: Name for the primary dataset (for logging).
        other_name: Name for the second dataset (for logging).

    Returns:
        Tuple of (deduplicated_dataset, deduplicated_other_dataset).
    """
    # Deduplicate primary dataset
    LOG.info(
        f"Starting deduplication for {dataset_name} dataset. Original size: {len(dataset)}"
    )
    dataset, seen_rows = _deduplicate_dataset(dataset)
    LOG.info(
        f"Deduplication complete for {dataset_name} dataset. New size: {len(dataset)}"
    )

    # Deduplicate second dataset if provided
    if other_dataset is not None:
        LOG.info(
            f"Starting deduplication for {other_name} dataset. Original size: {len(other_dataset)}"
        )
        other_dataset, _ = _deduplicate_dataset(other_dataset, seen_rows)
        LOG.info(
            f"Deduplication complete for {other_name} dataset. New size: {len(other_dataset)}"
        )

    return dataset, other_dataset


def keep_min_len(sample, min_sequence_len=2):
    """
    Batched filter function that keeps only samples with sequence length >= min_sequence_len.
    Returns a list of booleans indicating which samples to keep.
    """
    min_sequence_len = min_sequence_len or 2

    input_ids = sample["input_ids"]

    # Batched (input_ids is a list of lists)
    results = []
    for seq in input_ids:
        results.append(len(seq) >= min_sequence_len)
    return results


def truncate_long_seq(sample, sequence_len=2048):
    """
    Truncate samples whose sequence length is too long (> sequence_len).
    Modifies the sample in-place and returns the modified sample.
    """
    input_ids = sample["input_ids"]

    # Batched (input_ids is a list of lists)
    for i, seq in enumerate(input_ids):
        length = len(seq)
        if length > sequence_len:
            sample["input_ids"][i] = seq[:sequence_len]
            if "attention_mask" in sample:
                sample["attention_mask"][i] = sample["attention_mask"][i][:sequence_len]
            if "labels" in sample:
                sample["labels"][i] = sample["labels"][i][:sequence_len]
            if "position_ids" in sample:
                sample["position_ids"][i] = sample["position_ids"][i][:sequence_len]
    return sample


def _should_skip_processing(dataset: Dataset) -> bool:
    """Check if dataset should skip long sequence handling."""
    if (
        hasattr(dataset, "column_names")
        and dataset.column_names
        and "input_ids" not in dataset.column_names
    ):
        LOG.warning(
            "Dataset does not contain 'input_ids' column. Skip drop long seq. This is "
            "expected for reward modeling."
        )
        return True
    elif not hasattr(dataset, "column_names") or dataset.column_names is None:
        LOG.info(
            "Dataset is streaming (IterableDataset), skipping long sequence handling"
        )
        return True
    return False


def _log_dataset_stats(dataset: Dataset) -> None:
    """Log min/max sequence lengths for debugging."""
    with contextlib.suppress(AttributeError, ValueError):
        ds_lengths = get_dataset_lengths(dataset, from_arrow=True)
        LOG.info(f"min_input_len: {np.min(ds_lengths)}")
        LOG.info(f"max_input_len: {np.max(ds_lengths)}")


def _build_filter_kwargs(dataset: Dataset, cfg: DictDefault) -> dict:
    """Build kwargs for dataset filter/map operations."""
    kwargs = {}
    if not isinstance(dataset, IterableDataset):
        kwargs["num_proc"] = cfg.dataset_num_proc
        kwargs["load_from_cache_file"] = not cfg.is_preprocess
    return kwargs


def _filter_short_sequences(
    dataset: Dataset, min_len: int, filter_kwargs: dict
) -> tuple[Dataset, int]:
    """Filter out sequences shorter than min_len. Returns (dataset, num_dropped)."""
    prior_len = len(dataset) if hasattr(dataset, "__len__") else None

    desc_kwargs = {}
    if filter_kwargs:
        desc_kwargs["desc"] = f"Filtering Short Sequences (<{min_len})"

    dataset = dataset.filter(
        functools.partial(keep_min_len, min_sequence_len=min_len),
        batched=True,
        **filter_kwargs,
        **desc_kwargs,
    )

    dropped = 0
    if prior_len:
        dropped = prior_len - len(dataset)
        if dropped > 0:
            LOG.info(f"Dropped {dropped} short sequences (<{min_len} tokens)")

    return dataset, dropped


def _truncate_long_sequences(
    dataset: Dataset, max_len: int, map_kwargs: dict
) -> Dataset:
    """Truncate sequences longer than max_len."""
    desc_kwargs = {}
    if map_kwargs:
        desc_kwargs["desc"] = f"Truncating Sequences (target_len={max_len})"

    dataset = dataset.map(
        functools.partial(truncate_long_seq, sequence_len=max_len),
        batched=True,
        **map_kwargs,
        **desc_kwargs,
    )
    LOG.info(f"Truncated long sequences to max length {max_len}")
    return dataset


def _drop_outside_range(
    dataset: Dataset,
    max_len: int,
    min_len: int,
    raise_on_long: bool,
    filter_kwargs: dict,
) -> tuple[Dataset, int]:
    """Drop sequences outside valid length range [min_len, max_len].

    Returns (dataset, num_dropped)."""
    prior_len = len(dataset) if hasattr(dataset, "__len__") else None

    desc_kwargs = {}
    if filter_kwargs:
        action = (
            "Checking Sequence Lengths"
            if raise_on_long
            else "Dropping Invalid Sequences"
        )
        desc_kwargs["desc"] = f"{action} (<{min_len} or >{max_len})"

    dataset = dataset.filter(
        functools.partial(
            filter_sequences_by_length,
            sequence_len=max_len,
            min_sequence_len=min_len,
            raise_on_drop=raise_on_long,
        ),
        batched=True,
        **filter_kwargs,
        **desc_kwargs,
    )

    dropped = 0
    if not raise_on_long and prior_len:
        dropped = prior_len - len(dataset)
        if dropped > 0:
            LOG.info(
                f"Dropped {dropped} sequences outside valid range "
                f"([{min_len}, {max_len}])"
            )

    return dataset, dropped


def handle_long_seq_in_dataset(
    dataset: Dataset, sequence_len: int, cfg: DictDefault
) -> Dataset:
    """Remove sequences longer than configured maximum from dataset.

    Args:
        dataset: Dataset to filter.
        sequence_len: Maximum length for sequences to keep
        cfg: Dictionary mapping `axolotl` config keys to values.

    Returns:
        Filtered dataset with long sequences handled according to the excess_length_strategy value:
            'drop' (default)    excludes any sequence longer than sequence_len
            'truncate'          truncates them down to sequence_len
            'raise'             raises a ValueError if any sequence was found that was longer than sequence_len
    """
    # Early returns for special cases
    if _should_skip_processing(dataset):
        return dataset

    excess_length_strategy = (cfg.excess_length_strategy or "drop").lower()

    _log_dataset_stats(dataset)

    # Setup kwargs
    filter_kwargs = _build_filter_kwargs(dataset, cfg)

    # Handle sequences based on strategy
    if excess_length_strategy == "truncate":
        dataset, _ = _filter_short_sequences(dataset, cfg.min_sample_len, filter_kwargs)
        dataset = _truncate_long_sequences(dataset, sequence_len, filter_kwargs)
    else:
        raise_on_long = excess_length_strategy == "raise"
        dataset, _ = _drop_outside_range(
            dataset, sequence_len, cfg.min_sample_len, raise_on_long, filter_kwargs
        )

    return dataset


================================================
FILE: src/axolotl/utils/data/wrappers.py
================================================
"""Data handling specific to SFT."""

import logging
from typing import Any, NoReturn, cast

from datasets import (
    Dataset,
    IterableDataset,
    Sequence,
    Value,
)
from transformers import PreTrainedTokenizer
from transformers.processing_utils import ProcessorMixin

from axolotl.datasets import TokenizedPromptDataset, wrap_dataset_for_tokenized_prompt
from axolotl.prompt_strategies import load
from axolotl.prompt_strategies.bradley_terry import load as bradley_terry_load
from axolotl.prompt_tokenizers import (
    AlpacaMultipleChoicePromptTokenizingStrategy,
    AlpacaPromptTokenizingStrategy,
    AlpacaReflectionPTStrategy,
    DatasetWrappingStrategy,
    GPTeacherPromptTokenizingStrategy,
    JeopardyPromptTokenizingStrategy,
    OpenAssistantPromptTokenizingStrategy,
    PromptTokenizingStrategy,
    SummarizeTLDRPromptTokenizingStrategy,
)
from axolotl.prompters import (
    AlpacaPrompter,
    GPTeacherPrompter,
    JeopardyPrompter,
    MultipleChoiceConcisePrompter,
    MultipleChoiceExplainPrompter,
    Prompter,
    ReflectAlpacaPrompter,
    SummarizeTLDRPrompter,
    UnsupportedPrompter,
)
from axolotl.utils.dict import DictDefault

LOG = logging.getLogger(__name__)


def handle_unknown_dataset_strategy(dataset_config: DictDefault) -> NoReturn:
    """Raise error for unknown dataset strategy."""
    ds_type = dataset_config.type
    suffix = ""
    if ":load_" in ds_type:
        suffix = f"Did you mean {ds_type.replace(':load_', '.load_')}?"

    error_message = f"unhandled prompt tokenization strategy: {ds_type}. {suffix}"
    LOG.error(error_message)
    raise ValueError(error_message)


def get_dataset_wrapper(
    dataset_config: DictDefault,
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    dataset_base_type: str | None,
    dataset: Dataset | IterableDataset,
    dataset_prompt_style: str | None = None,
    processor: ProcessorMixin | None = None,
) -> tuple[Dataset | IterableDataset, Prompter | None]:
    """Create an appropriate dataset wrapper and prompter based on dataset
    configuration.

    Args:
        dataset_config: Configuration for the dataset.
        tokenizer: Tokenizer to use for processing text.
        cfg: Global configuration object.
        dataset_base_type: The base type of the dataset.
        dataset: The actual dataset object.
        dataset_prompt_style: Optional prompt style specification.
        processor: Optional processor for multimodal datasets.

    Returns:
        tuple of (dataset_wrapper, dataset_prompter).
    """
    # Common parameters for dataset wrapping
    dataset_kwargs: dict[str, Any] = {
        "process_count": cfg.dataset_num_proc,
        "keep_in_memory": cfg.dataset_keep_in_memory is True,
    }

    LOG.info(
        f"Loading dataset: {dataset_config['path']} with base_type: "
        f"{dataset_base_type} and prompt_style: {dataset_prompt_style}"
    )

    # Dataset is already tokenized
    if _is_dataset_already_tokenized(dataset):
        return dataset, UnsupportedPrompter()

    # Custom dataset type definition
    if isinstance(dataset_config.type, DictDefault):
        return _handle_custom_dataset_type(
            dataset_config, tokenizer, cfg, dataset, dataset_kwargs
        )

    # Skip preparation if configured
    if cfg.skip_prepare_dataset:
        return dataset, None

    # Bradley-Terry dataset
    if dataset_config.type.startswith("bradley_terry"):
        return _handle_bradley_terry_dataset(
            dataset_config, tokenizer, cfg, dataset, dataset_kwargs
        )

    # Stepwise supervised dataset
    if dataset_config.type.startswith("stepwise_supervised"):
        return _handle_stepwise_supervised_dataset(
            dataset_config, tokenizer, cfg, dataset, dataset_kwargs
        )

    # Try to load prompt tokenizer / dataset wrapper strategy from registry
    dataset_strategy = load(
        dataset_config.type, tokenizer, cfg, dataset_config, processor=processor
    )
    if dataset_strategy:
        return _handle_loaded_strategy(dataset_strategy, dataset, dataset_kwargs)

    # Known dataset types with specific handling
    if dataset_base_type in DATASET_HANDLERS:
        handler = DATASET_HANDLERS[dataset_base_type]
        return handler(dataset_prompt_style, tokenizer, cfg, dataset, dataset_kwargs)

    # Unhandled dataset type
    handle_unknown_dataset_strategy(dataset_config)


def _is_dataset_already_tokenized(dataset: Dataset | IterableDataset) -> bool:
    """Check if the dataset is already tokenized."""
    return (
        isinstance(dataset, Dataset)
        and "input_ids" in dataset.features
        and "attention_mask" in dataset.features
        and "labels" in dataset.features
    )


def _handle_custom_dataset_type(
    dataset_config: DictDefault,
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    dataset: Dataset | IterableDataset,
    dataset_kwargs: dict[str, Any],
) -> tuple[Dataset | IterableDataset, Prompter]:
    """Handle a custom dataset type defined in the configuration."""
    dataset_strategy = cast(
        PromptTokenizingStrategy,
        load("user_defined", tokenizer, cfg, dataset_config.type.to_dict()),
    )
    dataset_prompter = UnsupportedPrompter()
    dataset_wrapper = wrap_dataset_for_tokenized_prompt(
        dataset_strategy,
        dataset,
        **dataset_kwargs,
    )
    return dataset_wrapper, dataset_prompter


def _handle_bradley_terry_dataset(
    dataset_config: DictDefault,
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    dataset: Dataset | IterableDataset,
    dataset_kwargs: dict[str, Any],
) -> tuple[Dataset | IterableDataset, Prompter | None]:
    """Handle a Bradley-Terry dataset."""
    bt_type = dataset_config.type.split(".", 1)[1]
    dataset_strategy = bradley_terry_load(bt_type, tokenizer, cfg, dataset_config)

    if not dataset_strategy:
        handle_unknown_dataset_strategy(dataset_config)

    dataset_prompter = UnsupportedPrompter()
    dataset_wrapper = wrap_dataset_for_tokenized_prompt(
        dataset_strategy,
        dataset,
        **dataset_kwargs,
    )

    return dataset_wrapper, dataset_prompter


def _handle_stepwise_supervised_dataset(
    dataset_config: DictDefault,
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    dataset: Dataset | IterableDataset,
    dataset_kwargs: dict[str, Any],
) -> tuple[Dataset | IterableDataset, Prompter]:
    """Handle a stepwise supervised dataset."""
    dataset_prompter = UnsupportedPrompter()
    dataset_strategy = load(dataset_config.type, tokenizer, cfg, dataset_config)

    # We need to explicitly cast boolean labels to int
    # for compatibility with how trl's PRMTrainer works
    if isinstance(dataset, Dataset):
        dataset = dataset.cast_column("labels", Sequence(Value("int64")))

    dataset_wrapper = TokenizedPromptDataset(
        dataset_strategy,
        dataset,
        **dataset_kwargs,
    )
    return dataset_wrapper, dataset_prompter


def _handle_loaded_strategy(
    dataset_strategy: PromptTokenizingStrategy | DatasetWrappingStrategy,
    dataset: Dataset | IterableDataset,
    dataset_kwargs: dict[str, Any],
) -> tuple[Dataset | IterableDataset, Prompter | None]:
    """Handle a dataset with a strategy loaded from the registry."""
    if isinstance(dataset_strategy, DatasetWrappingStrategy):
        return dataset_strategy.wrap_dataset(dataset, **dataset_kwargs), None

    dataset_prompter = UnsupportedPrompter()
    dataset_wrapper = wrap_dataset_for_tokenized_prompt(
        dataset_strategy,
        dataset,
        **dataset_kwargs,
    )
    return dataset_wrapper, dataset_prompter


def _handle_alpaca_dataset(
    dataset_prompt_style: str | None,
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    dataset: Dataset | IterableDataset,
    dataset_kwargs: dict[str, Any],
) -> tuple[Dataset | IterableDataset, Prompter]:
    """Handle an Alpaca dataset."""
    dataset_prompter = AlpacaPrompter(dataset_prompt_style)
    dataset_strategy = AlpacaPromptTokenizingStrategy(
        dataset_prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
    dataset_wrapper = wrap_dataset_for_tokenized_prompt(
        dataset_strategy,
        dataset,
        **dataset_kwargs,
    )
    return dataset_wrapper, dataset_prompter


def _handle_explainchoice_dataset(
    dataset_prompt_style: str | None,
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    dataset: Dataset | IterableDataset,
    dataset_kwargs: dict[str, Any],
) -> tuple[Dataset | IterableDataset, Prompter]:
    """Handle an ExplainChoice dataset."""
    dataset_prompter = MultipleChoiceExplainPrompter(dataset_prompt_style)
    dataset_strategy = AlpacaMultipleChoicePromptTokenizingStrategy(
        dataset_prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
    dataset_wrapper = wrap_dataset_for_tokenized_prompt(
        dataset_strategy,
        dataset,
        **dataset_kwargs,
    )
    return dataset_wrapper, dataset_prompter


def _handle_concisechoice_dataset(
    dataset_prompt_style: str | None,
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    dataset: Dataset | IterableDataset,
    dataset_kwargs: dict[str, Any],
) -> tuple[Dataset | IterableDataset, Prompter]:
    """Handle a ConciseChoice dataset."""
    dataset_prompter = MultipleChoiceConcisePrompter(dataset_prompt_style)
    dataset_strategy = AlpacaMultipleChoicePromptTokenizingStrategy(
        dataset_prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
    dataset_wrapper = wrap_dataset_for_tokenized_prompt(
        dataset_strategy,
        dataset,
        **dataset_kwargs,
    )
    return dataset_wrapper, dataset_prompter


def _handle_summarizetldr_dataset(
    dataset_prompt_style: str | None,
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    dataset: Dataset | IterableDataset,
    dataset_kwargs: dict[str, Any],
) -> tuple[Dataset | IterableDataset, Prompter]:
    """Handle a SummarizeTLDR dataset."""
    dataset_prompter = SummarizeTLDRPrompter(dataset_prompt_style)
    dataset_strategy = SummarizeTLDRPromptTokenizingStrategy(
        dataset_prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
    dataset_wrapper = wrap_dataset_for_tokenized_prompt(
        dataset_strategy,
        dataset,
        **dataset_kwargs,
    )
    return dataset_wrapper, dataset_prompter


def _handle_jeopardy_dataset(
    dataset_prompt_style: str | None,
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    dataset: Dataset | IterableDataset,
    dataset_kwargs: dict[str, Any],
) -> tuple[Dataset | IterableDataset, Prompter]:
    """Handle a Jeopardy dataset."""
    dataset_prompter = JeopardyPrompter(dataset_prompt_style)
    dataset_strategy = JeopardyPromptTokenizingStrategy(
        dataset_prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
    dataset_wrapper = wrap_dataset_for_tokenized_prompt(
        dataset_strategy,
        dataset,
        **dataset_kwargs,
    )
    return dataset_wrapper, dataset_prompter


def _handle_oasst_dataset(
    dataset_prompt_style: str | None,
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    dataset: Dataset | IterableDataset,
    dataset_kwargs: dict[str, Any],
) -> tuple[Dataset | IterableDataset, Prompter]:
    """Handle an OpenAssistant dataset."""
    dataset_prompter = AlpacaPrompter(dataset_prompt_style)
    dataset_strategy = OpenAssistantPromptTokenizingStrategy(
        dataset_prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
    dataset_wrapper = wrap_dataset_for_tokenized_prompt(
        dataset_strategy,
        dataset,
        **dataset_kwargs,
    )
    return dataset_wrapper, dataset_prompter


def _handle_gpteacher_dataset(
    dataset_prompt_style: str | None,
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    dataset: Dataset | IterableDataset,
    dataset_kwargs: dict[str, Any],
) -> tuple[Dataset | IterableDataset, Prompter]:
    """Handle a GPTeacher dataset."""
    dataset_prompter = GPTeacherPrompter(dataset_prompt_style)
    dataset_strategy = GPTeacherPromptTokenizingStrategy(
        dataset_prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
    dataset_wrapper = wrap_dataset_for_tokenized_prompt(
        dataset_strategy,
        dataset,
        **dataset_kwargs,
    )
    return dataset_wrapper, dataset_prompter


def _handle_reflection_dataset(
    dataset_prompt_style: str | None,
    tokenizer: PreTrainedTokenizer,
    cfg: DictDefault,
    dataset: Dataset | IterableDataset,
    dataset_kwargs: dict[str, Any],
) -> tuple[Dataset | IterableDataset, Prompter]:
    """Handle a Reflection dataset."""
    dataset_prompter = ReflectAlpacaPrompter(dataset_prompt_style)
    dataset_strategy = AlpacaReflectionPTStrategy(
        dataset_prompter,
        tokenizer,
        cfg.train_on_inputs,
        cfg.sequence_len,
    )
    dataset_wrapper = wrap_dataset_for_tokenized_prompt(
        dataset_strategy,
        dataset,
        **dataset_kwargs,
    )
    return dataset_wrapper, dataset_prompter


DATASET_HANDLERS = {
    "alpaca": _handle_alpaca_dataset,
    "explainchoice": _handle_explainchoice_dataset,
    "concisechoice": _handle_concisechoice_dataset,
    "summarizetldr": _handle_summarizetldr_dataset,
    "jeopardy": _handle_jeopardy_dataset,
    "oasst": _handle_oasst_dataset,
    "gpteacher": _handle_gpteacher_dataset,
    "reflection": _handle_reflection_dataset,
}


================================================
FILE: src/axolotl/utils/datasets.py
================================================
"""helper functions for datasets"""

import os

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def get_default_process_count():
    if axolotl_dataset_num_proc := os.environ.get("AXOLOTL_DATASET_NUM_PROC"):
        return int(axolotl_dataset_num_proc)
    if axolotl_dataset_processes := os.environ.get("AXOLOTL_DATASET_PROCESSES"):
        LOG.warning(
            "AXOLOTL_DATASET_PROCESSES and `dataset_processes` are deprecated and will be "
            "removed in a future version. Please use `dataset_num_proc` instead."
        )
        return int(axolotl_dataset_processes)
    if runpod_cpu_count := os.environ.get("RUNPOD_CPU_COUNT"):
        return int(runpod_cpu_count)
    return os.cpu_count()


================================================
FILE: src/axolotl/utils/dict.py
================================================
"""Module containing the DictDefault class"""

from addict import Dict


class DictDefault(Dict):
    """
    A Dict that returns None instead of returning empty Dict for missing keys.
    """

    def __missing__(self, key):
        return None

    def __or__(self, other):
        return DictDefault(super().__ror__(other))

    def __setitem__(self, name, value):
        # workaround for pickle/unpickle issues and __frozen not being available
        try:
            isFrozen = hasattr(self, "__frozen") and object.__getattribute__(
                self, "__frozen"
            )
        except AttributeError:
            isFrozen = False

        if isFrozen and name not in super().keys():
            raise KeyError(name)
        super(Dict, self).__setitem__(name, value)
        try:
            p = object.__getattribute__(self, "__parent")
            key = object.__getattribute__(self, "__key")
        except AttributeError:
            p = None
            key = None
        if p is not None:
            p[key] = self
            object.__delattr__(self, "__parent")
            object.__delattr__(self, "__key")


def remove_none_values(obj):
    """
    Remove null from a dictionary-like obj or list.
    These can appear due to Dataset loading causing schema merge.
    See https://github.com/axolotl-ai-cloud/axolotl/pull/2909
    """
    if hasattr(obj, "items"):
        return {k: remove_none_values(v) for k, v in obj.items() if v is not None}
    if isinstance(obj, list):
        return [remove_none_values(elem) for elem in obj]
    return obj


================================================
FILE: src/axolotl/utils/distributed.py
================================================
"""Utilities for distributed functionality."""

import os
import pickle  # nosec
from contextlib import contextmanager
from datetime import timedelta

import torch
import torch.distributed as dist
from accelerate import PartialState
from accelerate.utils import ParallelismConfig
from transformers.utils.import_utils import (
    is_torch_cuda_available,
    is_torch_mps_available,
    is_torch_npu_available,
)

distributed_state = None


def get_device_type() -> torch.device:
    device = torch.device("cpu")
    if is_torch_cuda_available():
        device = torch.device("cuda")
    elif is_torch_mps_available():
        device = torch.device("mps")
    elif is_torch_npu_available():
        device = torch.device("npu")
    return device


def get_device_count() -> int:
    cur_device = get_device_type()
    if "cuda" in str(cur_device):
        return torch.cuda.device_count()
    if "npu" in str(cur_device):
        return torch.npu.device_count()
    return 1


def get_current_device() -> int:
    cur_device = get_device_type()
    if "cuda" in str(cur_device):
        return torch.cuda.current_device()
    if "npu" in str(cur_device):
        return torch.npu.current_device()
    return 0


def init_distributed_state():
    global distributed_state
    if distributed_state is None:
        timeout = int(os.environ.get("AXOLOTL_NCCL_TIMEOUT", 1800))
        try:
            distributed_state = PartialState(timeout=timedelta(seconds=timeout))
        except ValueError:
            pass


def get_distributed_state() -> PartialState | None:
    return distributed_state


def is_distributed() -> bool:
    """Check if distributed training is initialized."""
    init_distributed_state()

    if distributed_state is None:
        return False

    return distributed_state.use_distributed and distributed_state.initialized


def barrier():
    """
    Acts as a barrier to wait for all processes. This ensures that all processes
    reach the barrier before proceeding further.
    """
    if is_distributed():
        dist.barrier()


def is_main_process() -> bool:
    """
    Check if the current process is the main process. If not in distributed mode,
    always return `True`.

    We use a simpler logic when the distributed state is not initialized: we just log
    on the 0-th local rank.

    Returns:
        `True` if the current process is the main process, `False` otherwise.
    """
    if get_distributed_state() is None:
        return os.environ.get("LOCAL_RANK", "0") == "0"
    if not is_distributed():
        return True
    return dist.get_rank() == 0


def is_local_main_process() -> bool:
    if get_distributed_state() is None:
        return os.environ.get("LOCAL_RANK", "0") == "0"
    return PartialState().is_local_main_process


def get_world_size() -> int:
    return int(os.getenv("WORLD_SIZE", "1"))


def cleanup_distributed():
    """
    Destroy process group if torch distributed is initialized. Called in training early
    termination or when training successfully completes.
    """
    # Ensure that all operations are completed before destroying the process group
    if torch.cuda.is_available():
        torch.cuda.synchronize()

    if torch.xpu.is_available():
        torch.xpu.synchronize()

    # Destroy the process group
    if torch.distributed.is_initialized():
        torch.distributed.destroy_process_group()


@contextmanager
def zero_first(is_main: bool):
    """
    runs the wrapped context so that rank 0 runs first before other ranks
    """
    if not is_main:  # other ranks wait first
        barrier()
    yield
    if is_main:  # then rank 0 waits after it has run the context
        barrier()


def gather_scalar_from_all_ranks(fn, world_size=1):
    """
    Run a callable 'fn' on all ranks and gather the results on the specified rank.

    Args:
    - fn (callable): A function that computes the value. This should not have any side effects.
    - rank (int, optional): The rank that gathers the values. Default is 0.
    - world_size (int, optional): Total number of processes in the current distributed setup.

    Returns:
    - A list of computed values from all ranks if on the gathering rank, otherwise None.
    """
    value_scalar = fn()
    if not is_distributed():
        return [value_scalar]
    value_tensor = torch.tensor(
        value_scalar, device=f"{get_device_type()}:{get_current_device()}"
    ).float()

    if not is_main_process():
        dist.gather(value_tensor, dst=0)
    else:
        gathered_tensors = [torch.zeros_like(value_tensor) for _ in range(world_size)]
        dist.gather(value_tensor, gather_list=gathered_tensors, dst=0)

        # Convert tensors back to their original type (int or float)
        gathered_values = []
        for tensor in gathered_tensors:
            if tensor == tensor.int():
                gathered_values.append(int(tensor.item()))
            else:
                gathered_values.append(float(tensor.item()))
        return gathered_values
    return None


def broadcast_dict(vals: dict):
    if not is_distributed():
        return vals

    cur_device = get_device_type()
    if is_main_process():
        data_byte = pickle.dumps(vals)
        data_tensor = torch.ByteTensor(list(data_byte)).to(cur_device)
        data_size = torch.IntTensor([len(data_byte)]).to(cur_device)
    else:
        data_tensor = torch.empty([1024], dtype=torch.uint8, device=cur_device)
        data_size = torch.IntTensor([0]).to(cur_device)

    dist.broadcast(data_size, 0)
    if not is_main_process():
        # resize
        data_tensor = data_tensor.new_empty([data_size.item()])

    dist.broadcast(data_tensor, 0)

    if not is_main_process():
        data_list = data_tensor.cpu().tolist()
        data_byte = bytes(data_list[: data_size.item()])
        vals = pickle.loads(data_byte)  # nosec

    return vals


def compute_and_broadcast(fn):
    """
    Compute a value using the function 'fn' only on the specified rank (default is 0).
    The value is then broadcasted to all other ranks.

    Args:
    - fn (callable): A function that computes the value. This should not have any side effects.
    - rank (int, optional): The rank that computes the value. Default is 0.

    Returns:
    - The computed value (int or float).
    """
    cur_device = f"{get_device_type()}:{get_current_device()}"
    if is_main_process():
        value_scalar = fn()
        value_tensor = torch.tensor(
            value_scalar, device=cur_device, dtype=torch.float32
        )
    else:
        value_tensor = torch.tensor(
            0.0, device=cur_device, dtype=torch.float32
        )  # Placeholder tensor

    # Broadcast the tensor to all processes.
    barrier()
    dist.broadcast(value_tensor, src=0)

    # Convert the tensor back to its original type (int or float)
    if value_tensor == value_tensor.int():
        return int(value_tensor.item())
    return float(value_tensor.item())


def gather_from_all_ranks(fn, world_size=1):
    """
    Run a callable 'fn' on all ranks and gather the results on the specified rank.

    Args:
    - fn (callable): A function that computes the value. This should not have any side effects.
    - rank (int, optional): The rank that gathers the values. Default is 0.
    - world_size (int, optional): Total number of processes in the current distributed setup.

    Returns:
    - A list of computed values from all ranks if on the gathering rank, otherwise None.
    """
    value_scalar = fn()
    value_tensor = torch.tensor(
        value_scalar, device=f"{get_device_type()}:{get_current_device()}"
    ).float()

    # Placeholder tensor for gathering results
    if is_main_process():
        gathered_tensors = [torch.zeros_like(value_tensor) for _ in range(world_size)]
    else:
        gathered_tensors = None

    dist.gather(value_tensor, gather_list=gathered_tensors, dst=0)

    if is_main_process():
        # Convert tensors back to their original type (int or float)
        gathered_values = []
        for tensor in gathered_tensors:
            if tensor == tensor.int():
                gathered_values.append(int(tensor.item()))
            else:
                gathered_values.append(float(tensor.item()))
        return gathered_values
    return None


def reduce_and_broadcast(fn1, fn2):
    """
    Run a callable 'fn1' on all ranks, gather the results, reduce them using 'fn2',
    and then broadcast the reduced result to all ranks.

    Args:
    - fn1 (callable): A function that computes the value on each rank.
    - fn2 (callable): A reduction function that takes a list of values and returns a single value.
    - world_size (int, optional): Total number of processes in the current distributed setup.

    Returns:
    - The reduced and broadcasted value.
    """

    # Gather values from all ranks using fn1
    if not is_distributed():
        return fn2([fn1()])

    gathered_values = gather_from_all_ranks(fn1, world_size=dist.get_world_size())

    # Use compute_and_broadcast to compute the reduced value on the main process
    # and then broadcast it to all ranks
    return compute_and_broadcast(lambda: fn2(gathered_values))


def build_parallelism_config(cfg):
    pc_kwargs = _get_parallel_config_kwargs(
        get_world_size(),
        cfg.tensor_parallel_size,
        cfg.context_parallel_size,
        cfg.dp_shard_size,
        cfg.dp_replicate_size,
        bool(cfg.fsdp or cfg.fsdp_config),
    )

    if pc_kwargs:
        parallelism_config = ParallelismConfig(
            **pc_kwargs,
        )
        device_mesh = parallelism_config.build_device_mesh("cuda")

        return parallelism_config, device_mesh
    return None, None


def _get_parallel_config_kwargs(
    world_size: int,
    tensor_parallel_size: int = 1,
    context_parallel_size: int = 1,
    dp_shard_size: int | None = None,
    dp_replicate_size: int | None = None,
    is_fsdp: bool = False,
):
    pc_kwargs = {}
    remaining_world_size = world_size

    if tensor_parallel_size and tensor_parallel_size > 1:
        pc_kwargs["tp_size"] = tensor_parallel_size
        remaining_world_size = remaining_world_size // tensor_parallel_size

    if context_parallel_size and context_parallel_size > 1:
        pc_kwargs["cp_size"] = context_parallel_size
        remaining_world_size = remaining_world_size // context_parallel_size

    if dp_shard_size is None and dp_replicate_size in (None, 1):
        if remaining_world_size > 1:
            pc_kwargs["dp_shard_size"] = remaining_world_size
            remaining_world_size = 1

    if dp_replicate_size and dp_replicate_size > 1:
        pc_kwargs["dp_replicate_size"] = dp_replicate_size
        remaining_world_size = remaining_world_size // dp_replicate_size

    if remaining_world_size > 1 and dp_shard_size and dp_shard_size > 1:
        if not is_fsdp:
            raise ValueError(
                "dp_shard_size was configured without a corresponding fsdp_config! "
                "Please ensure you have configured FSDP using fsdp_config."
            )
        pc_kwargs["dp_shard_size"] = dp_shard_size
        remaining_world_size = remaining_world_size // dp_shard_size
        if remaining_world_size > 1 and "dp_replicate_size" not in pc_kwargs:
            pc_kwargs["dp_replicate_size"] = remaining_world_size
            remaining_world_size = 1

    if remaining_world_size > 1:
        if "dp_shard_size" not in pc_kwargs and is_fsdp:
            pc_kwargs["dp_shard_size"] = remaining_world_size
            remaining_world_size = 1

    if remaining_world_size > 1:
        raise ValueError(
            f"The configured parallelisms are incompatible with the current world size ({get_world_size()})!\n"
            f"{pc_kwargs}"
        )

    return pc_kwargs


================================================
FILE: src/axolotl/utils/environment.py
================================================
"""
utils to get GPU info for the current environment
"""

import os
from importlib.metadata import version

import torch
from accelerate.utils.environment import (
    check_cuda_p2p_ib_support as accelerate_check_cuda_p2p_ib_support,
)
from packaging.version import Version, parse

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def check_cuda_p2p_ib_support():
    if not accelerate_check_cuda_p2p_ib_support():
        return False
    if not check_cuda_p2p_support():
        return False
    return True


def check_cuda_p2p_support() -> bool:
    try:
        world_size = int(os.environ.get("WORLD_SIZE", "1"))
        local_rank = int(os.environ.get("LOCAL_RANK", "0"))
    except ValueError:
        return True

    if world_size > 1:
        node_world_size = int(os.environ.get("NODE_WORLD_SIZE", "8"))
        local_other_rank = (local_rank // node_world_size) * node_world_size
        local_other_rank += 1 if (local_rank % node_world_size) == 0 else 0
        try:
            can_p2p = torch.cuda.can_device_access_peer(local_rank, local_other_rank)
        except AssertionError as exc:
            # some sort of logic error in indexing processes, assume p2p is fine for now
            LOG.warning(exc)
            return True
        return can_p2p

    return True


def get_package_version(package: str) -> Version:
    version_str = version(package)
    return parse(version_str)


def is_package_version_ge(package: str, version_: str) -> bool:
    package_version = get_package_version(package)
    return package_version >= parse(version_)


================================================
FILE: src/axolotl/utils/freeze.py
================================================
"""
module to freeze/unfreeze parameters by name
"""

import re
from typing import Callable, List, Tuple, Union

from axolotl.utils.distributed import is_main_process
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def freeze_layers_except(model, regex_patterns):
    """
    Freezes all layers of the given model except for the layers that match given regex patterns.
    Periods in the patterns are treated as literal periods, not as wildcard characters.

    Parameters:
    - model (nn.Module): The PyTorch model to be modified.
    - regex_patterns (list of str): List of regex patterns to match layer names to keep unfrozen.
      Note that you cannot use a dot as a wildcard character in the patterns since it is reserved for separating layer names.
      Also, to match the entire layer name, the pattern should start with "^" and end with "$", otherwise it will match any part of the layer name.
      The range pattern part is optional and it is not compiled as a regex pattern which means you must put "$" before the range pattern if you want to match the entire layer name.
      E.g., ["^model.embed_tokens.weight$[:32000]", "layers.2[0-9]+.block_sparse_moe.gate.[a-z]+$"]

    Returns:
    None; the model is modified in place.
    """
    if isinstance(regex_patterns, str):
        regex_patterns = [regex_patterns]

    patterns = [LayerNamePattern(pattern) for pattern in regex_patterns]

    # Unfreeze layers that match the regex patterns
    for name, param in model.named_parameters():
        param.requires_grad = False
        unfrozen_ranges = []
        for pattern in patterns:
            if not pattern.match(name):
                continue

            param.requires_grad = True

            if pattern.range is not None:
                unfrozen_ranges.append(pattern.range)

        merged_unfrozen_ranges = _merge_ranges(unfrozen_ranges, len(param))

        if param.requires_grad and is_main_process():
            unfrozen_ranges = (
                f" with ranges {merged_unfrozen_ranges}"
                if merged_unfrozen_ranges
                else ""
            )
            LOG.debug(f"Unfrozen {name}{unfrozen_ranges}")

        if not merged_unfrozen_ranges:
            continue

        # The range list we need is actually the inverted of the merged ranges
        ranges_to_freeze = _invert_ranges(merged_unfrozen_ranges, len(param))

        param.register_hook(_create_freeze_parameters_hook(ranges_to_freeze))

    if is_main_process() and all(
        not param.requires_grad for param in model.parameters()
    ):
        LOG.warning("All parameters are frozen. Model will not be trained.")


def _invert_ranges(
    given_ranges: List[Tuple[int, int]], layer_size: int
) -> List[Tuple[int, int]]:
    """
    Inverts a list of ranges to obtain the ranges not covered by the given ranges.

    Parameters:
    - given_ranges (List[Tuple[int, int]]): List of ranges to invert. Each range is represented as a tuple of start (inclusive) and end (exclusive) indices.
    - layer_size (int): The length of the layer. E.g., len(model.layer.weight)
    Returns:
    - List[Tuple[int, int]]: List of inverted ranges, where each range is represented as a tuple of start (inclusive) and end (exclusive) indices.
    """
    if not given_ranges:
        return [(0, layer_size)]

    inverted_ranges = []
    current_start = 0

    for start, end in sorted(given_ranges):
        if start > current_start:
            inverted_ranges.append((current_start, start))
        current_start = max(current_start, end)

    # Handle the case where the last given range does not reach the end of the total_size
    if current_start < layer_size:
        inverted_ranges.append((current_start, layer_size))

    return inverted_ranges


def _merge_ranges(
    given_ranges: List[Tuple[int, Union[int, None]]], layer_size: int
) -> List[Tuple[int, int]]:
    """
    Merges overlapping ranges and sorts the given ranges.

    This function takes a list of ranges and merges any overlapping ranges. The ranges are represented
    as tuples, where the first element is the start index (inclusive) and the second element is the end
    index (exclusive). The end index can be None, indicating that the range extends to the end of the
    sequence.

    Parameters:
    - given_ranges (List[Tuple[int, int | None]]): List of ranges to merge.
    - layer_size (int): The length of the layer. E.g., len(model.layer.weight)

    Returns:
    - List[Tuple[int, int]]: List of merged ranges, as start (inclusive) and end (exclusive) indices.
    """
    # End of each range can be determined now since we have the total size
    processed_ranges = [
        (start, end if end is not None else layer_size) for start, end in given_ranges
    ]
    for start, end in processed_ranges:
        if start < 0 or end > layer_size > 0 or start >= end:
            raise ValueError(f"invalid unfreeze range: start={start}, end={end}")

    # No need to merge if there's only one or no ranges
    if len(processed_ranges) <= 1:
        return processed_ranges

    sorted_ranges = sorted(processed_ranges)

    merged_ranges = [sorted_ranges[0]]
    for start, end in sorted_ranges[1:]:
        prev_start, prev_end = merged_ranges[-1]
        if start <= prev_end:
            merged_ranges[-1] = (prev_start, max(prev_end, end))
        else:
            merged_ranges.append((start, end))

    return merged_ranges


def _create_freeze_parameters_hook(ranges_to_freeze: List[Tuple[int, int]]) -> Callable:
    """
    Create a hook to freeze parameters in specified ranges by setting their gradients to zero.

    This function takes a list of tuples representing the ranges of indices to freeze. Each tuple should contain
    two integers representing the start and end indices of the range.

    Parameters:
    - ranges_to_freeze (List[Tuple[int, int]]): Ranges of indices to freeze.

    Returns:
    - Callable: A hook function to be used with `register_hook` on parameters.

    Example usage:
    ```
    ranges_to_freeze = [(0, 10), (20, 30)]
    hook = _create_freeze_parameters_hook(ranges_to_freeze)
    model.register_hook(hook)
    ```
    """

    def freeze_parameters_hook(gradients):
        for start, end in ranges_to_freeze:
            gradients[start:end].zero_()

    return freeze_parameters_hook


class LayerNamePattern:
    """
    Represents a regex pattern for layer names, potentially including a parameter index range.
    """

    def __init__(self, pattern: str):
        """
        Initializes a new instance of the LayerNamePattern class.

        Parameters:
        - pattern (str): The regex pattern for layer names, potentially including a parameter index range.
        """
        self.raw_pattern = pattern
        name_pattern, self.range = self._parse_pattern(pattern)
        self.name_regex = re.compile(re.sub(r"\.(?!\+)", "\\.", name_pattern))

    def match(self, name: str) -> bool:
        """
        Checks if the given layer name matches the regex pattern.

        Parameters:
        - name (str): The layer name to check.

        Returns:
        - bool: True if the layer name matches the pattern, False otherwise.
        """
        return self.name_regex.match(name) is not None

    def _parse_pattern(
        self, pattern: str
    ) -> Tuple[str, Union[Tuple[int, Union[int, None]], None]]:
        """
        Extracts the range pattern from the given pattern.

        Parameters:
        - pattern (str): The pattern to extract the range from.

        Returns:
        - Tuple[str, Tuple[int, int | None] | None]: A tuple containing the regex pattern to match the layer name without the range pattern and the range of layer indices to match, if specified.
        """
        match = re.match(r"^(.+)\[([0-9]*)(?::([0-9]*))?\]$", pattern)
        if not match:
            return pattern, None

        base_pattern, start_part, end_part = match.groups()

        if end_part is None and start_part.isdecimal():
            index = int(start_part)
            return base_pattern, (index, index + 1)

        # [:end] or [start:] or [start:end]
        start = int(start_part) if start_part else 0
        end = int(end_part) if end_part else None

        if end is not None and start >= end:
            raise ValueError(
                f"Invalid range in layer name pattern: {pattern}."
                "End of range must be greater than start."
            )
        return base_pattern, (start, end)


================================================
FILE: src/axolotl/utils/generation/__init__.py
================================================
"""Generation utilities for monitoring during training."""

from .sft import format_generation_for_logging, generate_samples

__all__ = ["generate_samples", "format_generation_for_logging"]


================================================
FILE: src/axolotl/utils/generation/sft.py
================================================
"""Sample generation utilities for SFT/Pretrain training."""

from typing import Any, List, Optional

import torch
from accelerate.utils import extract_model_from_parallel
from colorama import Fore, Style

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def generate_samples(
    model: torch.nn.Module,
    tokenizer: Any,
    dataloader: Any,
    num_generation_samples: int = 3,
    max_new_tokens: int = 50,
    temperature: float = 0.7,
    top_p: Optional[float] = None,
    top_k: Optional[int] = None,
    do_sample: bool = True,
    prompt_ratio: float = 0.5,
) -> List[dict]:
    """
    Generate samples from the model during training for monitoring.

    Args:
        model: The model to generate from
        tokenizer: The tokenizer to use for encoding/decoding
        dataloader: Dataloader to sample prompts from
        num_generation_samples: Number of samples to generate
        max_new_tokens: Maximum new tokens to generate
        temperature: Sampling temperature (0.0 = greedy)
        top_p: Nucleus sampling parameter
        top_k: Top-k sampling parameter
        do_sample: Whether to use sampling vs greedy decoding
        prompt_ratio: Ratio of sequence to use as prompt (0.0-1.0)

    Returns:
        List of dicts with 'prompt', 'generated', and 'full_text' keys
    """
    unwrapped_model = extract_model_from_parallel(model)

    training = unwrapped_model.training
    unwrapped_model.eval()

    device = next(unwrapped_model.parameters()).device

    generations = []

    try:
        with torch.no_grad():
            samples_collected = 0

            for batch in dataloader:
                if samples_collected >= num_generation_samples:
                    break

                input_ids = batch["input_ids"].to(device)
                attention_mask = batch.get("attention_mask")
                if attention_mask is not None:
                    attention_mask = attention_mask.to(device)
                batch_size = input_ids.shape[0]

                indices = torch.randperm(batch_size)[
                    : num_generation_samples - samples_collected
                ]

                for idx in indices:
                    if samples_collected >= num_generation_samples:
                        break

                    sequence = input_ids[idx]

                    if attention_mask is not None:
                        seq_len = attention_mask[idx].sum().item()
                    else:
                        seq_len = sequence.shape[0]

                    if seq_len < 5:
                        continue

                    prompt_len = max(1, int(seq_len * prompt_ratio))
                    prompt_ids = sequence[:prompt_len].unsqueeze(0)

                    try:
                        generation_config = {
                            "max_new_tokens": max_new_tokens,
                            "do_sample": do_sample,
                            "pad_token_id": tokenizer.pad_token_id
                            if tokenizer.pad_token_id is not None
                            else tokenizer.eos_token_id,
                        }

                        if do_sample:
                            generation_config["temperature"] = temperature
                            if top_p is not None:
                                generation_config["top_p"] = top_p
                            if top_k is not None:
                                generation_config["top_k"] = top_k

                        generated_ids = unwrapped_model.generate(
                            prompt_ids, **generation_config
                        )

                        prompt_text = tokenizer.decode(
                            prompt_ids[0], skip_special_tokens=True
                        )
                        generated_text = tokenizer.decode(
                            generated_ids[0][prompt_len:], skip_special_tokens=True
                        )
                        full_text = tokenizer.decode(
                            generated_ids[0], skip_special_tokens=True
                        )

                        generations.append(
                            {
                                "prompt": prompt_text,
                                "generated": generated_text,
                                "full_text": full_text,
                            }
                        )

                        samples_collected += 1

                    except Exception as e:
                        LOG.warning(f"Failed to generate sample: {e}", exc_info=True)
                        continue

    except Exception as e:
        LOG.warning(f"Error during sample generation: {e}", exc_info=True)

    if training:
        unwrapped_model.train()
    else:
        unwrapped_model.eval()

    return generations


def format_generation_for_logging(
    sample: dict, sample_idx: int, step: int
) -> tuple[str, str]:
    """
    Format a generation sample for pretty logging.

    Args:
        sample: Dict with 'prompt', 'generated', and 'full_text' keys
        sample_idx: Index of the sample
        step: Current training step

    Returns:
        Tuple of (console_text, wandb_text)
    """
    console_text = (
        f"\n{Style.BRIGHT}{Fore.CYAN}{'=' * 80}{Style.RESET_ALL}\n"
        f"{Style.BRIGHT}{Fore.GREEN}Sample {sample_idx + 1} (Step {step}){Style.RESET_ALL}\n"
        f"{Style.BRIGHT}{Fore.CYAN}{'=' * 80}{Style.RESET_ALL}\n"
        f"{Style.BRIGHT}{Fore.YELLOW}[PROMPT]{Style.RESET_ALL}\n{sample['prompt']}\n\n"
        f"{Style.BRIGHT}{Fore.MAGENTA}[GENERATED]{Style.RESET_ALL}\n{sample['generated']}\n"
        f"{Style.BRIGHT}{Fore.CYAN}{'=' * 80}{Style.RESET_ALL}\n"
    )
    wandb_text = (
        f"\n{'=' * 80}\n"
        f"Sample {sample_idx + 1} (Step {step})\n"
        f"{'=' * 80}\n"
        f"[PROMPT]\n{sample['prompt']}\n\n"
        f"[GENERATED]\n{sample['generated']}\n"
        f"{'=' * 80}\n"
    )

    return console_text, wandb_text


================================================
FILE: src/axolotl/utils/import_helper.py
================================================
"""
Helper for importing modules from strings
"""

import importlib


def get_cls_from_module_str(module_str: str):
    # use importlib to dynamically load the reward function from the module
    if not isinstance(module_str, str) or not module_str.strip():
        raise ValueError("module_str must be a non-empty string")

    parts = module_str.split(".")
    if len(parts) < 2:
        raise ValueError(f"Invalid module string format: {module_str}")

    try:
        cls_name = parts[-1]
        module_path = ".".join(parts[:-1])
        mod = importlib.import_module(module_path)
        mod_cls = getattr(mod, cls_name)
        return mod_cls
    except ImportError as e:
        raise ImportError(f"Failed to import module '{module_path}': {e}") from e
    except AttributeError as e:
        raise AttributeError(
            f"Class '{cls_name}' not found in module '{module_path}': {e}"
        ) from e


================================================
FILE: src/axolotl/utils/logging.py
================================================
"""Logging helpers to only log on main process."""

import functools
import logging
import warnings

from axolotl.utils.distributed import is_main_process

# Suppress noisy bitsandbytes warnings about dtype casting during quantization
warnings.filterwarnings(
    "ignore",
    message=".*MatMul8bitLt: inputs will be cast from.*",
    category=UserWarning,
)

# Adapted from Accelerate
# https://github.com/huggingface/accelerate/blob/main/src/accelerate/logging.py


class MultiProcessAdapter(logging.LoggerAdapter):
    """
    Logger adapter for distributed logging, specifically to only log on main process.
    """

    @staticmethod
    def _should_log(main_process_only: bool):
        return not main_process_only or is_main_process()

    def log(self, level, msg, *args, **kwargs):
        main_process_only = kwargs.pop("main_process_only", True)
        kwargs.setdefault("stacklevel", 2)

        if self.isEnabledFor(level) and self._should_log(main_process_only):
            msg, kwargs = self.process(msg, kwargs)
            self.logger.log(level, msg, *args, **kwargs)

    @functools.lru_cache(maxsize=10)
    def warning_once(self, *args, **kwargs):
        """
        This method is identical to `logger.warning()`, but will emit the warning with the same message only once

        Note: The cache is for the function arguments, so 2 different callers using the same arguments will hit the
        cache. The assumption here is that all warning messages are unique across the code. If they aren't then need to
        switch to another type of cache that includes the caller frame information in the hashing function.
        """
        self.warning(*args, **kwargs)


def get_logger(name: str, log_level: str | None = None) -> MultiProcessAdapter:
    logger = logging.getLogger(name)
    logger.setLevel(logging.DEBUG)
    return MultiProcessAdapter(logger, extra={})


================================================
FILE: src/axolotl/utils/lora.py
================================================
# Copyright 2025 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
module to get the state dict of a merged lora model
"""

import torch
from peft.tuners.tuners_utils import onload_layer
from peft.utils import ModulesToSaveWrapper, _get_submodules


def get_lora_merged_state_dict(
    model: torch.nn.Module,
) -> dict:
    r"""
    Create and return a state_dict that has the LoRA deltas
    merged into the base model’s weights, without modifying `model` in place.

    Arguments:
        model (torch.nn.Module): A model that has LoRA/PEFT adapters attached.

    Returns:
        dict: A state_dict of the merged parameters.
    """

    base_model_prefix = "base_model.model."
    state_dict = {}
    key_list = [key for key, _ in model.named_modules() if model.prefix not in key]
    for key in key_list:
        try:
            _, target, _ = _get_submodules(model, key)
        except AttributeError:
            continue
        with onload_layer(target):
            weight_key = key.replace(base_model_prefix, "") + ".weight"
            bias_key = key.replace(base_model_prefix, "") + ".bias"
            if hasattr(target, "base_layer"):
                target.merge(safe_merge=True, adapter_names=None)
                # get the state_dict of target.base_layer
                layer_state_dict = target.base_layer.state_dict()
                state_dict[weight_key] = layer_state_dict["weight"]
            elif isinstance(target, ModulesToSaveWrapper):
                # save any additional trainable modules part of `modules_to_save`
                new_module = target.modules_to_save[target.active_adapter]
                if hasattr(new_module, "base_layer"):
                    # check if the module is itself a tuner layer
                    new_module.merge(safe_merge=True, adapter_names=None)
                layer_state_dict = new_module.state_dict()
                state_dict[weight_key] = layer_state_dict["weight"]
            elif hasattr(target, "weight"):
                if any(
                    skip in key
                    for skip in [
                        ".original_module",
                        ".modules_to_save",
                        ".base_layer",
                    ]
                ):
                    continue
                layer_state_dict = target.state_dict()
                state_dict[weight_key] = layer_state_dict["weight"]
                if hasattr(target, "bias") and "bias" in layer_state_dict.keys():
                    state_dict[bias_key] = layer_state_dict["bias"]
    return state_dict


================================================
FILE: src/axolotl/utils/mistral/__init__.py
================================================
"""Init for `axolotl.utils.mistral` module."""

from axolotl.utils.mistral.mistral3_processor import Mistral3Processor
from axolotl.utils.mistral.mistral_tokenizer import HFMistralTokenizer

__all__ = ["HFMistralTokenizer", "Mistral3Processor"]


================================================
FILE: src/axolotl/utils/mistral/mistral3_processor.py
================================================
"""Processor for Mistral3 multimodal models with image support"""

from typing import Any, Dict, Optional, Union

import torch
from transformers import ProcessorMixin
from transformers.feature_extraction_utils import BatchFeature
from transformers.processing_utils import ProcessingKwargs
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput

from axolotl.utils.mistral.mistral_tokenizer import HFMistralTokenizer


class Mistral3ProcessorKwargs(ProcessingKwargs):
    _defaults: Dict[str, Dict[str, Any]] = {
        "text_kwargs": {
            "padding": True,
        },
        "common_kwargs": {
            "return_tensors": "pt",
            "return_dict": True,
            "tokenize": True,
        },
    }


class Mistral3Processor(ProcessorMixin):
    """
    Processor for Mistral3 multimodal models that handles text and images.
    Wraps HFMistralTokenizer and adds image processing capabilities.
    """

    def __init__(self, tokenizer: HFMistralTokenizer):
        super().__init__(tokenizer)

    @property
    def audio_tokenizer(self) -> None:
        """Audio tokenizer is not supported. Dummy method to satisfy HuggingFace API."""
        return None

    def _merge_kwargs(
        self, processor_kwargs_class: Any, **kwargs: Any
    ) -> Dict[str, Dict[str, Any]]:
        """Merge kwargs with defaults similar to ProcessorMixin"""
        defaults = processor_kwargs_class._defaults
        output_kwargs: Dict[str, Dict[str, Any]] = {}

        for kwarg_type, default_values in defaults.items():
            output_kwargs[kwarg_type] = {**default_values}

        # Update with provided kwargs
        for key, value in kwargs.items():
            # Try to match key to appropriate kwarg type
            if key in ["padding", "truncation", "max_length"]:
                output_kwargs.setdefault("text_kwargs", {}).update({key: value})
            elif key in ["return_tensors", "return_dict", "tokenize"]:
                output_kwargs.setdefault("common_kwargs", {}).update({key: value})
            else:
                # Add to text_kwargs by default
                output_kwargs.setdefault("text_kwargs", {}).update({key: value})

        return output_kwargs

    def apply_chat_template(
        self,
        conversation: Union[list[dict[str, str]], list[list[dict[str, str]]]],
        **kwargs: Any,
    ) -> Union[BatchFeature, str, list[str]]:
        """
        Apply chat template with image support for Mistral3.

        Similar to VoxtralProcessor, this method extracts images from the conversation,
        calls the tokenizer's apply_chat_template, then adds pixel_values and image_sizes
        to the result.
        """
        output_kwargs = self._merge_kwargs(Mistral3ProcessorKwargs, **kwargs)
        text_kwargs = output_kwargs["text_kwargs"]
        common_kwargs = output_kwargs["common_kwargs"]

        return_tensors = common_kwargs.pop("return_tensors", "pt")
        if return_tensors != "pt":
            raise ValueError(
                f"{self.__class__.__name__} only supports `return_tensors='pt'`."
            )

        return_dict = common_kwargs.pop("return_dict", False)
        tokenize = common_kwargs.pop("tokenize", False)

        # Determine if batched
        if isinstance(conversation, (list, tuple)) and (
            isinstance(conversation[0], (list, tuple))
            or hasattr(conversation[0], "content")
        ):
            is_batched = True
            conversations = conversation
        else:
            is_batched = False
            conversations = [conversation]  # type: ignore

        # Call tokenizer's apply_chat_template
        tokenizer_kwargs = {**text_kwargs, **common_kwargs}
        tokenizer_kwargs["return_tensors"] = return_tensors
        tokenizer_kwargs["tokenize"] = tokenize
        tokenizer_kwargs["return_dict"] = return_dict

        encoded_instruct_inputs = self.tokenizer.apply_chat_template(
            conversations,
            **tokenizer_kwargs,
        )

        if tokenize:
            if return_dict:
                # The tokenizer already handles pixel_values, we just need to add image_sizes
                if hasattr(encoded_instruct_inputs, "items"):
                    data: Dict[str, Any] = dict(encoded_instruct_inputs)  # type: ignore
                elif hasattr(encoded_instruct_inputs, "data"):
                    data = encoded_instruct_inputs.data  # type: ignore
                else:
                    raise ValueError("Unknown data type")

                if "pixel_values" in data:
                    pixel_values = data["pixel_values"]

                    # MistralTokenizer returns a Double, so we convert to fp32
                    data["pixel_values"] = pixel_values.to(dtype=torch.float32)

                    # Always batched: [B, C, H, W] -> image_sizes: [B, 2]
                    # Since tensor is homogeneous, all images have same H, W
                    batch_size = pixel_values.shape[0]
                    image_sizes = torch.tensor([pixel_values.shape[-2:]] * batch_size)
                    data["image_sizes"] = image_sizes

                return BatchFeature(data=data, tensor_type=return_tensors)

        if not is_batched:
            return encoded_instruct_inputs[0]

        return encoded_instruct_inputs

    def __call__(
        self,
        text: Optional[
            Union[
                TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]
            ]
        ],
        **kwargs: Any,
    ) -> BatchFeature:
        """
        Forward text processing to the tokenizer.
        This method does not support images - use apply_chat_template instead.
        """
        output_kwargs = self._merge_kwargs(Mistral3ProcessorKwargs, **kwargs)
        text_kwargs = output_kwargs["text_kwargs"]
        common_kwargs = output_kwargs["common_kwargs"]

        out = self.tokenizer(text, **text_kwargs)
        return BatchFeature(
            data=out, tensor_type=common_kwargs.pop("return_tensors", None)
        )


================================================
FILE: src/axolotl/utils/mistral/mistral_tokenizer.py
================================================
"""Wrapper for MistralTokenizer from mistral-common"""

import os
from typing import Optional

import numpy as np
from mistral_common.protocol.instruct.validator import ValidationMode
from mistral_common.tokens.tokenizers.utils import download_tokenizer_from_hf_hub
from torch import Tensor
from transformers.tokenization_mistral_common import MistralCommonBackend
from transformers.tokenization_utils_base import VERY_LARGE_INTEGER


class HFMistralTokenizer(MistralCommonBackend):
    """
    Wraps mistral_common.tokens.tokenizers.mistral.MistralTokenizer
    and exposes HuggingFace API for special tokens.
    """

    def __init__(self, name_or_path: str, **kwargs):
        """
        Args:
            name_or_path: The name or path to the tokenizer files or the repo id.
            **kwargs: Additional keyword arguments passed to the parent class.
        """
        kwargs.pop("mode", None)

        mode = ValidationMode.finetuning
        super().__init__(**kwargs, mode=mode)

        self._name_or_path = name_or_path

        # set mode as is not set upstream
        self._set_mode(mode)

    @property
    def name_or_path(self) -> str:
        return self._name_or_path

    @name_or_path.setter
    def name_or_path(self, name_or_path: str) -> None:
        self._name_or_path = name_or_path

    @property
    def chat_template(self) -> str | None:
        """Chat template is not supported. Dummy method to satisfy HuggingFace API."""
        return "[This is a dummy chat template]"

    @chat_template.setter
    def chat_template(self, chat_template: str | None) -> None:
        pass

    def _set_mode(self, mode: ValidationMode):
        """Set the mode of the MistralRequestValidator.

        Args:
            mode: The mode to set.

        Raises:
            RuntimeError: If the MistralRequestValidator does not have a _mode attribute.
        """
        # Check if MistralRequestValidator has a _mode attribute.
        # This is a private API and may change in the future.

        from mistral_common.protocol.instruct.validator import MistralRequestValidator

        if not (
            hasattr(self.tokenizer, "_chat_completion_request_validator")
            and isinstance(
                self.tokenizer._chat_completion_request_validator,
                MistralRequestValidator,
            )
            and hasattr(self.tokenizer._chat_completion_request_validator, "_mode")
        ):
            raise RuntimeError(
                f"Unable to switch mistral tokenizer to {mode.value} mode - "
                "private API `_chat_completion_request_validator._mode` missing."
            )

        self.tokenizer._chat_completion_request_validator._mode = mode

    def apply_chat_template(  # type: ignore
        self,
        conversation: list[dict] | list[list[dict]],
        chat_template: str | None = None,
        add_generation_prompt: bool = False,
        **kwargs,
    ) -> str | list[int]:
        """Patched fn to handle setting test mode, remove chat_template and add_generation_prompt kwarg"""

        # pop unnecessary kwarg for mistral
        kwargs.pop("real_last_index", None)
        kwargs.pop("add_special_tokens", None)

        try:
            if add_generation_prompt:
                self._set_mode(ValidationMode.test)

            out = super().apply_chat_template(conversation, **kwargs)

            return out  # type: ignore

        finally:
            if add_generation_prompt:
                self._set_mode(ValidationMode.finetuning)

    def decode(  # type: ignore
        self,
        token_ids: int | list[int] | np.ndarray | Tensor,
        **kwargs,
    ) -> str:
        """
        Decode token_ids into str.

        This overrides upstream.decode to convert int to list[int]
        """

        if isinstance(token_ids, int):
            token_ids = [token_ids]

        return super().decode(token_ids, **kwargs)

    @classmethod
    def from_pretrained(
        cls,
        pretrained_model_name_or_path: str | os.PathLike,
        *init_inputs,
        mode: ValidationMode = ValidationMode.test,
        cache_dir: Optional[str | os.PathLike] = None,
        force_download: bool = False,
        local_files_only: bool = False,
        token: Optional[str | bool] = None,
        revision: str = "main",
        model_max_length: int = VERY_LARGE_INTEGER,
        padding_side: str = "left",
        truncation_side: str = "right",
        model_input_names: Optional[list[str]] = None,
        clean_up_tokenization_spaces: bool = False,
        **kwargs,
    ):
        r"""
        Patched fn to pass `name_or_path` and remove extra kwargs.

        Instantiate a `MistralCommonBackend` from a predefined
        tokenizer.

        Args:
            pretrained_model_name_or_path (`str` or `os.PathLike`):
                Can be either:

                - A string, the *model id* of a predefined tokenizer hosted inside a model repo on huggingface.co.
                - A path to a *directory* containing the tokenizer config, for instance saved
                  using the [`MistralCommonBackend.tokenization_mistral_common.save_pretrained`] method, e.g.,
                  `./my_model_directory/`.
            mode (`ValidationMode`, *optional*, defaults to `ValidationMode.test`):
                Validation mode for the `MistralTokenizer` tokenizer.
            cache_dir (`str` or `os.PathLike`, *optional*):
                Path to a directory in which a downloaded predefined tokenizer vocabulary files should be cached if the
                standard cache should not be used.
            force_download (`bool`, *optional*, defaults to `False`):
                Whether or not to force the (re-)download the vocabulary files and override the cached versions if they
                exist.
            token (`str` or *bool*, *optional*):
                The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated
                when running `hf auth login` (stored in `~/.huggingface`).
            local_files_only (`bool`, *optional*, defaults to `False`):
                Whether or not to only rely on local files and not to attempt to download any files.
            revision (`str`, *optional*, defaults to `"main"`):
                The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a
                git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any
                identifier allowed by git.
            max_length (`int`, *optional*):
                Controls the maximum length to use by one of the truncation/padding parameters.

                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
                is required by one of the truncation/padding parameters. If the model has no specific maximum input
                length (like XLNet) truncation/padding to a maximum length will be deactivated.
            padding_side (`str`, *optional*, defaults to `"left"`):
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
            truncation_side (`str`, *optional*, defaults to `"right"`):
                The side on which the model should have truncation applied. Should be selected between ['right', 'left'].
            model_input_names (`List[string]`, *optional*):
                The list of inputs accepted by the forward pass of the model (like `"token_type_ids"` or
                `"attention_mask"`). Default value is picked from the class attribute of the same name.
            clean_up_tokenization_spaces (`bool`, *optional*, defaults to `False`):
                Whether or not the model should cleanup the spaces that were added when splitting the input text during the
                tokenization process.
            kwargs (additional keyword arguments, *optional*):
                Not supported by `MistralCommonBackend.from_pretrained`.
                Will raise an error if used.
        """
        if init_inputs:
            raise ValueError(
                "`init_inputs` are not supported by `MistralCommonBackend.from_pretrained`."
            )

        # Delete trust_remote_code as it does nothing
        kwargs.pop("trust_remote_code", None)

        # Delete tokenizer as it does nothing
        kwargs.pop("tokenizer", None)

        # Handle kwargs and AutoTokenizer case
        if kwargs and not kwargs.keys() == {"_from_auto"}:
            raise ValueError(
                f"Kwargs {list(kwargs.keys())} are not supported by `MistralCommonBackend.from_pretrained`."
            )

        if not os.path.isfile(pretrained_model_name_or_path):
            tokenizer_path = download_tokenizer_from_hf_hub(
                repo_id=str(pretrained_model_name_or_path),
                cache_dir=str(cache_dir),
                token=token,
                revision=revision,
                force_download=force_download,
                local_files_only=local_files_only,
            )
        else:
            tokenizer_path = str(pretrained_model_name_or_path)

        return cls(
            name_or_path=str(pretrained_model_name_or_path),
            tokenizer_path=tokenizer_path,
            mode=mode,
            model_max_length=model_max_length,
            padding_side=padding_side,
            truncation_side=truncation_side,
            model_input_names=model_input_names,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
        )

    def save_pretrained(self, *args, **kwargs) -> tuple[str, ...]:
        """
        Patches to remove save_jinja_files from being passed onwards.
        """
        kwargs.pop("save_jinja_files", None)
        return super().save_pretrained(*args, **kwargs)


================================================
FILE: src/axolotl/utils/mlflow_.py
================================================
"""Module for mlflow utilities"""

import os

from axolotl.utils.dict import DictDefault


def setup_mlflow_env_vars(cfg: DictDefault):
    for key in cfg.keys():
        if key.startswith("mlflow_") or key.startswith("hf_mlflow_"):
            value = cfg.get(key, "")

            if value and isinstance(value, str) and len(value) > 0:
                os.environ[key.upper()] = value

    # Enable mlflow if experiment name is present
    if cfg.mlflow_experiment_name and len(cfg.mlflow_experiment_name) > 0:
        cfg.use_mlflow = True

    # Enable logging hf artifacts in mlflow if value is truthy
    if cfg.hf_mlflow_log_artifacts is True:
        os.environ["HF_MLFLOW_LOG_ARTIFACTS"] = "true"


================================================
FILE: src/axolotl/utils/model_shard_quant.py
================================================
"""
module to handle loading model on cpu/meta device for FSDP
"""

import os
import time
from typing import List, Optional, Type, Union

import safetensors
import torch
from accelerate import init_empty_weights
from bitsandbytes.nn import Linear4bit, Params4bit
from fastcore.parallel import parallel
from torch import Tensor, nn
from tqdm import tqdm
from transformers import AutoModelForCausalLM
from transformers.quantizers import AutoHfQuantizer
from transformers.utils import SAFE_WEIGHTS_INDEX_NAME, SAFE_WEIGHTS_NAME, hub


def _replace_linear(
    model: nn.Module,
    linear_replacement: Type[nn.Module],
    quant_config: Union[dict, None] = None,
    skip_modules=None,
    **kwargs,
):
    """
    Replace linear modules with a new Linear module.
    Parameters:
        model (`torch.nn.Module`):
            Input model or `torch.nn.Module` as the function is run recursively.
        linear_replacement (`torch.nn.Module`):
            The linear module that replaces the old one. Only expects standard arguments.
            If other arguments need to be passed, use a lambda.
        skip_modules (`List[str]`, *optional*, defaults to `lm_head`):
            List of modules names not to convert. Defaults to `lm_head`.
    """
    if skip_modules is None:
        skip_modules = ["lm_head"]
    for name, module in model.named_children():
        if len(list(module.children())) > 0:
            _replace_linear(
                module, linear_replacement, quant_config, skip_modules, **kwargs
            )

        if isinstance(module, torch.nn.Linear) and name not in skip_modules:
            if issubclass(linear_replacement, Linear4bit):
                model._modules[name] = linear_replacement(
                    module.in_features,
                    module.out_features,
                    module.bias is not None,
                    **kwargs,
                )
            else:
                raise ValueError(
                    f"Unsupported linear replacement: {type(linear_replacement)}"
                )
    return model


def load_and_quantize(
    module: nn.Module,
    name: str,
    value: Tensor,
    device: torch.device = None,
    dtype: torch.dtype = None,
    skip_names: Optional[List[str]] = None,
    to_cpu: bool = False,
    to_meta: bool = False,
    verbose: bool = False,
    quant_method: str = "bnb",
):
    """
    Loads `value` tensor into submodule of `module`, optionally skipping `skip_names` and converting to `dtype`.

    Quantizes `Params4bit` on `device` then places on "cpu" if to_cpu=True or "meta" if to_meta=True.
    """

    if not skip_names:
        skip_names = []

    def place_on_device(value):
        if to_meta:
            device = "meta"
        elif to_cpu:
            device = "cpu"
        return value.to(device=device, dtype=dtype)

    if any(skip_name in name for skip_name in skip_names):
        if verbose:
            print(f"Skipping {name} because it is in skip_names")
        return

    module_key, _, value_key = name.rpartition(".")
    try:
        submodule = module.get_submodule(module_key)
    except AttributeError as exc:
        print(f"Module {module_key} not found:\n{exc}")
        return

    try:
        if quant_method == "bnb":
            param = submodule.get_parameter(value_key)
            if isinstance(param, Params4bit):
                # With `sync_module_states=True`, a meta device Params4bit needs to be the same
                # shape as the quantized Params4bit with an initialized quant_state. However,
                # FSDP only syncs parameters and buffers, so the quant_state isn't copied. This
                # workaround quantizes Params4bit to initialize quant_state on all ranks, then
                # replaces Params4bit's data with a meta tensor to free memory on non-rank 0.
                value = type(param)(
                    value.to(device=device, dtype=dtype).data, **param.__dict__
                ).cuda(device)
                if to_meta:
                    value = type(param)(value.data.to("meta"), **value.__dict__)
                elif to_cpu:
                    value = type(param)(value.data.to("cpu"), **value.__dict__)
            else:
                value = type(param)(place_on_device(value).data)

    except AttributeError:
        # it's a buffer
        value = place_on_device(value)

    setattr(submodule, value_key, value)


def n_loading_workers(quant_method: str, param_count: float):
    devprops = torch.cuda.get_device_properties(torch.cuda.current_device())
    left = int(os.cpu_count() / torch.cuda.device_count())
    model_params_b = 70
    right = int(
        (4 if quant_method == "hqq" else 8)
        * (devprops.total_memory / 1e9 / 40)
        * (model_params_b / (param_count / 1e9))
    )
    return min(left, right)


def load_sharded_model(
    model_name,
    model_config,
    cfg,
    torch_dtype=torch.bfloat16,
    low_memory=True,
):
    if (low_memory and cfg.local_rank == 0) or not low_memory:
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            use_cache=False,
            dtype=torch.float32,
            _attn_implementation=model_config._attn_implementation,
            trust_remote_code=cfg.trust_remote_code,
        )
        dtype = torch_dtype if not cfg.float32 else None
        model.to(dtype=dtype, device="cpu" if low_memory else cfg.local_rank)
    else:
        with init_empty_weights():
            model = AutoModelForCausalLM.from_config(
                model_config,
                dtype=torch_dtype,
                trust_remote_code=cfg.trust_remote_code,
            )
    return model


def load_sharded_model_quant(
    model_name,
    model_config,
    cfg,
    compute_dtype=torch.bfloat16,
    quant_storage=torch.float32,
    low_memory=True,
    verbose=False,
    loading_workers=2,
    quantization_config=None,
):
    with init_empty_weights():
        model = AutoModelForCausalLM.from_config(
            model_config,
            trust_remote_code=cfg.trust_remote_code,
        )
        if hasattr(model, "transformer"):
            model.transformer = _replace_linear(
                model.transformer,
                Linear4bit,
                compute_dtype=compute_dtype,
                quant_type="nf4",
                quant_storage=quant_storage,
                compress_statistics=True,  # bnb_4bit_use_double_quant
                skip_modules=[
                    "lm_head",
                    "embed_out",
                ],
            )
        else:
            # this is the more common case with HF transformers
            # TODO can we detect the model arch and dynamically set skip_modules
            model.model = _replace_linear(
                model.model,
                Linear4bit,
                compute_dtype=compute_dtype,
                quant_type="nf4",
                quant_storage=quant_storage,
                compress_statistics=True,  # bnb_4bit_use_double_quant
                skip_modules=[
                    "lm_head",
                    "embed_out",
                ],
            )
    model.is_loaded_in_4bit = True

    # Grab the safetensors files that hold the weights
    try:
        idx = hub.cached_file(model_name, SAFE_WEIGHTS_INDEX_NAME)
        files, _ = hub.get_checkpoint_shard_files(model_name, idx)
    except OSError:
        try:
            # This means the model doesn't have a model.safetensors.index.json because it is not sharded
            files = []
            files.append(hub.cached_file(model_name, SAFE_WEIGHTS_NAME))
        except OSError as exc:
            # This means the model probably doesn't have a safetensors file
            raise exc

    # Load in the weights, using our custom load_and_quantize method which quantizes Params4bit on the fly
    # and then places each layer on CPU or meta if using low_memory to minimize GPU memory usage
    def load_and_quantize_parallel(name_param, model, **kwargs):
        name, param = name_param
        load_and_quantize(model, name, param, **kwargs)

    quant_method = "bnb"
    param_count = sum((p.numel() for n, p in model.named_parameters()))

    n_workers = (
        n_loading_workers(quant_method, param_count)
        if loading_workers == -1
        else loading_workers
    )
    if cfg.local_rank == 0 and verbose:
        print(f"Using n_workers: {n_workers} for loading")

    start = time.time()
    for filename in tqdm(
        files,
        desc="Loading & Quantizing Model Shards",
        disable=cfg.local_rank != 0,
        position=0,
    ):
        weights = safetensors.torch.load_file(filename)
        parallel(
            load_and_quantize_parallel,
            iter(weights.items()),
            n_workers=n_workers,
            threadpool=True,
            model=model,
            dtype=quant_storage,
            device=cfg.local_rank,
            skip_names=[],
            to_cpu=(low_memory and cfg.local_rank == 0),
            to_meta=(low_memory and cfg.local_rank != 0),
            verbose=verbose,
            quant_method=quant_method,
        )

    # these attributes are needed to inform transformers/peft of the quantization
    model.is_quantized = True
    model.quantization_method = "bitsandbytes"
    model.hf_quantizer = AutoHfQuantizer.from_config(quantization_config)

    if cfg.local_rank == 0 and verbose:
        print(f"Loaded model weights in {time.time() - start:.3f} seconds")
    # cleanup any extra memory usage from parallel loading
    torch.cuda.empty_cache()

    return model


================================================
FILE: src/axolotl/utils/optimizers/__init__.py
================================================


================================================
FILE: src/axolotl/utils/optimizers/adopt.py
================================================
"""
Copied from https://github.com/iShohei220/adopt

ADOPT: Modified Adam Can Converge with Any β2 with the Optimal Rate (2024)
Taniguchi, Shohei and Harada, Keno and Minegishi, Gouki and Oshima, Yuta and Jeong, Seong Cheol and Nagahara, Go and Iiyama, Tomoshi and Suzuki, Masahiro and Iwasawa, Yusuke and Matsuo, Yutaka
"""

# mypy: ignore-errors
# flake8: noqa
# mypy: allow-untyped-decorators
# mypy: allow-untyped-defs
from typing import Callable, List, Optional, Tuple, Union, cast

import torch
from torch import Tensor
from torch.optim.optimizer import (  # DeviceDict,; _capturable_doc,; _differentiable_doc,; _foreach_doc,; _fused_doc,; _maximize_doc,; _stack_if_compiling,
    DeviceDict,
    Optimizer,
    ParamsT,
    _capturable_doc,
    _default_to_fused_or_foreach,
    _device_dtype_check_for_fused,
    _differentiable_doc,
    _disable_dynamo_if_unsupported,
    _foreach_doc,
    _fused_doc,
    _get_capturable_supported_devices,
    _get_scalar_dtype,
    _get_value,
    _maximize_doc,
    _stack_if_compiling,
    _use_grad_for_differentiable,
    _view_as_real,
)

__all__ = ["ADOPT", "adopt"]


class ADOPT(Optimizer):
    def __init__(
        self,
        params: ParamsT,
        lr: Union[float, Tensor] = 1e-3,
        betas: Tuple[float, float] = (0.9, 0.9999),
        eps: float = 1e-6,
        clip_lambda: Optional[Callable[[int], float]] = lambda step: step**0.25,
        weight_decay: float = 0.0,
        decouple: bool = False,
        *,
        foreach: Optional[bool] = None,
        maximize: bool = False,
        capturable: bool = False,
        differentiable: bool = False,
        fused: Optional[bool] = None,
    ):
        if isinstance(lr, Tensor):
            if foreach and not capturable:
                raise ValueError(
                    "lr as a Tensor is not supported for capturable=False and foreach=True"
                )
            if lr.numel() != 1:
                raise ValueError("Tensor lr must be 1-element")
        if not 0.0 <= lr:
            raise ValueError(f"Invalid learning rate: {lr}")
        if not 0.0 <= eps:
            raise ValueError(f"Invalid epsilon value: {eps}")
        if not 0.0 <= betas[0] < 1.0:
            raise ValueError(f"Invalid beta parameter at index 0: {betas[0]}")
        if not 0.0 <= betas[1] < 1.0:
            raise ValueError(f"Invalid beta parameter at index 1: {betas[1]}")
        if not 0.0 <= weight_decay:
            raise ValueError(f"Invalid weight_decay value: {weight_decay}")

        self.clip_lambda = clip_lambda

        defaults = dict(
            lr=lr,
            betas=betas,
            eps=eps,
            weight_decay=weight_decay,
            decouple=decouple,
            maximize=maximize,
            foreach=foreach,
            capturable=capturable,
            differentiable=differentiable,
            fused=fused,
        )
        super().__init__(params, defaults)

        if fused:
            # TODO: support fused
            raise RuntimeError("`fused` is not currently supported")

            if differentiable:
                raise RuntimeError("`fused` does not support `differentiable`")
            self._step_supports_amp_scaling = True
            # TODO(crcrpar): [low prec params & their higher prec copy]
            # Support AMP with FP16/BF16 model params which would need
            # higher prec copy of params to do update math in higher prec to
            # alleviate the loss of information.
            if foreach:
                raise RuntimeError("`fused` and `foreach` cannot be `True` together.")

    def __setstate__(self, state):
        super().__setstate__(state)
        for group in self.param_groups:
            group.setdefault("maximize", False)
            group.setdefault("foreach", None)
            group.setdefault("capturable", False)
            group.setdefault("differentiable", False)
            fused = group.setdefault("fused", None)
            for p in group["params"]:
                p_state = self.state.get(p, [])
                if len(p_state) != 0 and not torch.is_tensor(p_state["step"]):
                    step_val = float(p_state["step"])
                    p_state["step"] = (
                        torch.tensor(
                            step_val,
                            dtype=_get_scalar_dtype(is_fused=fused),
                            device=p.device,
                        )
                        if group["capturable"] or group["fused"]
                        else torch.tensor(step_val, dtype=_get_scalar_dtype())
                    )

    def _init_group(
        self,
        group,
        params_with_grad,
        grads,
        exp_avgs,
        exp_avg_sqs,
        state_steps,
    ):
        has_complex = False
        for p in group["params"]:
            if p.grad is not None:
                has_complex |= torch.is_complex(p)
                params_with_grad.append(p)
                if p.grad.is_sparse:
                    raise RuntimeError("ADOPT does not support sparse gradients")
                grads.append(p.grad)

                state = self.state[p]
                # Lazy state initialization
                if len(state) == 0:
                    if group["fused"]:
                        _device_dtype_check_for_fused(p)
                    # note(crcrpar): [special device hosting for step]
                    # Deliberately host `step` on CPU if both capturable and fused are off.
                    # This is because kernel launches are costly on CUDA and XLA.
                    state["step"] = (
                        torch.zeros(
                            (),
                            dtype=_get_scalar_dtype(is_fused=group["fused"]),
                            device=p.device,
                        )
                        if group["capturable"] or group["fused"]
                        else torch.tensor(0.0, dtype=_get_scalar_dtype())
                    )
                    # Exponential moving average of gradient values
                    state["exp_avg"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )
                    # Exponential moving average of squared gradient values
                    state["exp_avg_sq"] = torch.zeros_like(
                        p, memory_format=torch.preserve_format
                    )

                exp_avgs.append(state["exp_avg"])
                exp_avg_sqs.append(state["exp_avg_sq"])

                if group["differentiable"] and state["step"].requires_grad:
                    raise RuntimeError(
                        "`requires_grad` is not supported for `step` in differentiable mode"
                    )

                # Foreach without capturable does not support a tensor lr
                if (
                    group["foreach"]
                    and torch.is_tensor(group["lr"])
                    and not group["capturable"]
                ):
                    raise RuntimeError(
                        "lr as a Tensor is not supported for capturable=False and foreach=True"
                    )

                state_steps.append(state["step"])
        return has_complex

    @_use_grad_for_differentiable
    def step(self, closure=None):
        """Perform a single optimization step.

        Args:
            closure (Callable, optional): A closure that reevaluates the model
                and returns the loss.
        """
        self._cuda_graph_capture_health_check()

        loss = None
        if closure is not None:
            with torch.enable_grad():
                loss = closure()

        for group in self.param_groups:
            params_with_grad: List[Tensor] = []
            grads: List[Tensor] = []
            exp_avgs: List[Tensor] = []
            exp_avg_sqs: List[Tensor] = []
            state_steps: List[Tensor] = []
            beta1, beta2 = group["betas"]

            has_complex = self._init_group(
                group,
                params_with_grad,
                grads,
                exp_avgs,
                exp_avg_sqs,
                state_steps,
            )

            adopt(
                params_with_grad,
                grads,
                exp_avgs,
                exp_avg_sqs,
                state_steps,
                has_complex=has_complex,
                beta1=beta1,
                beta2=beta2,
                lr=group["lr"],
                clip_lambda=self.clip_lambda,
                weight_decay=group["weight_decay"],
                decouple=group["decouple"],
                eps=group["eps"],
                maximize=group["maximize"],
                foreach=group["foreach"],
                capturable=group["capturable"],
                differentiable=group["differentiable"],
                fused=group["fused"],
                grad_scale=getattr(self, "grad_scale", None),
                found_inf=getattr(self, "found_inf", None),
            )

        return loss


def _single_tensor_adopt(
    params: List[Tensor],
    grads: List[Tensor],
    exp_avgs: List[Tensor],
    exp_avg_sqs: List[Tensor],
    state_steps: List[Tensor],
    grad_scale: Optional[Tensor],
    found_inf: Optional[Tensor],
    *,
    has_complex: bool,
    beta1: float,
    beta2: float,
    lr: Union[float, Tensor],
    clip_lambda: Optional[Callable[[int], float]],
    weight_decay: float,
    decouple: bool,
    eps: float,
    maximize: bool,
    capturable: bool,
    differentiable: bool,
):
    assert grad_scale is None and found_inf is None

    if torch.jit.is_scripting():
        # this assert is due to JIT being dumb and not realizing that the ops below
        # have overloads to handle both float and Tensor lrs, so we just assert it's
        # a float since most people using JIT are using floats
        assert isinstance(lr, float)

    for i, param in enumerate(params):
        grad = grads[i] if not maximize else -grads[i]
        exp_avg = exp_avgs[i]
        exp_avg_sq = exp_avg_sqs[i]
        step_t = state_steps[i]

        # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable]
        if not torch._utils.is_compiling() and capturable:
            capturable_supported_devices = _get_capturable_supported_devices()
            assert (
                param.device.type == step_t.device.type
                and param.device.type in capturable_supported_devices
            ), (
                f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}."
            )

        step = step_t if capturable or differentiable else _get_value(step_t)

        if weight_decay != 0 and not decouple:
            grad = grad.add(param, alpha=weight_decay)

        if torch.is_complex(param):
            grad = torch.view_as_real(grad)
            if exp_avg is not None:
                exp_avg = torch.view_as_real(exp_avg)
            if exp_avg_sq is not None:
                exp_avg_sq = torch.view_as_real(exp_avg_sq)
            param = torch.view_as_real(param)

        if step == 0:
            exp_avg_sq.addcmul_(grad, grad.conj())
            # update step
            step_t += 1
            continue

        if weight_decay != 0 and decouple:
            param.add_(param, alpha=-lr * weight_decay)

        denom = torch.clamp(exp_avg_sq.sqrt(), eps)
        normed_grad = grad.div(denom)
        if clip_lambda is not None:
            clip = clip_lambda(step)
            normed_grad.clamp_(-clip, clip)

        exp_avg.lerp_(normed_grad, 1 - beta1)

        param.add_(exp_avg, alpha=-lr)
        exp_avg_sq.mul_(beta2).addcmul_(grad, grad.conj(), value=1 - beta2)

        # update step
        step_t += 1


def _multi_tensor_adopt(
    params: List[Tensor],
    grads: List[Tensor],
    exp_avgs: List[Tensor],
    exp_avg_sqs: List[Tensor],
    state_steps: List[Tensor],
    grad_scale: Optional[Tensor],
    found_inf: Optional[Tensor],
    *,
    has_complex: bool,
    beta1: float,
    beta2: float,
    lr: Union[float, Tensor],
    clip_lambda: Optional[Callable[[int], float]],
    weight_decay: float,
    decouple: bool,
    eps: float,
    maximize: bool,
    capturable: bool,
    differentiable: bool,
):
    if len(params) == 0:
        return

    if isinstance(lr, Tensor) and not capturable:
        raise RuntimeError(
            "lr as a Tensor is not supported for capturable=False and foreach=True"
        )

    # If compiling, the compiler will handle cudagraph checks, see note [torch.compile x capturable]
    if not torch._utils.is_compiling() and capturable:
        capturable_supported_devices = _get_capturable_supported_devices(
            supports_xla=False
        )
        assert all(
            p.device.type == step.device.type
            and p.device.type in capturable_supported_devices
            for p, step in zip(params, state_steps)
        ), (
            f"If capturable=True, params and state_steps must be on supported devices: {capturable_supported_devices}."
        )

    assert grad_scale is None and found_inf is None

    assert not differentiable, "_foreach ops don't support autograd"

    grouped_tensors = Optimizer._group_tensors_by_device_and_dtype(
        [params, grads, exp_avgs, exp_avg_sqs, state_steps]  # type: ignore[list-item]
    )
    for (
        device_params_,
        device_grads_,
        device_exp_avgs_,
        device_exp_avg_sqs_,
        device_state_steps_,
    ), _ in grouped_tensors.values():
        device_params = cast(List[Tensor], device_params_)
        device_grads = cast(List[Tensor], device_grads_)
        device_exp_avgs = cast(List[Tensor], device_exp_avgs_)
        device_exp_avg_sqs = cast(List[Tensor], device_exp_avg_sqs_)
        device_state_steps = cast(List[Tensor], device_state_steps_)

        # Handle complex parameters
        if has_complex:
            _view_as_real(
                device_params, device_grads, device_exp_avgs, device_exp_avg_sqs
            )

        if maximize:
            device_grads = torch._foreach_neg(device_grads)  # type: ignore[assignment]

        if weight_decay != 0 and not decouple:
            # Re-use the intermediate memory (device_grads) already allocated for maximize
            if maximize:
                torch._foreach_add_(device_grads, device_params, alpha=weight_decay)
            else:
                device_grads = torch._foreach_add(  # type: ignore[assignment]
                    device_grads, device_params, alpha=weight_decay
                )

        if device_state_steps[0] == 0:
            torch._foreach_addcmul_(device_exp_avg_sqs, device_grads, device_grads)

            # Update steps
            # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over
            # and over. 1 will then be wrapped into a Tensor over and over again, which is slower than if we just
            # wrapped it once now. The alpha is required to assure we go to the right overload.
            if not torch._utils.is_compiling() and device_state_steps[0].is_cpu:
                torch._foreach_add_(
                    device_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0
                )
            else:
                torch._foreach_add_(device_state_steps, 1)

            continue

        if weight_decay != 0 and decouple:
            torch._foreach_add_(device_params, device_params, alpha=-lr * weight_decay)

        exp_avg_sq_sqrt = torch._foreach_sqrt(device_exp_avg_sqs)
        torch._foreach_maximum_(exp_avg_sq_sqrt, eps)

        normed_grad = torch._foreach_div(device_grads, exp_avg_sq_sqrt)
        if clip_lambda is not None:
            clip = clip_lambda(device_state_steps[0])
            torch._foreach_maximum_(normed_grad, -clip)
            torch._foreach_minimum_(normed_grad, clip)

        torch._foreach_lerp_(device_exp_avgs, normed_grad, 1 - beta1)

        torch._foreach_add_(device_params, device_exp_avgs, alpha=-lr)
        torch._foreach_mul_(device_exp_avg_sqs, beta2)
        torch._foreach_addcmul_(
            device_exp_avg_sqs, device_grads, device_grads, value=1 - beta2
        )

        # Update steps
        # If steps are on CPU, foreach will fall back to the slow path, which is a for-loop calling t.add(1) over
        # and over. 1 will then be wrapped into a Tensor over and over again, which is slower than if we just
        # wrapped it once now. The alpha is required to assure we go to the right overload.
        if not torch._utils.is_compiling() and device_state_steps[0].is_cpu:
            torch._foreach_add_(
                device_state_steps, torch.tensor(1.0, device="cpu"), alpha=1.0
            )
        else:
            torch._foreach_add_(device_state_steps, 1)


@_disable_dynamo_if_unsupported(single_tensor_fn=_single_tensor_adopt)
def adopt(
    params: List[Tensor],
    grads: List[Tensor],
    exp_avgs: List[Tensor],
    exp_avg_sqs: List[Tensor],
    state_steps: List[Tensor],
    # kwonly args with defaults are not supported by functions compiled with torchscript issue #70627
    # setting this as kwarg for now as functional API is compiled by torch/distributed/optim
    foreach: Optional[bool] = None,
    capturable: bool = False,
    differentiable: bool = False,
    fused: Optional[bool] = None,
    grad_scale: Optional[Tensor] = None,
    found_inf: Optional[Tensor] = None,
    has_complex: bool = False,
    *,
    beta1: float,
    beta2: float,
    lr: Union[float, Tensor],
    clip_lambda: Optional[Callable[[int], float]],
    weight_decay: float,
    decouple: bool,
    eps: float,
    maximize: bool,
):
    r"""Functional API that performs ADOPT algorithm computation."""
    # Respect when the user inputs False/True for foreach or fused. We only want to change
    # the default when neither have been user-specified. Note that we default to foreach
    # and pass False to use_fused. This is not a mistake--we want to give the fused impl
    # bake-in time before making it the default, even if it is typically faster.
    if fused is None and foreach is None:
        _, foreach = _default_to_fused_or_foreach(
            params, differentiable, use_fused=False
        )
        # Do not flip on foreach for the unsupported case where lr is a Tensor and capturable=False.
        if foreach and isinstance(lr, Tensor) and not capturable:
            foreach = False
    if fused is None:
        fused = False
    if foreach is None:
        foreach = False

    # this check is slow during compilation, so we skip it
    # if it's strictly needed we can add this check back in dynamo
    if not torch._utils.is_compiling() and not all(
        isinstance(t, torch.Tensor) for t in state_steps
    ):
        raise RuntimeError(
            "API has changed, `state_steps` argument must contain a list of singleton tensors"
        )

    if foreach and torch.jit.is_scripting():
        raise RuntimeError("torch.jit.script not supported with foreach optimizers")
    if fused and torch.jit.is_scripting():
        raise RuntimeError("torch.jit.script not supported with fused optimizers")

    # if fused and not torch.jit.is_scripting():
    #     func = _fused_adopt
    # elif foreach and not torch.jit.is_scripting():
    if foreach and not torch.jit.is_scripting():
        func = _multi_tensor_adopt
    else:
        func = _single_tensor_adopt

    func(
        params,
        grads,
        exp_avgs,
        exp_avg_sqs,
        state_steps,
        has_complex=has_complex,
        beta1=beta1,
        beta2=beta2,
        lr=lr,
        clip_lambda=clip_lambda,
        weight_decay=weight_decay,
        decouple=decouple,
        eps=eps,
        maximize=maximize,
        capturable=capturable,
        differentiable=differentiable,
        grad_scale=grad_scale,
        found_inf=found_inf,
    )


================================================
FILE: src/axolotl/utils/quantization.py
================================================
"""
Utilities for quantization including QAT and PTQ using torchao.
"""

import torch
from packaging import version
from torchao.core.config import AOBaseConfig
from torchao.prototype.qat import MXFakeQuantizeConfig
from torchao.quantization import quantize_
from torchao.quantization.qat import (
    QATConfig,
)
from torchao.quantization.quant_api import (
    Float8DynamicActivationFloat8WeightConfig,
    Float8DynamicActivationInt4WeightConfig,
    Int8DynamicActivationInt4WeightConfig,
)

from axolotl.utils.schemas.enums import TorchAOQuantDType

quantization_config_to_str = {
    Int8DynamicActivationInt4WeightConfig: "int8int4",
    Float8DynamicActivationFloat8WeightConfig: "fp8fp8",
    Float8DynamicActivationInt4WeightConfig: "fp8int4",
}

if version.parse(torch.__version__) >= version.parse("2.8.0"):
    try:
        from torchao.prototype.mx_formats import NVFP4InferenceConfig

        quantization_config_to_str[NVFP4InferenceConfig] = "nvfp4"
    except (ImportError, RuntimeError):
        pass

    # int4 weight config imports will fail on machines with fbgemm-gpu installed
    # without a CUDA runtime available so we do this safely
    try:
        from torchao.quantization.quant_api import Int4WeightOnlyConfig

        quantization_config_to_str[Int4WeightOnlyConfig] = "int4"
    except (ImportError, RuntimeError):
        pass

    try:
        from torchao.prototype.qat import MXFakeQuantizeConfig

        quantization_config_to_str[MXFakeQuantizeConfig] = "mxfp4"
    except ImportError:
        pass


def get_quantization_config(
    weight_dtype: TorchAOQuantDType,
    activation_dtype: TorchAOQuantDType | None = None,
    group_size: int | None = None,
) -> AOBaseConfig:
    """
    This function is used to build a post-training quantization config.

    Args:
        weight_dtype: The dtype to use for weight quantization.
        activation_dtype: The dtype to use for activation quantization.
        group_size: The group size to use for weight quantization.

    Returns:
        The post-training quantization config.

    Raises:
        ValueError: If the activation dtype is not specified and the weight dtype is not int8 or int4,
            or if the group size is not specified for int8 or int4 weight only quantization.
    """
    if activation_dtype is None:
        if weight_dtype == TorchAOQuantDType.int8:
            raise ValueError("Int8WeightOnlyConfig is not supported by torchao QAT.")
        if weight_dtype == TorchAOQuantDType.int4:
            from torchao.quantization.quant_api import Int4WeightOnlyConfig

            if group_size is not None:
                return Int4WeightOnlyConfig(group_size=group_size, version=2)
            else:
                return Int4WeightOnlyConfig(version=2)
    if (
        activation_dtype == TorchAOQuantDType.int4
        and weight_dtype == TorchAOQuantDType.int4
    ):
        raise ValueError(
            "Int4DynamicActivationInt4WeightConfig is not supported by torchao QAT."
        )
    if (
        activation_dtype == TorchAOQuantDType.int8
        and weight_dtype == TorchAOQuantDType.int8
    ):
        raise ValueError(
            "Int8DynamicActivationInt8WeightConfig is not supported by torchao QAT."
        )
    if (
        activation_dtype == TorchAOQuantDType.int8
        and weight_dtype == TorchAOQuantDType.int4
    ):
        if group_size is not None:
            return Int8DynamicActivationInt4WeightConfig(group_size=group_size)
        else:
            return Int8DynamicActivationInt4WeightConfig()
    if (
        activation_dtype == TorchAOQuantDType.float8_e4m3fn
        and weight_dtype == TorchAOQuantDType.float8_e4m3fn
    ):
        return Float8DynamicActivationFloat8WeightConfig()
    if (
        activation_dtype == TorchAOQuantDType.float8_e4m3fn
        and weight_dtype == TorchAOQuantDType.int4
    ):
        return Float8DynamicActivationInt4WeightConfig()
    if weight_dtype == TorchAOQuantDType.nvfp4:
        from torchao.prototype.mx_formats import NVFP4InferenceConfig

        if group_size is not None and group_size != 16:
            raise ValueError("NVFP4 quantization must use a group_size of 16")
        return NVFP4InferenceConfig()

    if weight_dtype == TorchAOQuantDType.mxfp4:
        from torchao.prototype.qat import MXFakeQuantizeConfig

        # MXFP4 uses block_size=32 by default (vs NVFP4's 16)
        block_size = group_size if group_size is not None else 32
        if block_size != 32:
            raise ValueError(
                "MXFP4 quantization must use a block_size (group_size) of 32"
            )

        return MXFakeQuantizeConfig(dtype=torch.float4_e2m1fn_x2, block_size=block_size)

    raise ValueError(
        f"Invalid activation/weight dtype combination: {activation_dtype}/{weight_dtype}"
    )


def quantize_model(
    model,
    weight_dtype: TorchAOQuantDType,
    group_size: int | None = None,
    activation_dtype: TorchAOQuantDType | None = None,
    quantize_embedding: bool | None = None,
):
    """
    This function is used to quantize a model.

    Args:
        model: The model to quantize.
        weight_dtype: The dtype to use for weight quantization.
        group_size: The group size to use for weight quantization.
        activation_dtype: The dtype to use for activation quantization.
        quantize_embedding: Whether to quantize the model's embedding weights.

    """
    linear_ptq_config = get_quantization_config(
        weight_dtype=weight_dtype,
        activation_dtype=activation_dtype,
        group_size=group_size,
    )
    quantize_(model, linear_ptq_config)
    if quantize_embedding:
        # activation fake quantization is not supported for embedding layers
        embedding_quantize_config = get_quantization_config(
            weight_dtype=weight_dtype,
            activation_dtype=None,
            group_size=group_size,
        )
        quantize_(
            model,
            embedding_quantize_config,
            filter_fn=lambda m, _: isinstance(m, torch.nn.Embedding),
        )


def prepare_model_for_qat(
    model,
    weight_dtype: TorchAOQuantDType,
    group_size: int | None = None,
    activation_dtype: TorchAOQuantDType | None = None,
    quantize_embedding: bool = False,
):
    """
    This function is used to prepare a model for QAT by swapping the model's linear
    layers with fake quantized linear layers, and optionally the embedding weights with
    fake quantized embedding weights.

    Args:
        model: The model to quantize.
        weight_dtype: The dtype to use for weight quantization.
        group_size: The group size to use for weight quantization.
        activation_dtype: The dtype to use for activation quantization.
        quantize_embedding: Whether to quantize the model's embedding weights.

    Raises:
        ValueError: If the activation/weight dtype combination is invalid.
    """
    base_config = get_quantization_config(
        weight_dtype=weight_dtype,
        activation_dtype=activation_dtype,
        group_size=group_size,
    )
    if isinstance(base_config, MXFakeQuantizeConfig):
        qat_config = QATConfig(
            activation_config=base_config,
            weight_config=base_config,
        )
    else:
        qat_config = QATConfig(base_config)
    quantize_(model, qat_config)
    if quantize_embedding:
        # activation fake quantization is not supported for embedding layers
        embedding_base_config = get_quantization_config(
            weight_dtype=weight_dtype,
            activation_dtype=None,
            group_size=group_size,
        )
        if isinstance(embedding_base_config, MXFakeQuantizeConfig):
            embedding_qat_config = QATConfig(
                weight_config=embedding_base_config,
            )
        else:
            embedding_qat_config = QATConfig(embedding_base_config)
        quantize_(
            model,
            embedding_qat_config,
            filter_fn=lambda m, _: isinstance(m, torch.nn.Embedding),
        )


def convert_qat_model(
    model,
    quantize_embedding: bool = False,
):
    """
    This function converts a QAT model which has fake quantized layers back to the original model.
    """
    config = QATConfig(step="convert")
    quantize_(model, config)
    if quantize_embedding:
        quantize_(
            model,
            config,
            filter_fn=lambda m, _: isinstance(m, torch.nn.Embedding),
        )


================================================
FILE: src/axolotl/utils/samplers/__init__.py
================================================
"""
axolotl samplers module
"""

from .multipack import MultipackBatchSampler  # noqa: F401
from .utils import get_dataset_lengths  # noqa: F401


================================================
FILE: src/axolotl/utils/samplers/multipack.py
================================================
"""
Multipack Batch Sampler - An efficient batch sampler for packing variable-length sequences
into fixed-capacity batches to optimize memory usage and training throughput.
"""

import gc
import math
import os
import time
from concurrent.futures import ProcessPoolExecutor
from multiprocessing import cpu_count, get_context
from typing import Iterable, Iterator, Union

import numba
import numpy as np
from torch.utils.data import BatchSampler, Sampler, SequentialSampler

from axolotl.utils.distributed import reduce_and_broadcast
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


@numba.njit
def ffd_check(sequence_lengths: np.ndarray, bin_capacity: int, num_bins: int) -> bool:
    """First-fit-decreasing bin packing algorithm check.

    Checks if sequences with the given lengths could fit in the specified number of
    bins.

    Args:
        sequence_lengths: Array of sequence lengths.
        bin_capacity: Maximum capacity of each bin.
        num_bins: Number of bins available.

    Returns:
        `True` if all sequences can be packed, `False` otherwise.
    """
    # Sort sequence lengths in descending order for optimal packing
    sequence_lengths = np.sort(sequence_lengths)[::-1]
    # Initialize all bins with full capacity
    bins = np.full((num_bins,), bin_capacity, dtype=sequence_lengths.dtype)

    # Try to place each sequence in the first bin it fits
    for size in sequence_lengths:
        not_found = True
        for idx in range(num_bins):
            if bins[idx] >= size:
                bins[idx] -= size
                not_found = False
                break

        # If no bin could fit this sequence, packing failed
        if not_found:
            return False

    return True


@numba.njit
def pack_group(
    sequence_lengths: np.ndarray,
    group_offset: int,
    bin_capacity: int,
    max_bins: int,
    bin_size: int,
    safe_mode: bool = True,
) -> list[list[int]]:
    """Pack a group of sequences into bins using First-Fit Decreasing algorithm.

    Args:
        sequence_lengths: Array of sequence lengths.
        group_offset: Offset to apply to indices when returning results.
        bin_capacity: Maximum capacity of each bin.
        max_bins: Maximum number of bins to use.
        bin_size: Maximum number of sequences per bin.
        safe_mode: If True, use a more conservative packing approach.

    Returns:
        List of bins, where each bin contains indices of sequences assigned to it.
    """
    bins_remaining_space: list = []  # Tracks remaining capacity in each bin
    bins_assigned_sequences: list = []  # Tracks sequence indices assigned to each bin

    for seq_id, size in enumerate(sequence_lengths):
        global_idx = seq_id + group_offset

        # Try to place sequence in existing bins
        add_new_bin = True
        for bin_idx, _ in enumerate(bins_remaining_space):
            if (
                bins_remaining_space[bin_idx] >= size
                and len(bins_assigned_sequences[bin_idx]) < bin_size
            ):
                bins_remaining_space[bin_idx] -= size
                bins_assigned_sequences[bin_idx].append(global_idx)
                add_new_bin = False
                break

        # Create a new bin if needed and if we haven't reached the limit
        if add_new_bin:
            if len(bins_remaining_space) >= max_bins and safe_mode:
                # In safe mode, skip items that would exceed max_bins
                continue
            bins_remaining_space.append(bin_capacity - size)
            bins_assigned_sequences.append([global_idx])

            # Safety check to avoid infinite bins
            if len(bins_remaining_space) > len(sequence_lengths):
                break

    return bins_assigned_sequences


def _process_group(
    args: tuple[np.ndarray, int, int, int, int, bool],
) -> list[list[int]]:
    """Standalone function for multiprocessing."""
    group_lengths, start_idx, bin_capacity, max_bins, bin_size, safe_mode = args
    return pack_group(
        group_lengths, start_idx, bin_capacity, max_bins, bin_size, safe_mode
    )


def pack_parallel(
    sequence_lengths: np.ndarray,
    bin_capacity: int,
    group_size: int,
    bin_size: int,
    num_processes: int | None = None,
    safe_mode: bool = True,
    mp_start_method: str | None = "fork",
) -> list[list[int]]:
    """Pack sequences into bins using parallel processing.

    Args:
        sequence_lengths: Array of sequence lengths.
        bin_capacity: Maximum capacity of each bin as total number of tokens.
        group_size: Number of sequences to process in each group.
        bin_size: Maximum number of bins to use.
        num_processes: Number of parallel processes to use.
        safe_mode: If True, use a more conservative packing approach.
        mp_start_method: Multiprocessing start method ('fork', 'spawn', 'forkserver').
                         'spawn' is often safer with Numba/PyTorch.
                         Set to None to use system default.
    Returns:
        List of bins, where each bin contains indices of sequences assigned to it.
    """
    num_items = len(sequence_lengths)
    if num_processes is None:
        num_processes = max(1, min(num_items // group_size, cpu_count(), 16))

    # Create tasks for parallel processing
    tasks = []
    for i in range(0, num_items, group_size):
        group_lengths = sequence_lengths[i : i + group_size]
        max_bins = len(group_lengths)  # Allow as many bins as items in the group
        tasks.append((group_lengths, i, bin_capacity, max_bins, bin_size, safe_mode))

    # Process groups in parallel
    all_bins = []

    mp_ctx = None
    if mp_start_method:
        try:
            mp_ctx = get_context(mp_start_method)
        except ValueError:
            LOG.warning(
                f"Failed to get multiprocessing context '{mp_start_method}'. "
                f"Falling back to default. Available: {get_context().get_all_start_methods()}"
            )
            mp_ctx = (
                None  # Fallback to default context if specified one is not available
            )

    if num_processes == 1:
        LOG.debug("Using single process for pack_parallel, running sequentially.")
        for task_args in tasks:
            group_bins = _process_group(task_args)
            all_bins.extend(group_bins)
    else:
        # Use ProcessPoolExecutor only if num_processes > 1
        # Pass mp_context if available
        with ProcessPoolExecutor(
            max_workers=num_processes, mp_context=mp_ctx
        ) as executor:
            for group_bins in executor.map(_process_group, tasks):
                all_bins.extend(group_bins)

    return all_bins


@numba.njit
def allocate_sequentially(
    sequence_lengths: np.ndarray, rank: int, bin_capacity: int, num_ranks: int
) -> tuple[list[list[int]], int, int]:
    """Sequential allocator that preserves example order.

    Args:
        sequence_lengths: The lengths of all examples.
        rank: The current rank (for distributed training).
        bin_capacity: The capacity of each bin (maximum sequence length).
        num_ranks: Number of ranks (processes / GPUs).

    Returns:
        rank_batches: List of batches for the current rank.
        total_tokens_used: Number of actual example tokens.
        total_token_slots: Maximum theoretical number of example tokens (number of bins
            * bin capacity).
    """
    result = []
    total_used = 0

    # First, do sequential packing into bins
    all_bins = []
    current_bin = [0 for i in range(0)]  # numba hint
    remaining_capacity = bin_capacity

    for idx, size in enumerate(sequence_lengths):
        if size <= remaining_capacity:
            # Example fits in current bin
            current_bin.append(idx)
            remaining_capacity -= size
            total_used += size
        else:
            # Example doesn't fit, start a new bin
            if current_bin:  # Add non-empty bin to all_bins
                all_bins.append(current_bin)
            current_bin = [idx]
            remaining_capacity = bin_capacity - size
            total_used += size

    # Add the last bin if not empty
    if current_bin:
        all_bins.append(current_bin)

    # Assign bins to ranks - each rank gets every n-th bin
    for bin_idx in range(rank, len(all_bins), num_ranks):
        result.append(all_bins[bin_idx])

    return result, total_used, len(all_bins) * bin_capacity


class MultipackBatchSampler(BatchSampler):
    """Batch sampler class for efficient packing of variable-length sequences

    This sampler packs sequences into fixed-capacity bins (batches) to maximize
    GPU memory utilization and training throughput by reducing padding.

    It supports both parallel packing (using FFD algorithm) and
    sequential packing (preserving original sequence order).
    """

    _batches: list[list[list[int]]] | None = None
    _len_across_ranks: int | None = None

    def __init__(
        self,
        sampler: Union[Sampler[int], Iterable[int]],
        batch_size: int,  # Number of bins per batch
        batch_max_len: int,  # Maximum sequence length (bin capacity)
        lengths: np.ndarray,  # Sequence lengths
        bin_size: int,  # The max number of samples that can be packed in a single bin
        packing_efficiency_estimate: float = 1.0,  # Initial efficiency estimate
        drop_last: bool = True,  # Whether to drop final batches (might be incomplete)
        num_count_samples: int = 4,  # Number of times to estimate batch count
        sequential: bool = False,  # Whether to use sequential packing
        group_size: int = 100_000,  # Size of groups for parallel packing
        num_processes: int | None = None,  # Number of processes for parallel packing
        safe_mode: bool = True,  # Conservative packing to prevent training instability
        mp_start_method: str = "fork",
        **kwargs,
    ):
        super().__init__(sampler, batch_size, drop_last)
        self.batch_size = batch_size
        self.batch_max_len = batch_max_len
        self.lengths = np.array(lengths, dtype=np.int32)
        self.packing_efficiency_estimate = packing_efficiency_estimate or 1.0
        self.sequential = sequential
        self.group_size = group_size
        self.bin_size = bin_size
        self.num_processes = num_processes
        self.safe_mode = safe_mode
        self.mp_start_method = mp_start_method

        assert isinstance(self.lengths, np.ndarray)

        self.epoch = 0

        # Efficiency statistics tracking
        self.total_tokens_used = 0
        self.total_token_slots = 0

        # The number of times to calculate batches to determine minimum packed dataset length
        world_size = int(os.environ.get("WORLD_SIZE", "1"))
        self.num_count_samples = (
            1 if world_size >= num_count_samples else num_count_samples
        )

        if self.sequential and not isinstance(sampler, SequentialSampler):
            LOG.warning(
                "using sequential sample packing with non-sequential sampler, did you want to also enable curriculum_sampling?"
            )

    def set_epoch(self, epoch: int):
        """Set the epoch number, used for reproducible shuffling across epochs"""
        self.epoch = epoch
        self._batches = None  # Invalidate batch cache

    def generate_batches(self, set_stats: bool = False) -> list[list[list[int]]]:
        """Generate packed batches for training.

        Args:
            set_stats: Whether to update efficiency statistics.

        Returns:
            List of batches, where each batch contains multiple bins, and each bin
                contains multiple sequence indices.
        """
        if self._batches is not None:
            return self._batches

        # Get indices from the sampler
        indices = [idx for idx in self.sampler]

        # Get lengths of the selected sequences
        lengths = self.lengths[indices]

        # Pack sequences into bins using either sequential or parallel packing
        if self.sequential:
            bins, total_used, total_slots = allocate_sequentially(
                lengths,
                rank=0,
                bin_capacity=self.batch_max_len,
                num_ranks=1,
            )
            # Map bin indices back to original indices
            bins = [[indices[b_idx] for b_idx in bin_indices] for bin_indices in bins]
        else:
            # Use parallel packing
            num_processes = self.num_processes or 1
            all_bins = pack_parallel(
                lengths,
                bin_capacity=self.batch_max_len,
                group_size=self.group_size,
                bin_size=self.bin_size or self.batch_max_len,
                num_processes=min(4, num_processes) if num_processes else 4,
                safe_mode=self.safe_mode,
                mp_start_method=self.mp_start_method,
            )

            # Map bin indices back to original indices
            bins = [
                [indices[b_idx] for b_idx in bin_indices] for bin_indices in all_bins
            ]

            # Calculate efficiency statistics
            total_used = lengths.sum()
            total_slots = len(all_bins) * self.batch_max_len
            del all_bins

        # Group bins into batches (each batch contains batch_size bins)
        batches = [
            bins[i : i + self.batch_size] for i in range(0, len(bins), self.batch_size)
        ]

        # Drop last batch if requested and it's incomplete
        if self.drop_last and len(batches[-1]) < self.batch_size:
            batches = batches[:-1]
            # Adjust total_slots if we dropped a batch
            if not self.sequential:
                total_slots -= (self.batch_size - len(batches[-1])) * self.batch_max_len

        # Update statistics if requested
        if set_stats:
            self.total_tokens_used += total_used
            self.total_token_slots += total_slots

        self._batches = batches
        gc.collect()
        return batches

    def __iter__(self) -> Iterator[list[list[int]]]:
        """Return an iterator over batches.

        The batches are truncated to match the minimum number of batches across all
        ranks to ensure distributed training balance.
        """
        batches = self.generate_batches(set_stats=True)
        if self._len_across_ranks:
            # Truncate batches to ensure all ranks have the same number of batches
            batches = batches[: self._len_across_ranks]
        return iter(batches)

    def efficiency(self) -> float:
        """Calculate the packing efficiency (ratio of tokens used to total token slots).
        Higher is better - 1.0 would mean perfect packing with no wasted space.
        """
        if self.total_token_slots == 0:
            self.generate_batches(set_stats=True)
        if self.total_token_slots == 0:
            return 0.0
        # Return a Python float instead of potentially a numpy float
        return float(self.total_tokens_used / self.total_token_slots)

    def gather_efficiency(self) -> float:
        """Gather and synchronize packing efficiency estimates across all distributed
        ranks.

        Returns:
            A conservative efficiency estimate based on the measurements.
        """

        def calc_sample_packing_eff_est(estimates: list[float]):
            LOG.debug(f"sample_packing_eff_est across ranks: {repr(estimates)}")
            # Use 99.7% of max observed efficiency as a safe estimate
            max_eff = max(float(eff) for eff in estimates)
            return math.floor(0.997 * max_eff)

        # Gather efficiency from all ranks and apply the calculation function
        sample_packing_actual_eff_all = reduce_and_broadcast(
            lambda: float(self.efficiency()),
            calc_sample_packing_eff_est,
        )

        # Quantize to 0.5% intervals for stability
        sample_packing_eff_est = (
            math.ceil(sample_packing_actual_eff_all * 200.0) / 200.0
        )
        return sample_packing_eff_est

    def gather_len_batches(self, num: int) -> int:
        """Gather and synchronize batch counts across all distributed ranks. Returns
        the minimum number of batches available on any rank.
        """

        def calc_min_len(estimates: list[int]) -> int:
            LOG.info(f"gather_len_batches: {repr(estimates)}")
            return math.floor(min(estimates))

        # Find minimum batch count across ranks to ensure balance
        min_len_batches = reduce_and_broadcast(lambda: num, calc_min_len)
        return min_len_batches

    def __len__(self) -> int:
        """Return the total number of batches that will be yielded by this sampler.

        This is calculated as the minimum number of batches available on any rank to
        ensure balanced distributed training.
        """
        if self._batches is None:
            self._batches = self.generate_batches(set_stats=True)

        if self._len_across_ranks is None:
            # Sample multiple times to get stable estimate
            _sampled_lens = []
            for _ in range(self.num_count_samples):
                self._batches = None  # Reset cached batches
                # log timer for generating batches
                start_time = time.time()
                _sampled_lens.append(len(self.generate_batches(set_stats=False)))
                LOG.debug(f"generate_batches time: {time.time() - start_time}")
            len_batches = min(_sampled_lens)

            # Gather minimum across all ranks
            if self._len_across_ranks is None:
                self._len_across_ranks = self.gather_len_batches(len_batches)
            else:
                self._len_across_ranks = min(
                    self._len_across_ranks, self.gather_len_batches(len_batches)
                )

        return self._len_across_ranks


================================================
FILE: src/axolotl/utils/samplers/utils.py
================================================
"""
helper util to calculate dataset lengths
"""

import numpy as np


def get_dataset_lengths(dataset, from_arrow=False):
    if "length" in dataset.column_names:
        lengths = np.array(dataset["length"])
    elif "position_ids" in dataset.column_names:
        position_ids = dataset["position_ids"]
        lengths = np.array([x[-1] + 1 for x in position_ids])
    else:
        if from_arrow:
            input_ids = dataset.data.column("input_ids")
            lengths = np.vectorize(len)(np.array(input_ids, dtype=object))
        else:
            input_ids = dataset["input_ids"]
            lengths = np.array([len(seq) for seq in input_ids])
    return lengths


================================================
FILE: src/axolotl/utils/schedulers.py
================================================
"""Module for custom LRScheduler class"""

import math
from functools import partial
from typing import Sequence

from torch import Tensor
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR, LRScheduler


class RexLR(LRScheduler):
    """
    Reflected Exponential (REX) learning rate scheduler.

    - Original implementation: https://github.com/IvanVassi/REX_LR
    - Original license: Apache 2.0
    - Based on: https://arxiv.org/abs/2107.04197

    Args:
        optimizer (torch.optim.Optimizer): The optimizer to schedule the learning rate for.
        max_lr (float): The maximum learning rate.
        min_lr (float): The minimum learning rate.
        total_steps (int): The total number of training steps.
        num_warmup_steps (int): The number of warmup steps.
        last_step (int): The index of last step.
    """

    def __init__(
        self, optimizer, max_lr, min_lr, total_steps=0, num_warmup_steps=0, last_step=0
    ):
        if min_lr > max_lr:
            raise ValueError(
                f'Value of "min_lr" should be less than value of "max_lr". Got min_lr={min_lr} and max_lr={max_lr}'
            )
        if num_warmup_steps > total_steps:
            raise ValueError(
                f"num_warmup_steps ({num_warmup_steps}) must be less than or equal to total_steps ({total_steps})."
            )

        self.min_lr = min_lr
        self.max_lr = max_lr
        self.total_steps = total_steps
        self.num_warmup_steps = num_warmup_steps
        self.last_step = max(last_step - 1, 0)

        # Ensure each parameter group has an "initial_lr" key to avoid issues when resuming.
        for group in optimizer.param_groups:
            initial_lr = group["lr"]
            if isinstance(initial_lr, Tensor):
                initial_lr = initial_lr.clone()
            group.setdefault("initial_lr", initial_lr)
        # Pass self.last_step as last_epoch to the parent.
        super().__init__(optimizer, last_epoch=self.last_step)

    @property
    def last_step(self):
        return self.last_epoch

    @last_step.setter
    def last_step(self, value):
        self.last_epoch = value

    def get_lr(self):
        # Warmup phase: if defined, increase lr linearly from 0 to max_lr.
        if 1 <= self.last_step <= self.num_warmup_steps:
            return [
                base_lr * self.last_step / self.num_warmup_steps
                for base_lr in self.base_lrs
            ]

        # Post-warmup phase: adjust step relative to the end of warmup.
        step_after = self.last_step - self.num_warmup_steps
        remaining_steps = self.total_steps - self.num_warmup_steps

        # Avoid LR spiking
        if step_after >= remaining_steps or step_after == -1 or remaining_steps <= 0:
            return [self.min_lr for _ in self.base_lrs]

        mod_iter = step_after % remaining_steps
        z = (remaining_steps - mod_iter) / remaining_steps
        rex_factor = self.min_lr / self.max_lr + (1.0 - self.min_lr / self.max_lr) * (
            z / (0.1 + 0.9 * z)
        )
        return [base_lr * rex_factor for base_lr in self.base_lrs]


class InterpolatingLogScheduler(LRScheduler):
    """
    A scheduler that interpolates learning rates in a logarithmic fashion
    """

    def __init__(self, optimizer, num_steps, min_lr, max_lr, last_epoch=-1):
        """A scheduler that interpolates learning rates in a logarithmic fashion

        Args:
        - optimizer: pytorch optimizer
        - num_steps: int, the number of steps over which to increase from the min_lr to the max_lr
        - min_lr: float, the minimum learning rate
        - max_lr: float, the maximum learning rate

        Usage:
            fc = nn.Linear(1,1)
            optimizer = optim.Adam(fc.parameters())
            lr_scheduler = InterpolatingLogScheduler(optimizer, num_steps=400, min_lr=1e-6, max_lr=1e-4)
        """
        self.num_steps = num_steps
        self.min_lr = min_lr
        self.max_lr = max_lr
        self.q = (max_lr / min_lr) ** (1 / (num_steps - 1))
        super().__init__(optimizer, last_epoch)

    def get_lr(self):
        if self.last_epoch <= 0:
            lrs = [self.min_lr for base_lr in self.base_lrs]
        elif self.last_epoch < self.num_steps:
            lrs = [
                self.min_lr * (self.q ** (self.last_epoch - 1))
                for base_lr in self.base_lrs
            ]
        else:
            lrs = [self.max_lr for base_lr in self.base_lrs]

        return lrs


def _get_cosine_schedule_with_quadratic_warmup_lr_lambda(
    current_step: int,
    *,
    num_warmup_steps: int,
    num_training_steps: int,
    num_cycles: float,
):
    if current_step < num_warmup_steps:
        return (float(current_step) / float(max(1, num_warmup_steps))) ** 2
    progress = float(current_step - num_warmup_steps) / float(
        max(1, num_training_steps - num_warmup_steps)
    )
    return max(
        0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress))
    )


def get_cosine_schedule_with_quadratic_warmup(
    optimizer: Optimizer,
    num_warmup_steps: int,
    num_training_steps: int,
    num_cycles: float = 0.5,
    last_epoch: int = -1,
):
    """
    Create a schedule with a learning rate that decreases following the values of the cosine function between the
    initial lr set in the optimizer to 0, after a warmup period during which it increases linearly between 0 and the
    initial lr set in the optimizer.

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
        num_cycles (`float`, *optional*, defaults to 0.5):
            The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
            following a half-cosine).
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    lr_lambda = partial(
        _get_cosine_schedule_with_quadratic_warmup_lr_lambda,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
        num_cycles=num_cycles,
    )
    return LambdaLR(optimizer, lr_lambda, last_epoch)


def _get_cosine_schedule_with_min_lr_lambda(
    current_step: int,
    *,
    num_warmup_steps: int,
    num_training_steps: int,
    min_lr_ratio: float,
):
    # Warm up
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))

    # Cosine learning rate decay
    progress = float(current_step - num_warmup_steps) / float(
        max(1, num_training_steps - num_warmup_steps)
    )
    scaling = 0.5 * (1.0 + math.cos(math.pi * progress))
    return (1 - min_lr_ratio) * scaling + min_lr_ratio


def get_cosine_schedule_with_min_lr(
    optimizer: Optimizer,
    num_warmup_steps: int,
    num_training_steps: int,
    min_lr_ratio: float = 0.0,
):
    """
    Create a learning rate schedule which has:
        - linear warmup from 0 -> `max_lr` over `num_warmup_steps`
        - cosine learning rate annealing from `max_lr` -> `min_lr` over `num_training_steps`
    """

    lr_lambda = partial(
        _get_cosine_schedule_with_min_lr_lambda,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
        min_lr_ratio=min_lr_ratio,
    )
    return LambdaLR(optimizer, lr_lambda)


def _get_cosine_schedule_with_warmup_decay_constant_lr_lambda(
    current_step: int,
    *,
    num_warmup_steps: int,
    num_training_steps: int,
    constant_lr_ratio: float,
    min_lr_ratio: float,
    num_cycles: float,
):
    if current_step < num_warmup_steps:
        return float(current_step) / float(max(1, num_warmup_steps))

    num_constant_steps = int(num_training_steps * constant_lr_ratio)
    current_step = min(current_step, num_constant_steps)

    progress = float(current_step - num_warmup_steps) / float(
        max(1, num_constant_steps - num_warmup_steps)
    )

    return (
        max(
            0,
            (1 - min_lr_ratio)
            * 0.5
            * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)),
        )
        + min_lr_ratio
    )


def get_cosine_schedule_with_warmup_decay_constant(
    optimizer: Optimizer,
    num_warmup_steps: int,
    num_training_steps: int,
    constant_lr_ratio: float,
    min_lr_ratio: float,
    num_cycles: float = 0.5,
    last_epoch: int = -1,
):
    """
    Implementation of Continual Pre-Training of Large Language Models: How to (re)warm your model? (https://arxiv.org/pdf/2308.04014.pdf)
    Create a schedule with a learning rate that decreases following the values of the cosine function between the
    initial lr set in the optimizer to min_lr_ratio until num_training_steps * constant_lr_ratio, after constant_rate returns constant value of min_rate
    , after a warmup period during which it increases linearly between 0 and the initial lr set in the optimizer.

    Args:
        optimizer ([`~torch.optim.Optimizer`]):
            The optimizer for which to schedule the learning rate.
        num_warmup_steps (`int`):
            The number of steps for the warmup phase.
        num_training_steps (`int`):
            The total number of training steps.
        constant_lr_ratio: (`float`):
            The ratio of num_training_steps to decrease by cosine function.
        min_lr_ratio: (`float):
            The ratio of maximum learning rate for cosine function to decay to minimum learning rate.
        num_cycles (`float`, *optional*, defaults to 0.5):
            The number of waves in the cosine schedule (the defaults is to just decrease from the max value to 0
            following a half-cosine).
        last_epoch (`int`, *optional*, defaults to -1):
            The index of the last epoch when resuming training.

    Return:
        `torch.optim.lr_scheduler.LambdaLR` with the appropriate schedule.
    """

    lr_lambda = partial(
        _get_cosine_schedule_with_warmup_decay_constant_lr_lambda,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_training_steps,
        constant_lr_ratio=constant_lr_ratio,
        min_lr_ratio=min_lr_ratio,
        num_cycles=num_cycles,
    )
    return LambdaLR(optimizer, lr_lambda, last_epoch)


class JaggedLRRestartScheduler(LRScheduler):
    """Wraps another scheduler to apply per-lora-restart learning rate warmups."""

    def __init__(
        self,
        optimizer: Optimizer,
        inner_schedule: LRScheduler,
        jagged_restart_steps: int,
        jagged_restart_warmup_steps: int,
        jagged_restart_anneal_steps: int = 1,
        min_lr_scale: float = 0.001,
    ) -> None:
        self.inner_schedule = inner_schedule
        self.restarts_steps = jagged_restart_steps
        self.warmup_steps = jagged_restart_warmup_steps
        self.anneal_steps = jagged_restart_anneal_steps
        self.min_lr_scale = min_lr_scale
        super().__init__(optimizer, inner_schedule.last_epoch)

    def get_lr(self) -> float | Sequence[float]:
        self.inner_schedule.last_epoch = self.last_epoch

        original = self.inner_schedule.get_lr()
        step = self.last_epoch

        if step < self.restarts_steps - self.anneal_steps:
            scale = 1
        else:
            per_restart_progress = step % self.restarts_steps
            if per_restart_progress < self.warmup_steps:
                cycle_t = min(1.0, (per_restart_progress) / self.warmup_steps)
            elif per_restart_progress > (self.restarts_steps - self.anneal_steps):
                cycle_t = min(
                    1.0,
                    (self.restarts_steps - per_restart_progress) / self.anneal_steps,
                )
            else:
                cycle_t = 1
            scale = cycle_t * (1 - self.min_lr_scale) + self.min_lr_scale

        if isinstance(original, Sequence):
            return [lr * scale for lr in original]

        return original * scale


================================================
FILE: src/axolotl/utils/schemas/__init__.py
================================================


================================================
FILE: src/axolotl/utils/schemas/config.py
================================================
"""Module with Pydantic models for configuration."""

from typing import Annotated, Any, Literal

from accelerate.utils import is_fp8_available
from annotated_types import MinLen
from packaging import version
from pydantic import (
    BaseModel,
    Field,
    StringConstraints,
    field_serializer,
    model_validator,
)

from axolotl.utils.datasets import get_default_process_count
from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.datasets import (
    DatasetConfig,
    DPODataset,
    KTODataset,
    PretrainingDataset,
    SFTDataset,
    StepwiseSupervisedDataset,
)
from axolotl.utils.schemas.deprecated import DeprecatedParameters, RemappedParameters
from axolotl.utils.schemas.dynamic_checkpoint import DynamicCheckpointConfig
from axolotl.utils.schemas.enums import ChatTemplate, RingAttnFunc, RLType
from axolotl.utils.schemas.fsdp import FSDPConfig
from axolotl.utils.schemas.integrations import (
    CometConfig,
    GradioConfig,
    LISAConfig,
    MLFlowConfig,
    OpenTelemetryConfig,
    RayConfig,
    TrackioConfig,
    WandbConfig,
)
from axolotl.utils.schemas.internal import EnvCapabilities, GPUCapabilities
from axolotl.utils.schemas.model import (
    ModelInputConfig,
    ModelOutputConfig,
    SpecialTokensConfig,
)
from axolotl.utils.schemas.multimodal import MultiModalConfig
from axolotl.utils.schemas.peft import LoraConfig, ReLoRAConfig
from axolotl.utils.schemas.quantization import PTQConfig, QATConfig
from axolotl.utils.schemas.training import HyperparametersConfig, JaggedLRConfig
from axolotl.utils.schemas.trl import TRLConfig
from axolotl.utils.schemas.validation import ValidationMixin
from axolotl.utils.schemas.vllm import VllmConfig

LOG = get_logger(__name__)


class AxolotlInputConfig(
    ModelInputConfig,
    ModelOutputConfig,
    LoraConfig,
    ReLoRAConfig,
    JaggedLRConfig,
    HyperparametersConfig,
    WandbConfig,
    MLFlowConfig,
    CometConfig,
    TrackioConfig,
    OpenTelemetryConfig,
    LISAConfig,
    GradioConfig,
    RayConfig,
    MultiModalConfig,
    RemappedParameters,
    DeprecatedParameters,
    ValidationMixin,
    BaseModel,
):
    """Wrapper of all config options."""

    model_config = {"populate_by_name": True}

    strict: bool | None = Field(
        default=False,
        json_schema_extra={"description": "Allow overwrite yml config using from cli"},
    )
    resume_from_checkpoint: str | None = Field(
        default=None,
        json_schema_extra={"description": "Resume from a specific checkpoint dir"},
    )
    auto_resume_from_checkpoints: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "If resume_from_checkpoint isn't set and you simply want it to start where it left off. Be careful with this being turned on between different models."
        },
    )
    resize_token_embeddings_to_32x: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Resize the model embeddings when new tokens are added to multiples of 32. This is reported to improve training speed on some models"
        },
    )
    mean_resizing_embeddings: bool | None = False
    # optionally shrink the embeddings when the tokenizer vocab size is smaller
    shrink_embeddings: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to shrink the embeddings to len(tokenizer). By default, we won't shrink."
        },
    )
    embeddings_skip_upcast: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Don't upcast the embeddings to float32 when using PEFT. Useful for low-VRAM GPUs"
        },
    )
    reinit_weights: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Reinitialize model weights randomly instead of loading pretrained weights"
        },
    )

    trainer_cls: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "module to custom trainer class to use for training"
        },
    )

    rl: RLType | None = Field(
        default=None,
        json_schema_extra={
            "description": "Use RL training: 'dpo', 'ipo', 'kto', 'simpo', 'orpo', 'grpo'"
        },
    )
    trl: TRLConfig | None = Field(
        default_factory=lambda: TRLConfig(),
    )
    vllm: VllmConfig | None = Field(
        default_factory=lambda: VllmConfig(),
    )
    qat: QATConfig | None = None
    quantization: PTQConfig | None = None
    reward_model: bool | None = Field(
        default=None,
        json_schema_extra={"description": "Reward modelling: `True` or `False`"},
    )
    dynamic_checkpoint: DynamicCheckpointConfig | None = Field(
        default=None,
        json_schema_extra={
            "description": "Configuration for dynamic checkpointing (trigger by file or signal). "
            "Set 'enabled: true' to activate this feature."
        },
    )
    process_reward_model: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Process reward modelling: `True` or `False`"
        },
    )
    center_rewards_coefficient: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Coefficient to incentivize the reward model to output mean-zero rewards (proposed by https://huggingface.co/papers/2312.09244, Eq. 2). Recommended value: `0.01`."
        },
    )
    num_labels: int | None = None
    # Whether to use weighting in DPO trainer.
    # If `None`, default is `False` in the trainer.
    dpo_use_weighting: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to perform weighting in DPO trainer"
        },
    )
    dpo_label_smoothing: float | None = None
    dpo_norm_loss: bool | None = None

    dpo_use_liger_kernel: bool | None = Field(
        default=None,
        json_schema_extra={"description": "Whether to use Liger kernel for DPO loss."},
    )

    dpo_padding_free: bool | None = None

    datasets: (
        Annotated[
            list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset],
            MinLen(1),
        ]
        | None
    ) = Field(
        default=None,
        json_schema_extra={
            "description": "A list of one or more datasets to finetune the model with"
        },
    )

    test_datasets: (
        Annotated[
            list[SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset],
            MinLen(1),
        ]
        | None
    ) = Field(
        default=None,
        json_schema_extra={
            "description": "A list of one or more datasets to eval the model with. You can use either test_datasets, or val_set_size, but not both."
        },
    )
    shuffle_merged_datasets: bool | None = Field(
        default=True,
        json_schema_extra={
            "description": "If false, the datasets will not be shuffled and will keep their original order in `datasets`. The same applies to the `test_datasets` option and the `pretraining_dataset` option. Default is true."
        },
    )
    shuffle_before_merging_datasets: bool | None = Field(
        default=False,
        json_schema_extra={
            "description": "If true, each dataset in `datasets` will be shuffled before merging. This allows curriculum learning strategies to be applied at the dataset level. Default is false."
        },
    )
    dataset_prepared_path: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Axolotl attempts to save the dataset as an arrow after packing the data together so subsequent training attempts load faster, relative path"
        },
    )
    dataset_shard_num: int | None = Field(
        default=None, json_schema_extra={"description": "Num shards for whole dataset"}
    )
    dataset_shard_idx: int | None = Field(
        default=None,
        json_schema_extra={"description": "Index of shard to use for whole dataset"},
    )
    skip_prepare_dataset: bool | None = False
    num_dataset_shards_to_save: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Number of shards to save the prepared dataset"
        },
    )

    pretraining_dataset: (
        Annotated[list[PretrainingDataset | SFTDataset], MinLen(1)] | None
    ) = Field(
        default=None,
        json_schema_extra={
            "description": "Set to HF dataset for type: 'completion' for streaming instead of pre-tokenize"
        },
    )
    dataset_processes: int | None = Field(
        default=None,
        deprecated="Use `dataset_num_proc` instead. This parameter will be removed in a future version.",
        json_schema_extra={
            "description": (
                "The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()` if not set.\n"
                "For Runpod VMs, it will default to number of vCPUs via RUNPOD_CPU_COUNT."
            )
        },
    )
    dataset_num_proc: int | None = Field(
        default=None,
        json_schema_extra={
            "description": (
                "The maximum number of processes to use while preprocessing your input dataset. This defaults to `os.cpu_count()` if not set.\n"
                "For Runpod VMs, it will default to number of vCPUs via RUNPOD_CPU_COUNT."
            )
        },
    )

    dataset_exact_deduplication: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Deduplicates datasets and test_datasets with identical entries"
        },
    )
    dataset_keep_in_memory: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Keep dataset in memory while preprocessing. Only needed if cached dataset is taking too much storage"
        },
    )
    dataloader_pin_memory: bool | None = None
    dataloader_num_workers: int | None = None
    dataloader_prefetch_factor: int | None = None
    dataloader_drop_last: bool | None = None

    accelerator_config: dict[str, Any] | None = None

    remove_unused_columns: bool | None = None

    push_dataset_to_hub: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Push prepared dataset to hub - repo_org/repo_name"
        },
    )
    hf_use_auth_token: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to use hf `use_auth_token` for loading datasets. Useful for fetching private datasets. Required to be true when used in combination with `push_dataset_to_hub`"
        },
    )

    device: Any | None = None
    device_map: Any | None = Field(
        default=None,
        json_schema_extra={
            "description": "Passed through to transformers when loading the model when launched without accelerate. Use `sequential` when training w/ model parallelism to limit memory"
        },
    )
    world_size: int | None = None
    local_rank: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Don't mess with this, it's here for accelerate and torchrun"
        },
    )
    ddp: bool | None = None

    seed: int | None = Field(
        default=None, json_schema_extra={"description": "Seed for reproducibility"}
    )
    ddp_timeout: int | None = Field(
        default=None,
        json_schema_extra={"description": "Advanced DDP Arguments - timeout"},
    )
    ddp_bucket_cap_mb: int | None = Field(
        default=None,
        json_schema_extra={"description": "Advanced DDP Arguments - bucket cap in MB"},
    )
    ddp_broadcast_buffers: bool | None = Field(
        default=None,
        json_schema_extra={"description": "Advanced DDP Arguments - broadcast buffers"},
    )
    ddp_find_unused_parameters: bool | None = None

    do_causal_lm_eval: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to run causal language model evaluation for metrics in `eval_causal_lm_metrics`"
        },
    )
    eval_causal_lm_metrics: list[str] | None = Field(
        default=None,
        json_schema_extra={
            "description": "HF evaluate metrics used during evaluation. Default is ['sacrebleu', 'comet', 'ter', 'chrf', 'perplexity']"
        },
    )
    do_bench_eval: bool | None = None
    bench_dataset: str | None = None
    bench_split: str | None = None
    metric_for_best_model: str | None = None
    greater_is_better: bool | None = None

    loss_watchdog_threshold: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "High loss value, indicating the learning has broken down (a good estimate is ~2 times the loss at the start of training)"
        },
    )
    loss_watchdog_patience: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Number of high-loss steps in a row before the trainer aborts (default: 3)"
        },
    )

    gc_steps: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Run garbage collection every `gc_steps` steps. -1 will run on epoch end and before evaluations. Default is 0 (disabled)."
        },
    )

    bf16: Literal["auto"] | bool | None = Field(
        default="auto",
        json_schema_extra={
            "description": "Use CUDA bf16. bool or 'full' for `bf16_full_eval`, or 'auto' for automatic detection. require >=ampere"
        },
    )
    fp16: bool | None = Field(
        default=None, json_schema_extra={"description": "Use CUDA fp16"}
    )
    fp8: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Enable FP8 mixed precision training using TorchAO. Best "
            "used in combination with torch.compile."
        },
    )
    fp8_enable_fsdp_float8_all_gather: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Enable FSDP float8 all-gather optimization for FP8 training. Can "
            "improve training speed by 10-15% when FSDP is enabled."
        },
    )
    bfloat16: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "No AMP (automatic mixed precision) - require >=ampere"
        },
    )  # for non-AMP cases
    float16: bool | None = Field(
        default=None,
        json_schema_extra={"description": "No AMP (automatic mixed precision)"},
    )  # for non-AMP cases
    tf32: Literal["auto"] | bool | None = Field(
        default="auto",
        json_schema_extra={
            "description": "bool to use CUDA tf32 or 'auto' for automatic detection - require >=ampere"
        },
    )
    float32: bool | None = None

    gradient_checkpointing: Literal["offload", "offload_disk"] | bool | None = Field(
        default=False,
        json_schema_extra={
            "description": "Whether to use gradient checkpointing. Available options are: true, false, 'offload', 'offload_disk'. https://huggingface.co/docs/transformers/v4.18.0/en/performance#gradient-checkpointing"
        },
    )
    gradient_checkpointing_kwargs: dict[str, Any] | None = Field(
        default=None,
        json_schema_extra={
            "description": "Additional kwargs to pass to the trainer for gradient checkpointing"
        },
    )
    activation_offloading: Literal["legacy", "disk"] | bool | None = Field(
        default=False,
        json_schema_extra={
            "description": "Whether to offload activations. Available options are: true, false, 'legacy', 'disk'."
        },
    )

    unfrozen_parameters: list[str] | None = Field(
        default=None,
        json_schema_extra={
            "description": "List of regex patterns for parameter names to keep unfrozen. "
            "All other parameters will be frozen via requires_grad=False. "
            "Note: range-based patterns (e.g. embed_tokens.weight$[:32000]) use gradient "
            "zeroing rather than a true freeze, so weight decay will still apply to the "
            "frozen portion and optimizer states are allocated for the full parameter."
        },
    )

    sequence_len: int = Field(
        default=512,
        json_schema_extra={
            "description": "The maximum length of an input to train with, this should typically be less than 2048 as most models have a token/context limit of 2048"
        },
    )
    excess_length_strategy: Literal["drop", "truncate", "raise"] | None = Field(
        default=None,
        json_schema_extra={
            "description": "What to do when a tokenized row exceeds sequence_len. 'drop' removes the row; 'truncate' slices tensors to sequence_len; 'raise' raises a ValueError. Defaults to 'drop' for backward compatibility."
        },
    )
    eval_sequence_len: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "The maximum length of an input for evaluation. If not specified, defaults to sequence_len"
        },
    )
    min_sample_len: int | None = None
    max_prompt_len: int | None = Field(
        default=None,
        json_schema_extra={"description": "maximum prompt length for RL training"},
    )
    sample_packing: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Use efficient multi-packing with block diagonal attention and per sequence position_ids. Recommend set to 'true'"
        },
    )
    sample_packing_group_size: int | None = Field(
        default=100_000,
        json_schema_extra={
            "description": "The number of samples packed at a time. Increasing the following values helps with packing, but usually only slightly (<%1.)"
        },
    )
    sample_packing_bin_size: int | None = Field(
        default=200,
        json_schema_extra={
            "description": "The number of samples which can be packed into one sequence. Increase if using a large sequence_len with many short samples."
        },
    )
    sample_packing_sequentially: bool | None = Field(
        default=None,
        json_schema_extra={"description": "Whether to pack samples sequentially"},
    )
    sample_packing_mp_start_method: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "The multiprocessing start method to use for packing. Should be 'fork', 'spawn' or 'forkserver'"
        },
    )
    eval_sample_packing: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Set to 'false' if getting errors during eval with sample_packing on"
        },
    )
    pad_to_sequence_len: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Pad inputs so each step uses constant sized buffers. This will reduce memory fragmentation and may prevent OOMs, by re-using memory more efficiently. Defaults to True if `sample_packing` enabled"
        },
    )
    curriculum_sampling: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to use sequential sampling for curriculum learning"
        },
    )
    multipack_real_batches: bool | None = None

    batch_flattening: Literal["auto"] | bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Use batch flattening for speedups when not using sample_packing"
        },
    )

    # for PoSE context length extension
    use_pose: bool | None = None
    pose_split_on_token_ids: list[int] | None = None
    pose_max_context_len: int | None = None
    pose_num_chunks: int | None = None

    # Deprecated: Use streaming_multipack_buffer_size instead
    pretrain_multipack_buffer_size: int | None = Field(
        default=None,
        deprecated="Deprecated in v0.13.0, will be removed in v0.14.0. Use streaming_multipack_buffer_size instead",
    )
    pretrain_multipack_attn: bool | None = Field(
        default=True,
        json_schema_extra={
            "description": "whether to prevent cross attention for packed sequences during pretraining",
        },
    )
    pretraining_sample_concatenation: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "whether to concatenate samples during pretraining",
        },
    )

    streaming: bool | None = Field(
        default=None,
        json_schema_extra={"description": "Use streaming mode for loading datasets"},
    )
    streaming_multipack_buffer_size: int | None = Field(
        default=10_000,
        json_schema_extra={
            "description": "Buffer size for multipack streaming datasets"
        },
    )

    xformers_attention: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to use xformers attention patch https://github.com/facebookresearch/xformers"
        },
    )
    sdp_attention: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to use scaled-dot-product attention https://pytorch.org/docs/stable/generated/torch.nn.functional.scaled_dot_product_attention.html"
        },
    )
    s2_attention: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Shifted-sparse attention (only llama) - https://arxiv.org/pdf/2309.12307.pdf"
        },
    )
    flex_attention: bool | None = None
    flex_attn_compile_kwargs: dict[str, Any] | None = None
    flash_attention: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to use flash attention patch https://github.com/Dao-AILab/flash-attention"
        },
    )
    flash_attn_cross_entropy: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to use flash-attention cross entropy implementation - advanced use only"
        },
    )
    flash_attn_rms_norm: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to use flash-attention rms norm implementation - advanced use only"
        },
    )
    flash_attn_fuse_mlp: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to fuse part of the MLP into a single operation"
        },
    )
    flash_optimum: bool | None = Field(
        default=None,
        json_schema_extra={"description": "Whether to use bettertransformers"},
    )
    sage_attention: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to use SageAttention https://github.com/thu-ml/SageAttention"
        },
    )

    eager_attention: bool | None = None

    attn_implementation: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Specify a custom attention implementation, used mostly for kernels."
        },
    )

    experts_implementation: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Which experts implementation to use for MoE models,"
        },
    )

    quantize_moe_experts: bool = Field(
        default=False,
        json_schema_extra={
            "description": "Quantize MoE expert weights on load to reduce VRAM. "
            "Requires adapter (lora/qlora) with load_in_4bit or load_in_8bit. "
            "Requires CUDA (not compatible with ROCm or other backends). "
            "Note: total parameter count may be reported incorrectly when enabled "
            "(trainable param count is correct)."
        },
    )

    scaling_softmax: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to use Scaled Softmax (SSMax) attention. Ref: https://arxiv.org/abs/2501.19399"
        },
    )
    scaling_softmax_factor: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Scaling factor for SSMax attention. Default is 0.43"
        },
    )
    scaling_softmax_bias: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Bias for SSMax attention. Default is 0.0. Note: The paper recommends bias=0 for better length generalization."
        },
    )

    unsloth_cross_entropy_loss: bool | None = None
    unsloth_lora_mlp: bool | None = None
    unsloth_lora_qkv: bool | None = None
    unsloth_lora_o: bool | None = None
    unsloth_rms_norm: bool | None = None
    unsloth_rope: bool | None = None

    lora_mlp_kernel: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Apply custom LoRA autograd functions and activation function Triton kernels for speed and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html"
        },
    )
    lora_qkv_kernel: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Apply custom LoRA autograd functions and activation function Triton kernels for speed and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html"
        },
    )
    lora_o_kernel: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Apply custom LoRA autograd functions and activation function Triton kernels for speed and memory savings. See: https://docs.axolotl.ai/docs/lora_optims.html"
        },
    )

    chunked_cross_entropy: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to use chunked cross entropy loss for memory efficiency"
        },
    )
    chunked_cross_entropy_num_chunks: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Number of chunks to use for chunked cross entropy loss"
        },
    )
    use_eaft: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Enable Entropy-Aware Focal Training loss (EAFT)"
        },
    )
    eaft_alpha: float | None = Field(
        default=1.0,
        json_schema_extra={
            "description": "Exponent for entropy weighting in EAFT (default: 1.0)"
        },
    )
    eaft_k: int | None = Field(
        default=20,
        json_schema_extra={
            "description": "Number of top logits for entropy approximation (default: 20)"
        },
    )

    tiled_mlp: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to use ALST tiled mlp for memory efficient long context"
        },
    )

    tiled_mlp_num_shards: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Number of shards to use for ALST tiled mlp. If unset, it will be set based on seqlen/hidden_size"
        },
    )

    tiled_mlp_use_original_mlp: bool | None = Field(
        default=True,
        json_schema_extra={
            "description": "Whether to use original mlp for ALST tiled mlp. Otherwise uses a generic MLP based on llama."
        },
    )

    llama4_linearized_experts: bool | None = None

    deepspeed: str | dict[str, Any] | None = Field(
        default=None,
        json_schema_extra={
            "description": "Deepspeed config path. e.g., deepspeed_configs/zero3.json"
        },
    )
    deepcompile: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to use deepcompile for faster training with deepspeed"
        },
    )
    fsdp: list[str] | None = Field(
        default=None,
        json_schema_extra={"description": "FSDP configuration"},
        deprecated="Configuring FSDP using `fsdp` is deprecated. Please use `fsdp_config` instead. ",
    )
    fsdp_config: FSDPConfig | None = Field(
        default=None, json_schema_extra={"description": "FSDP configuration options"}
    )
    fsdp_version: int | None = Field(
        default=None,
        json_schema_extra={"description": "FSDP version"},
    )
    fsdp_final_state_dict_type: (
        Literal["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DICT"] | None
    ) = Field(
        default=None,
        deprecated="Configuring FSDP final state dict type using `fsdp_final_state_dict_type` is deprecated. Please use `fsdp_config.final_state_dict_type` instead.",
    )

    val_set_size: float | None = Field(
        default=0.0,
        json_schema_extra={
            "description": "How much of the dataset to set aside as evaluation. 1 = 100%, 0.50 = 50%, etc. 0 for no eval."
        },
    )

    dp_shard_size: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Number of devices to shard across. If not set, will use all available devices."
        },
    )
    dp_replicate_size: int | None = Field(
        default=None,
        json_schema_extra={"description": "Number of devices to replicate across."},
    )
    sequence_parallel_degree: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Deprecated: use `context_parallel_size` instead"
        },
    )
    context_parallel_size: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Set to a divisor of the number of GPUs available to split sequences into chunks of equal size. Use in long context training to prevent OOM when sequences cannot fit into a single GPU's VRAM. E.g., if 4 GPUs are available, set this value to 2 to split each sequence into two equal-sized subsequences, or set to 4 to split into four equal-sized subsequences. See https://docs.axolotl.ai/docs/sequence_parallelism.html for more details."
        },
    )
    heads_k_stride: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Optional; strides across the key dimension. Larger values use more memory but should make training faster. Must evenly divide the number of KV heads in your model."
        },
    )
    ring_attn_func: RingAttnFunc | None = Field(
        default=None,
        json_schema_extra={
            "description": "One of 'varlen_llama3', 'batch_ring', 'batch_zigzag', 'batch_stripe'. Defaults to 'varlen_llama3' in the sample packing case, and 'batch_ring' in the non-sample packing case."
        },
    )
    tensor_parallel_size: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Number of tensor parallel processes in TP group. Only supported with DeepSpeed AutoTP."
        },
    )
    special_tokens: SpecialTokensConfig | None = Field(
        default=None,
        json_schema_extra={
            "description": "Add or change special tokens. If you add tokens here, you don't need to add them to the `tokens` list."
        },
    )
    tokens: list[str] | None = Field(
        default=None,
        json_schema_extra={"description": "Add extra tokens to the tokenizer"},
    )
    added_tokens_overrides: dict[int, str] | None = Field(
        default=None,
        json_schema_extra={
            "description": "Mapping token_id to new_token_string to override reserved added_tokens in the tokenizer. Only works for tokens that are not part of the base vocab (aka are added_tokens). Can be checked if they exist in tokenizer.json added_tokens."
        },
    )

    torch_compile: Literal["auto"] | bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to use torch.compile and which backend to use. setting to `auto` will enable torch compile when torch>=2.6.0"
        },
    )
    torch_compile_backend: str | None = Field(
        default=None,
        json_schema_extra={"description": "Backend to use for torch.compile"},
    )
    torch_compile_mode: Literal["default", "reduce-overhead", "max-autotune"] | None = (
        None
    )

    max_steps: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Maximum number of iterations to train for. It precedes num_epochs which means that if both are set, num_epochs will not be guaranteed. e.g., when 1 epoch is 1000 steps => `num_epochs: 2` and `max_steps: 100` will train for 100 steps"
        },
    )
    warmup_steps: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Number of warmup steps. Cannot use with warmup_ratio"
        },
    )
    warmup_ratio: float | None = Field(
        default=None,
        json_schema_extra={"description": "Warmup ratio. Cannot use with warmup_steps"},
    )
    eval_steps: int | float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Leave empty to eval at each epoch, integer for every N steps. float for fraction of total steps"
        },
    )
    evals_per_epoch: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Number of times per epoch to run evals, mutually exclusive with eval_steps"
        },
    )
    eval_strategy: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Set to `no` to skip evaluation, `epoch` at end of each epoch, leave empty to infer from `eval_steps`"
        },
    )

    save_steps: int | float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Leave empty to save at each epoch, integer for every N steps. float for fraction of total steps"
        },
    )
    saves_per_epoch: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Number of times per epoch to save a checkpoint, mutually exclusive with save_steps"
        },
    )
    save_strategy: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Set to `no` to skip checkpoint saves, `epoch` at end of each epoch, `best` when better result is achieved, leave empty to infer from `save_steps`"
        },
    )
    save_total_limit: int | None = Field(
        default=None, json_schema_extra={"description": "Checkpoints saved at a time"}
    )
    save_first_step: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to checkpoint a model after the first step of training. Defaults to False."
        },
    )

    logging_steps: int | None = Field(
        default=None, json_schema_extra={"description": "Logging frequency"}
    )
    early_stopping_patience: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Stop training after this many evaluation losses have increased in a row. https://huggingface.co/transformers/v4.2.2/_modules/transformers/trainer_callback.html#EarlyStoppingCallback"
        },
    )
    load_best_model_at_end: bool | None = False
    save_only_model: bool | None = Field(
        default=False,
        json_schema_extra={
            "description": "Save only the model weights, skipping the optimizer. Using this means you can't resume from checkpoints."
        },
    )
    use_tensorboard: bool | None = Field(
        default=None, json_schema_extra={"description": "Use tensorboard for logging"}
    )
    profiler_steps: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Enable the pytorch profiler to capture the first N steps of training to the output_dir. see https://pytorch.org/blog/understanding-gpu-memory-1/ for more information. Snapshots can be visualized @ https://pytorch.org/memory_viz"
        },
    )
    profiler_steps_start: int | None = Field(
        default=0,
        json_schema_extra={
            "description": "Which step to start the profiler at. Useful for only capturing a few steps mid-run."
        },
    )
    include_tokens_per_second: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "bool of whether to report tokens per second at the end of training. This is not supported with pre-training datasets."
        },
    )
    include_tkps: bool | None = Field(
        default=True,
        json_schema_extra={
            "description": "bool of whether to report tokens per second per-gpu during training by measuring throughput of non-padding tokens."
        },
    )
    neftune_noise_alpha: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "NEFT https://arxiv.org/abs/2310.05914, set this to a number (paper default is 5) to add noise to embeddings. Currently only supported on Llama and Mistral"
        },
    )

    orpo_alpha: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Parameter controlling the relative ratio loss weight in the ORPO loss. Passed to `beta` in `ORPOConfig` due to trl mapping."
        },
    )
    rpo_alpha: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Weighting of NLL term in loss from RPO paper"
        },
    )
    simpo_gamma: float | None = Field(
        default=None,
        json_schema_extra={"description": "Target reward margin for the SimPO loss"},
    )
    cpo_alpha: float | None = Field(
        default=None, json_schema_extra={"description": "Weight of the BC regularizer"}
    )

    kto_desirable_weight: float | None = Field(
        default=None,
        json_schema_extra={"description": "Factor for desirable loss term in KTO loss"},
    )
    kto_undesirable_weight: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Factor for undesirable loss term in KTO loss"
        },
    )
    rl_beta: float | None = Field(
        default=None,
        json_schema_extra={"description": "The beta parameter for the RL training"},
    )

    max_memory: dict[int | Literal["cpu", "disk"], int | str] | None = Field(
        default=None,
        json_schema_extra={
            "description": "Defines the max memory usage per gpu on the system. Passed through to transformers when loading the model."
        },
    )
    gpu_memory_limit: int | str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Limit the memory for all available GPUs to this amount (if an integer, expressed in gigabytes); default: unset"
        },
    )
    low_cpu_mem_usage: bool | None = Field(
        default=None,
        json_schema_extra={"description": "Whether to use low_cpu_mem_usage"},
    )

    chat_template: (
        ChatTemplate
        | Annotated[str, StringConstraints(pattern="^tokenizer_default_fallback_")]
    ) | None = Field(
        default=None,
        json_schema_extra={
            "description": "The name of the chat template to use for training, following values are supported: tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default value. alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py. tokenizer_default_fallback_*: where * is the name of the chat template to fallback to. E.g. tokenizer_default_fallback_chatml. This is useful when the chat template is not available in the tokenizer. jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field. The selected chat template will be saved to the tokenizer_config.json for easier inferencing"
        },
    )
    chat_template_jinja: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Custom jinja template or path to jinja file for chat template. This will be only used if chat_template is set to `jinja` or `null` (in which case chat_template is automatically set to `jinja`). Default is null."
        },
    )
    chat_template_kwargs: dict[str, Any] | None = Field(
        default=None,
        json_schema_extra={
            "description": "Additional kwargs to pass to the chat template. This is useful for customizing the chat template. For example, you can pass `thinking=False` to add a generation prompt to the chat template."
        },
    )
    eot_tokens: list[str] | None = Field(
        default=None,
        json_schema_extra={
            "description": "Custom EOT (End-of-Turn) tokens to mask/unmask during training. These tokens mark the boundaries between conversation turns. For example: ['/INST', '</s>', '[/SYSTEM_PROMPT]']. If not specified, defaults to just the model's eos_token. This is useful for templates that use multiple delimiter tokens."
        },
    )
    default_system_message: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Changes the default system message. Currently only supports chatml."
        },
    )

    fix_untrained_tokens: int | list[int] | None = Field(
        default=None,
        json_schema_extra={
            "description": (
                "Token index or indices to adjust embedding weights to the mean of the other tokens. "
                "This is useful when the model has untrained embeddings."
            )
        },
    )

    # INTERNALS - document for now, generally not set externally
    is_preprocess: bool | None = None
    preprocess_iterable: bool | None = None

    total_num_tokens: int | None = Field(
        default=None,
        json_schema_extra={"description": "Total number of tokens - internal use"},
    )
    total_supervised_tokens: int | None = None
    sample_packing_eff_est: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "You can set these packing optimizations AFTER starting a training at least once. The trainer will provide recommended values for these values."
        },
    )
    axolotl_config_path: str | None = None

    is_falcon_derived_model: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Internal use only - Used to identify which the model is based on"
        },
    )
    is_llama_derived_model: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Internal use only - Used to identify which the model is based on"
        },
    )
    is_mistral_derived_model: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Internal use only - Used to identify which the model is based on. Please note that if you set this to true, `padding_side` will be set to 'left' by default"
        },
    )
    is_qwen_derived_model: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Internal use only - Used to identify which the model is based on"
        },
    )

    plugins: list[str] | None = Field(
        default=None,
        json_schema_extra={
            "description": "Add plugins to extend the pipeline. See `src/axolotl/integrations` for the available plugins or doc below for more details. https://docs.axolotl.ai/docs/custom_integrations.html"
        },
    )
    generate_samples: bool | None = Field(
        default=False,
        json_schema_extra={
            "description": "Enable sample generation during training for monitoring"
        },
    )
    num_generation_samples: int | None = Field(
        default=3,
        json_schema_extra={
            "description": "Number of samples to generate at each interval"
        },
    )
    generation_max_new_tokens: int | None = Field(
        default=50,
        json_schema_extra={"description": "Maximum new tokens to generate per sample"},
    )
    generation_temperature: float | None = Field(
        default=0.7,
        json_schema_extra={
            "description": "Temperature for sample generation (0.0 = greedy)"
        },
    )
    generation_top_p: float | None = Field(
        default=None,
        json_schema_extra={"description": "Nucleus sampling parameter for generation"},
    )
    generation_top_k: int | None = Field(
        default=None,
        json_schema_extra={"description": "Top-k sampling parameter for generation"},
    )
    generation_prompt_ratio: float | None = Field(
        default=0.5,
        json_schema_extra={"description": "Ratio of input to use as prompt (0.0-1.0)"},
    )
    generation_do_sample: bool | None = Field(
        default=True,
        json_schema_extra={
            "description": "Whether to use sampling (vs greedy decoding)"
        },
    )

    @field_serializer("datasets")
    def datasets_serializer(
        self, ds_configs: list[DatasetConfig] | None
    ) -> list[dict[str, Any]] | None:
        if ds_configs:
            return [ds_config.model_dump(exclude_none=True) for ds_config in ds_configs]
        return None

    @model_validator(mode="before")
    @classmethod
    def warn_peft_trainable_token_to_fix_untrained(cls, data):
        if (
            peft_trainable_token_indices := data.get("peft_trainable_token_indices")
        ) and (fix_untrained_tokens := data.get("fix_untrained_tokens")):
            if isinstance(fix_untrained_tokens, int):
                fix_untrained_tokens = (fix_untrained_tokens,)

            if isinstance(peft_trainable_token_indices, int):
                peft_trainable_token_indices = (peft_trainable_token_indices,)

            for untrained_token_id in fix_untrained_tokens:
                if untrained_token_id not in peft_trainable_token_indices:
                    LOG.warning_once(
                        f"Token {untrained_token_id} is fixed via `fix_untrained_tokens`, yet not in `peft_trainable_token_indices: ` list. "
                        "Please add it, otherwise the token won't be trained on."
                    )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_sageattn_wo_sample_packing(cls, data):
        if (not data.get("sample_packing", False)) and data.get("sage_attention"):
            if not data.get("pad_to_sequence_len", False):
                LOG.warning(
                    "We recommend turning on `pad_to_sequence_len` for SageAttention without packing."
                    "This is because there has been signs that the loss explodes after a few steps."
                )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_sageattn_fft(cls, data):
        if (not data.get("adapter", False)) and data.get("sage_attention"):
            LOG.warning(
                "We found loss to drop to 0 with SageAttention full finetuning."
                "Please observe the loss, otherwise switch to LoRA/QLoRA or another attention method."
            )
        return data


class AxolotlConfigWCapabilities(AxolotlInputConfig):
    """Wrapper to valdiate GPU capabilities with the configured options"""

    capabilities: GPUCapabilities
    env_capabilities: EnvCapabilities

    @model_validator(mode="after")
    def check_bf16(self):
        if self.capabilities.bf16:
            if not self.bf16 and not self.bfloat16:
                LOG.info(
                    "bf16 support detected, but not enabled for this configuration."
                )
        else:
            if (
                not self.merge_lora
                and not self.is_preprocess
                and (self.bf16 is True or self.bfloat16 is True)
            ):
                raise ValueError(
                    "bf16 requested, but AMP is not supported on this GPU. Requires Ampere series or above."
                )
        return self

    @model_validator(mode="after")
    def check_tf32(self):
        if self.tf32 == "auto":
            self.tf32 = self.capabilities.tf32
        return self

    @model_validator(mode="after")
    def check_fp8(self):
        if self.fp8 and not self.capabilities.fp8:
            raise ValueError("fp8 requested, but fp8 is not supported on this GPU")
        elif self.fp8 and self.capabilities.fp8 and not is_fp8_available():
            raise ValueError(
                "fp8 requested, but missing one of ms-amp, transformers-engine or torchao."
            )
        return self

    @model_validator(mode="before")
    @classmethod
    def check_sample_packing_w_sdpa_bf16(cls, data):
        is_sm_90: bool = (
            data["capabilities"]
            and data["capabilities"].get("compute_capability") == "sm_90"
        )
        if (
            data.get("sample_packing")
            and data.get("sdp_attention")
            and (data.get("bfloat16") or data.get("bf16"))
            and not is_sm_90
        ):
            # https://github.com/pytorch/pytorch/blob/1b03423526536b5f3d35bdfa95ccc6197556cf9b/test/test_transformers.py#L2440-L2450
            LOG.warning(
                "sample_packing & torch sdpa with bf16 is unsupported may results in 0.0 loss. "
                "This may work on H100s."
            )

        return data

    @model_validator(mode="before")
    @classmethod
    def check_compute_capability_w_sageattn(cls, data):
        if (
            data.get("sage_attention")
            and data.get("capabilities")
            and data.get("capabilities").get("compute_capability")
            not in ["sm_80", "sm_86", "sm_89", "sm_90", "sm_120"]
        ):
            raise ValueError(
                "SageAttention supports compute capability between sm_80 and sm_120. "
                "Please use a different attention implementation."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_multigpu_unsloth(cls, data):
        if (
            data.get("unsloth_lora_mlp")
            or data.get("unsloth_lora_qkv")
            or data.get("unsloth_lora_o")
        ):
            capabilities = data.get("capabilities")
            if capabilities and capabilities.get("n_gpu", 0) > 1:
                raise ValueError(
                    "unsloth_lora_mlp, unsloth_lora_qkv, and unsloth_lora_o are not compatible with multi-GPU training."
                )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_multigpu_lora_kernels(cls, data):
        if (
            data.get("lora_mlp_kernel")
            or data.get("lora_qkv_kernel")
            or data.get("lora_o_kernel")
        ):
            capabilities = data.get("capabilities")
            is_fsdp = data.get("fsdp_config") is not None
            is_fsdp2 = is_fsdp and str(data.get("fsdp_version")) == "2"

            if capabilities and capabilities.get("n_gpu", 0) > 1 and not is_fsdp2:
                if is_fsdp:
                    raise ValueError(
                        "lora_mlp_kernel, lora_qkv_kernel, and lora_o_kernel are not compatible with FSDP1."
                    )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_quantize_moe_experts(cls, data):
        if data.get("quantize_moe_experts"):
            if data.get("lora_target_linear"):
                raise ValueError(
                    "lora_target_linear is not compatible with quantize_moe_experts. "
                    "Use lora_target_parameters to target expert weights instead."
                )
            if data.get("adapter") not in ("lora", "qlora"):
                raise ValueError("quantize_moe_experts requires adapter: lora or qlora")
            if not (data.get("load_in_4bit") or data.get("load_in_8bit")):
                raise ValueError(
                    "quantize_moe_experts requires load_in_4bit or load_in_8bit"
                )
            if (
                data.get("capabilities")
                and data["capabilities"].get("compute_capability")
                and not data["capabilities"]["compute_capability"].startswith("sm_")
            ):
                raise ValueError(
                    "quantize_moe_experts requires CUDA (not compatible with ROCm or other backends)"
                )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_auto_enable_lora_kernels(cls, data):
        # Only proceed if using LoRA or QLoRA adapter
        if data.get("rl"):
            # RL trainers not tested so don't enable kernels by default
            return data
        if data.get("adapter") in ["lora", "qlora"]:
            # Skip if already set, using unsloth optimizations, or using 8-bit
            unsloth_fields = ["unsloth_lora_mlp", "unsloth_lora_qkv", "unsloth_lora_o"]
            kernel_fields = ["lora_mlp_kernel", "lora_qkv_kernel", "lora_o_kernel"]
            if (
                any(data.get(k) is not None for k in kernel_fields)
                or any(data.get(k) for k in unsloth_fields)
                or data.get("adapter") == "lora"
                and data.get("load_in_8bit")
            ):
                return data

            # Skip if trust_remote_code is enabled, as lora kernels are not compatible
            if data.get("trust_remote_code"):
                return data

            # Skip if dropout is not 0, as auto enabling it would just disable it during runtime patch checks
            if data.get("lora_dropout") != 0:
                return data

            # Check multi-GPU compatibility
            capabilities = data.get("capabilities")
            is_multi_gpu = capabilities and capabilities.get("n_gpu", 0) > 1
            is_fsdp = data.get("fsdp_config") is not None
            is_fsdp2 = is_fsdp and str(data.get("fsdp_version")) == "2"

            if (
                not is_multi_gpu
                or (is_multi_gpu and not is_fsdp)
                or (is_multi_gpu and is_fsdp2)
            ):
                # Auto-enable kernels if not explicitly set by user
                if data.get("lora_mlp_kernel") is None:
                    data["lora_mlp_kernel"] = True

                if data.get("lora_qkv_kernel") is None:
                    data["lora_qkv_kernel"] = True

                if data.get("lora_o_kernel") is None:
                    data["lora_o_kernel"] = True

                LOG.warning(
                    "Auto-enabling LoRA kernel optimizations for faster training. "
                    + "Please explicitly set `lora_*_kernel` config values to `false` to disable. "
                    + "See https://docs.axolotl.ai/docs/lora_optims.html for more info."
                )

        return data

    @model_validator(mode="before")
    @classmethod
    def check_adopt_torch_version(cls, data):
        if (data.get("optimizer") is not None) and ("adopt" in data.get("optimizer")):
            env_capabilities = data.get("env_capabilities", {})
            torch_version = env_capabilities.get("torch_version")

            if torch_version is None:
                import torch

                torch_version = str(torch.__version__).split("+", maxsplit=1)[0]

            if version.parse(torch_version) < version.parse("2.5.1"):
                raise ValueError(
                    "ADOPT optimizer is incompatible with torch version < 2.5.1"
                )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_flex_torch_version(cls, data):
        if (data.get("flex_attention") is not None) and (data.get("flex_attention")):
            env_capabilities = data.get("env_capabilities", {})
            torch_version = env_capabilities.get("torch_version")

            if torch_version is None:
                import torch

                torch_version = str(torch.__version__).split("+", maxsplit=1)[0]

            if version.parse(torch_version) < version.parse("2.6.0"):
                raise ValueError(
                    "Flex attention is not supported on torch version < 2.6.0"
                )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_torch_compile_auto(cls, data):
        if data.get("torch_compile") == "auto":
            env_capabilities = data.get("env_capabilities", {})
            if env_capabilities.get("torch_version"):
                if version.parse(
                    env_capabilities.get("torch_version")
                ) >= version.parse("2.5.1"):
                    LOG.info(
                        "torch.compile is available, setting torch_compile to True"
                    )
                    data["torch_compile"] = True
                else:
                    data["torch_compile"] = False
            else:
                data["torch_compile"] = False
        return data

    @model_validator(mode="before")
    @classmethod
    def check_beta_and_trl_beta_match(cls, data):
        if data.get("beta") and data.get("trl", {}).get("beta"):
            if data["beta"] != data["trl"]["beta"]:
                raise ValueError("beta and trl.beta must match or one must be removed")
        return data

    @model_validator(mode="after")
    def check_min_torch_version(self):
        if self.env_capabilities and self.env_capabilities.torch_version:
            torch_version = self.env_capabilities.torch_version
            if version.parse(torch_version) < version.parse("2.6.0"):
                LOG.warning(
                    f"torch=={torch_version} not be supported. Please upgrade to torch>=2.6.0."
                )

        return self

    @model_validator(mode="before")
    @classmethod
    def check_qat_config(cls, data):
        qat_cfg = data.get("qat", {})
        if not qat_cfg:
            return data

        if data.get("peft"):
            raise ValueError("QAT and PEFT cannot be used together.")

        if data.get("load_in_8bit"):
            raise ValueError("QAT and load_in_8bit cannot be used together.")

        if data.get("load_in_4bit"):
            raise ValueError("QAT and load_in_4bit cannot be used together.")

        env_capabilities = data.get("env_capabilities", {})
        torch_version = env_capabilities.get("torch_version")

        if torch_version is None:
            import torch

            torch_version = str(torch.__version__).split("+", maxsplit=1)[0]

        if version.parse(torch_version) < version.parse("2.6.0"):
            raise ValueError("QAT is not supported on torch version < 2.6.0")

        return data

    @model_validator(mode="before")
    @classmethod
    def check_fsdp_torch_version(cls, data):
        env_capabilities = data.get("env_capabilities", {})
        torch_version = env_capabilities.get("torch_version")

        if torch_version is None:
            import torch

            torch_version = str(torch.__version__).split("+", maxsplit=1)[0]

        if data.get("fsdp_config") and str(data.get("fsdp_version")) == "2":
            if version.parse(torch_version) < version.parse("2.7.0"):
                raise ValueError("FSDP2 is not supported on torch version < 2.7.0")

        return data

    @model_validator(mode="before")
    @classmethod
    def default_dataloader_opts(cls, data):
        if (
            data.get("dataloader_num_workers") is None
            and data.get("dataloader_pin_memory") is None
            and data.get("dataloader_prefetch_factor") is None
        ):
            data["dataloader_num_workers"] = data.get("capabilities").get("n_gpu", 1)
            data["dataloader_pin_memory"] = True
            data["dataloader_prefetch_factor"] = 256

        return data

    @model_validator(mode="before")
    @classmethod
    def default_dataset_num_proc(cls, data):
        if data.get("dataset_processes") is not None:
            if data.get("dataset_num_proc") is None:
                data["dataset_num_proc"] = data["dataset_processes"]
                LOG.warning(
                    "dataset_processes is deprecated and will be removed in a future version. "
                    "Please use dataset_num_proc instead."
                )
            else:
                LOG.warning(
                    "Both dataset_processes and dataset_num_proc are set. "
                    "Using dataset_num_proc and ignoring dataset_processes."
                )
            del data["dataset_processes"]
        elif data.get("dataset_num_proc") is None:
            data["dataset_num_proc"] = get_default_process_count()
        return data

    @model_validator(mode="before")
    @classmethod
    def check_deduplication_with_streaming(cls, data):
        if data.get("dataset_exact_deduplication") and (
            data.get("streaming") or data.get("pretraining_dataset")
        ):
            raise NotImplementedError(
                "dataset_exact_deduplication is not available for streaming datasets. "
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_deduplication_with_skip_prepare(cls, data):
        if data.get("dataset_exact_deduplication") and data.get("skip_prepare_dataset"):
            raise ValueError(
                "dataset_exact_deduplication=True has no effect when "
                "skip_prepare_dataset=True. Deduplication runs as part of the "
                "prepare pipeline, which is skipped. Either set "
                "skip_prepare_dataset: false or disable "
                "dataset_exact_deduplication."
            )
        return data


================================================
FILE: src/axolotl/utils/schemas/datasets.py
================================================
"""Pydantic models for datasets-related configuration"""

from typing import Literal

from pydantic import BaseModel, Field, model_validator

from axolotl.utils.schemas.enums import ChatTemplate
from axolotl.utils.schemas.utils import handle_legacy_message_fields_logic


class UserDefinedPrompterType(BaseModel):
    """Structure for user defined prompt types"""

    system_prompt: str | None = Field(
        default=None,
        json_schema_extra={"description": "Custom user instruction prompt"},
    )
    system_format: str | None = Field(
        default=None,
        json_schema_extra={"description": "Use {system} as key to be replaced"},
    )
    field_system: str | None = None
    field_instruction: str | None = None
    field_input: str | None = None
    field_output: str | None = None

    format: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Customizable to be single line or multi-line. Use {instruction}/{input} as key to be replaced. 'format' can include {input}"
        },
    )
    no_input_format: str | None = Field(
        default=None,
        json_schema_extra={"description": "'no_input_format' cannot include {input}"},
    )


class SFTDataset(BaseModel):
    """SFT configuration subset"""

    path: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "HuggingFace dataset repo | s3:// | gs:// | path to local file or directory"
        },
    )
    split: str | None = Field(
        default=None,
        json_schema_extra={"description": "name of dataset split to load from"},
    )
    type: str | UserDefinedPrompterType | None = Field(
        default=None,
        json_schema_extra={
            "description": "The type of prompt to use for training. [alpaca, gpteacher, oasst, reflection]"
        },
    )
    input_transform: str | None = None
    shards: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "split dataset into N pieces (use with shards_idx)"
        },
    )
    shards_idx: int | None = Field(
        default=None,
        json_schema_extra={"description": "the index of sharded dataset to use"},
    )
    preprocess_shards: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "process dataset in N sequential chunks for memory efficiency (exclusive with `shards`)"
        },
    )
    conversation: str | None = None
    # Do not make this too strict or it will break the validator to choose different dataset class
    chat_template: ChatTemplate | str | None = Field(
        default=None,
        json_schema_extra={
            "description": "The name of the chat template to use for training, following values are supported: tokenizer_default: Uses the chat template that is available in the tokenizer_config.json. If the chat template is not available in the tokenizer, it will raise an error. This is the default. alpaca/inst/chatml/gemma/cohere/llama3/phi_3/deepseek_v2/jamba: These chat templates are available in the axolotl codebase at src/axolotl/utils/chat_templates.py. tokenizer_default_fallback_*: where * is the name of the chat template to fallback to if the tokenizer does not have a chat template else default to tokenizer. E.g. tokenizer_default_fallback_chatml. jinja: Uses a custom jinja template for the chat template. The custom jinja template should be provided in the chat_template_jinja field."
        },
    )
    chat_template_jinja: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Custom jinja chat template or path to jinja file. Used only if `chat_template: jinja` or empty."
        },
    )
    data_files: str | list[str] | None = Field(
        default=None, json_schema_extra={"description": "path to source data files"}
    )
    input_format: str | None = None
    name: str | None = Field(
        default=None,
        json_schema_extra={"description": "name of dataset configuration to load"},
    )
    ds_type: str | None = Field(
        default=None,
        json_schema_extra={"description": "defines the datatype when path is a file"},
    )
    field: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "For `completion` datasets only, uses the provided field instead of `text` column"
        },
    )
    field_human: str | None = None
    field_model: str | None = None
    field_messages: str | None = Field(
        default=None,
        json_schema_extra={
            "description": 'Key containing the messages (default: "messages")'
        },
    )
    field_tools: str | None = Field(
        default=None,
        json_schema_extra={
            "description": 'Key containing the tools (default: "tools"). Must be a list[dict] and follow [JSON schema](https://json-schema.org/learn/getting-started-step-by-step).'
        },
    )
    field_thinking: str | None = Field(
        default=None,
        json_schema_extra={
            "description": 'Key containing the reasoning trace (default: "reasoning_content").'
        },
    )
    template_thinking_key: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "The key the chat template expects that indicates the reasoning trace."
        },
    )
    # deprecated, use message_property_mappings
    message_field_role: str | None = None
    # deprecated, use message_property_mappings
    message_field_content: str | None = None
    message_property_mappings: dict[str, str] | None = Field(
        default=None,
        json_schema_extra={
            "description": "Mapping of properties from the input dataset to the chat template. (default: message_property_mappings={'role':'role', 'content':'content'}) If a property exists in the template but not in this mapping, the system will attempt to load it directly from the message using the property name as the key. Example: In the mapping below, 'from' is loaded from input dataset and used as 'role', while 'value' is loaded and used as 'content' in the chat template."
        },
    )
    message_field_training: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "The key in the message turn that indicates via boolean whether tokens of a turn should be considered for training. Useful to selectively train on certain turns besides the `roles_to_train`."
        },
    )
    message_field_training_detail: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "The key in the message turn that contains the training details. Useful to selectively train on certain tokens in a turn. The value of the key is a List[Dict] containing `begin_offset` (start character index in content), `end_offset` (end character index in content), and `train` (boolean whether to train)."
        },
    )
    split_thinking: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "(for Qwen3 template only) Whether to split the assistant content based on a reasoning trace inside delimited tags"
        },
    )
    logprobs_field: str | None = None
    temperature: float | None = None
    roles_to_train: list[str] | None = Field(
        default=None,
        json_schema_extra={
            "description": "Roles to train on. The tokens from these roles will be considered for the loss."
        },
    )
    train_on_eos: Literal["all", "turn", "last"] | None = Field(
        default=None,
        json_schema_extra={
            "description": "Which EOS tokens to train on in the conversation. Possible values are: all: train on all EOS tokens, turn (default): train on the EOS token at the end of each trainable turn, last: train on the last EOS token in the conversation"
        },
    )
    roles: dict[str, list[str]] | None = Field(
        default=None,
        json_schema_extra={
            "description": 'Roles mapping in the messages. The format is {target_role: [source_roles]}. All source roles will be mapped to the target role. The default is: user: ["human", "user"], assistant: ["gpt", "assistant"], system: ["system"], tool: ["tool"]'
        },
    )
    drop_system_message: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to drop the system turn from the dataset. Only works with chat_template. This does not drop the default system message from chat_template if it exists. If you wish to, we recommend using a custom jinja template with the default system message removed or adding a system turn with empty content."
        },
    )
    trust_remote_code: bool | None = Field(
        default=False,
        json_schema_extra={"description": "Trust remote code for untrusted source"},
    )
    revision: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "The specific revision of the dataset to use when loading from the Hugging Face Hub. This can be a commit hash, tag, or branch name. If not specified, the latest version will be used. This parameter is ignored for local datasets."
        },
    )

    @model_validator(mode="before")
    @classmethod
    def handle_legacy_message_fields(cls, data):
        """Handle backwards compatibility between legacy message field mapping and new property mapping system."""
        return handle_legacy_message_fields_logic(data)

    @model_validator(mode="before")
    @classmethod
    def check_chat_template_config(cls, data):
        if isinstance(data, BaseModel):
            data = data.model_dump()

        # Set chat_template to tokenizer_default if not set
        if data.get("type") == "chat_template" and not data.get("chat_template"):
            data["chat_template"] = ChatTemplate.tokenizer_default

        # if chat_template is set to jinja, chat_template_jinja is required
        if data.get("chat_template") == ChatTemplate.jinja and not data.get(
            "chat_template_jinja"
        ):
            raise ValueError(
                "chat_template_jinja is required when chat_template is set to jinja"
            )

        # If chat_template_jinja is set, set chat_template to jinja
        if data.get("chat_template_jinja") and not data.get("chat_template"):
            data["chat_template"] = ChatTemplate.jinja

        return data


class PretrainingDataset(BaseModel):
    """Pretraining dataset configuration subset"""

    name: str | None = None
    path: str | None = None
    split: str | None = "train"
    text_column: str | None = "text"
    type: str | None = "pretrain"
    trust_remote_code: bool | None = False
    data_files: str | None = None
    skip: int | None = None


class UserDefinedDPOType(BaseModel):
    """User defined typing for DPO"""

    field_system: str | None = None
    field_prompt: str | None = None
    field_chosen: str | None = None
    field_rejected: str | None = None
    prompt_format: str | None = None
    chosen_format: str | None = None
    rejected_format: str | None = None


class DPODataset(BaseModel):
    """DPO configuration subset"""

    path: str | None = None
    split: str | None = None
    type: UserDefinedDPOType | str | None = None
    data_files: list[str] | None = None
    revision: str | None = None
    field_messages: str | None = None


class StepwiseSupervisedDataset(BaseModel):
    """Stepwise supervised dataset configuration subset"""

    path: str | None = None
    split: str | None = None
    data_files: list[str] | None = None
    revision: str | None = None
    step_separator: str | None = None
    max_completion_length: int | None = None
    train_on_last_step_only: bool | None = None


class UserDefinedKTOType(BaseModel):
    """User defined typing for KTO"""

    field_system: str | None = None
    field_prompt: str | None = None
    field_completion: str | None = None
    field_label: bool | None = None
    prompt_format: str | None = None
    completion_format: str | None = None


class KTODataset(BaseModel):
    """KTO configuration subset"""

    path: str | None = None
    split: str | None = None
    type: UserDefinedKTOType | str | None = None
    data_files: list[str] | None = None
    trust_remote_code: bool | None = False
    revision: str | None = None


DatasetConfig = SFTDataset | DPODataset | KTODataset | StepwiseSupervisedDataset


================================================
FILE: src/axolotl/utils/schemas/deprecated.py
================================================
"""Pydantic models for deprecated and remapped configuration parameters"""

from typing import Any

from pydantic import BaseModel, Field, field_validator

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


class DeprecatedParameters(BaseModel):
    """configurations that are deprecated"""

    max_packed_sequence_len: int | None = None
    rope_scaling: Any | None = None
    noisy_embedding_alpha: float | None = None
    dpo_beta: float | None = None
    evaluation_strategy: str | None = None
    eval_table_size: int | None = None
    eval_max_new_tokens: int | None = None
    dpo_use_logits_to_keep: bool | None = None
    dpo_generate_during_eval: bool | None = None

    @field_validator("max_packed_sequence_len")
    @classmethod
    def validate_max_packed_sequence_len(cls, max_packed_sequence_len):
        if max_packed_sequence_len:
            raise DeprecationWarning("`max_packed_sequence_len` is no longer supported")
        return max_packed_sequence_len

    @field_validator("rope_scaling")
    @classmethod
    def validate_rope_scaling(cls, rope_scaling):
        if rope_scaling:
            raise DeprecationWarning(
                "`rope_scaling` is no longer supported, it should now be be a key under `model_config`"
            )
        return rope_scaling

    @field_validator("noisy_embedding_alpha")
    @classmethod
    def validate_noisy_embedding_alpha(cls, noisy_embedding_alpha):
        if noisy_embedding_alpha:
            LOG.warning("noisy_embedding_alpha is deprecated, use neftune_noise_alpha")
        return noisy_embedding_alpha

    @field_validator("dpo_beta")
    @classmethod
    def validate_dpo_beta(cls, dpo_beta):
        if dpo_beta is not None:
            LOG.warning("dpo_beta is deprecated, use rl_beta instead")
        return dpo_beta

    @field_validator("evaluation_strategy")
    @classmethod
    def validate_evaluation_strategy(cls, evaluation_strategy):
        if evaluation_strategy is not None:
            LOG.warning("evaluation_strategy is deprecated, use eval_strategy instead")
        return evaluation_strategy

    @field_validator("eval_table_size")
    @classmethod
    def validate_eval_table_size(cls, eval_table_size):
        if eval_table_size is not None:
            LOG.warning(
                "eval_table_size is deprecated and superseded by generate_samples config. "
                "Please use generate_samples: true and num_generation_samples instead. "
                "The LogPredictionCallback is replaced by the new sample generation feature."
            )
        return eval_table_size

    @field_validator("eval_max_new_tokens")
    @classmethod
    def validate_eval_max_new_tokens(cls, eval_max_new_tokens):
        if eval_max_new_tokens is not None:
            LOG.warning(
                "eval_max_new_tokens is deprecated and superseded by generate_samples config. "
                "Please use generation_max_new_tokens instead."
            )
        return eval_max_new_tokens

    @field_validator("dpo_use_logits_to_keep")
    @classmethod
    def validate_dpo_use_logits_to_keep(cls, dpo_use_logits_to_keep):
        if dpo_use_logits_to_keep is not None:
            raise DeprecationWarning(
                "`dpo_use_logits_to_keep` is no longer supported, "
                "it has been removed in TRL >= 0.29.0"
            )
        return dpo_use_logits_to_keep

    @field_validator("dpo_generate_during_eval")
    @classmethod
    def validate_dpo_generate_during_eval(cls, dpo_generate_during_eval):
        if dpo_generate_during_eval is not None:
            raise DeprecationWarning(
                "`dpo_generate_during_eval` is no longer supported, "
                "it has been removed in TRL >= 0.29.0"
            )
        return dpo_generate_during_eval


class RemappedParameters(BaseModel):
    """Parameters that have been remapped to other names"""

    overrides_of_model_config: dict[str, Any] | None = Field(
        default=None,
        alias="model_config",
        json_schema_extra={
            "description": "optional overrides to the base model configuration"
        },
    )
    overrides_of_model_kwargs: dict[str, Any] | None = Field(
        default=None,
        alias="model_kwargs",
        json_schema_extra={
            "description": "optional overrides the base model loading from_pretrained"
        },
    )
    type_of_model: str | None = Field(
        default=None,
        alias="model_type",
        json_schema_extra={
            "description": "If you want to specify the type of model to load, AutoModelForCausalLM is a good choice too"
        },
    )
    revision_of_model: str | None = Field(
        default=None,
        alias="model_revision",
        json_schema_extra={
            "description": "You can specify to choose a specific model revision from huggingface hub"
        },
    )


================================================
FILE: src/axolotl/utils/schemas/dynamic_checkpoint.py
================================================
"""Schema for dynamic checkpoint configuration."""

from pydantic import BaseModel, Field


class DynamicCheckpointConfig(BaseModel):
    """Configuration for dynamic checkpoint triggering during training."""

    enabled: bool = Field(
        default=False,
        json_schema_extra={
            "description": "Enable dynamic checkpoint triggering during training. "
            "Create a file 'axolotl_checkpoint.save' in the configured `output_dir` to trigger. "
        },
    )
    check_interval: int = Field(
        default=10,
        ge=1,
        json_schema_extra={
            "description": "Check for trigger file every N steps (reduces I/O overhead). "
            "Default: 100"
        },
    )
    trigger_file_path: str = Field(
        default="",
        json_schema_extra={
            "description": "Custom trigger filename (optional). "
            "If not specified, defaults to 'axolotl_checkpoint.save'. "
            "Specify a filename (not a full path) to override the default."
        },
    )


================================================
FILE: src/axolotl/utils/schemas/enums.py
================================================
"""Enums for Axolotl input config"""

from enum import Enum

import torch


class TorchAOQuantDType(Enum):
    int4 = torch.int4
    int8 = torch.int8
    float8_e4m3fn = torch.float8_e4m3fn
    nvfp4 = "nvfp4"
    mxfp4 = "mxfp4"

    def from_string(str):
        if str == "int4":
            return TorchAOQuantDType.int4
        if str == "int8":
            return TorchAOQuantDType.int8
        if str in ["float8_e4m3fn", "fp8", "float8"]:
            return TorchAOQuantDType.float8_e4m3fn
        if str == "nvfp4":
            return TorchAOQuantDType.nvfp4
        if str == "mxfp4":
            return TorchAOQuantDType.mxfp4


class RLType(str, Enum):
    """RL trainer type configuration subset"""

    DPO = "dpo"
    GDPO = "gdpo"
    GRPO = "grpo"
    IPO = "ipo"
    ORPO = "orpo"
    KTO = "kto"
    SIMPO = "simpo"


class ChatTemplate(str, Enum):
    """Chat templates configuration subset"""

    alpaca = "alpaca"
    chatml = "chatml"
    mistral_v1 = "mistral_v1"
    mistral_v2v3 = "mistral_v2v3"
    mistral_v3_tekken = "mistral_v3_tekken"
    mistral_v7_tekken = "mistral_v7_tekken"
    gemma = "gemma"
    cohere = "cohere"
    llama3 = "llama3"
    llama3_2_vision = "llama3_2_vision"
    llama4 = "llama4"
    phi_3 = "phi_3"
    phi_35 = "phi_35"
    deepseek_v2 = "deepseek_v2"
    deepseek_v3 = "deepseek_v3"
    jamba = "jamba"
    jinja = "jinja"
    qwen_25 = "qwen_25"
    qwen3 = "qwen3"
    qwen3_5 = "qwen3_5"
    falcon_h1 = "falcon_h1"
    tokenizer_default = "tokenizer_default"
    exaone = "exaone"
    exaone4 = "exaone4"
    metharme = "metharme"
    pixtral = "pixtral"
    llava = "llava"
    qwen2_vl = "qwen2_vl"
    gemma3 = "gemma3"
    gemma3n = "gemma3n"
    command_a = "command_a"
    command_a_tool_use = "command_a_tool_use"
    command_a_rag = "command_a_rag"
    aya = "aya"


class CustomSupportedOptimizers(str, Enum):
    """Custom supported optimizers"""

    optimi_adamw = "optimi_adamw"
    ao_adamw_4bit = "ao_adamw_4bit"
    ao_adamw_8bit = "ao_adamw_8bit"
    ao_adamw_fp8 = "ao_adamw_fp8"
    adopt_adamw = "adopt_adamw"
    came_pytorch = "came_pytorch"
    muon = "muon"
    dion = "dion"
    flash_adamw = "flash_adamw"
    flash_adam = "flash_adam"
    flash_sgd = "flash_sgd"
    flash_sgdw = "flash_sgdw"
    flash_lion = "flash_lion"


class RingAttnFunc(str, Enum):
    """Enum class for supported `ring-flash-attn` implementations"""

    VARLEN_LLAMA3 = "varlen_llama3"
    BATCH_RING = "batch_ring"
    # VARLEN_RING = "varlen_ring"
    # VARLEN_ZIGZAG = "varlen_zigzag"
    # BATCH_ZIGZAG = "batch_zigzag"
    # BATCH_STRIPE = "batch_stripe"


================================================
FILE: src/axolotl/utils/schemas/fsdp.py
================================================
"""
FSDP Configuration Schema
"""

from typing import Literal

from pydantic import AliasChoices, BaseModel, Field


class FSDPConfig(BaseModel):
    """
    FSDP Configuration Schema
    """

    fsdp_version: int | None = Field(
        validation_alias=AliasChoices("fsdp_version", "version"),
        default=None,
        json_schema_extra={"description": "FSDP version"},
    )
    activation_checkpointing: bool | None = Field(
        default=None,
        description="Enable activation checkpointing to reduce memory usage during forward passes",
    )
    offload_params: bool | None = Field(
        default=None,
        description="Offload parameters to CPU to reduce GPU memory usage",
    )
    sync_module_states: bool | None = Field(
        default=None,
        description="Synchronize module states across all processes",
    )
    cpu_ram_efficient_loading: bool | None = Field(
        default=None,
        description="Enable CPU RAM efficient loading to reduce memory usage during model loading",
    )
    cpu_offload_pin_memory: bool | None = Field(
        default=None,
        description="Disabling this enables swap memory usage for resource-constrained setups when offload_params is enabled.",
    )
    use_orig_params: bool | None = Field(
        default=None,
        description="Use original parameters instead of flattened parameters",
    )

    state_dict_type: (
        Literal["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DICT"] | None
    ) = Field(
        default=None,
        description="Type of state dict to use for saving/loading checkpoints",
    )
    final_state_dict_type: (
        Literal["FULL_STATE_DICT", "LOCAL_STATE_DICT", "SHARDED_STATE_DICT"] | None
    ) = Field(
        default=None,
        description="Final state dict type to use after training completion",
    )

    auto_wrap_policy: Literal["TRANSFORMER_BASED_WRAP", "SIZE_BASED_WRAP"] | None = (
        Field(
            default=None,
            description="Policy for automatically wrapping modules with FSDP",
        )
    )
    transformer_layer_cls_to_wrap: str | None = Field(
        default=None,
        description="Class name of transformer layers to wrap (e.g., 'LlamaDecoderLayer')",
    )

    reshard_after_forward: bool | None = Field(
        default=None,
        description="Reshard parameters after forward pass to save memory",
    )
    mixed_precision_policy: str | None = Field(
        default=None,
        description="Mixed precision policy for FSDP (e.g., 'fp16', 'bf16')",
    )


================================================
FILE: src/axolotl/utils/schemas/integrations.py
================================================
"""Pydantic models for Axolotl integrations"""

from typing import Any

from pydantic import BaseModel, Field, model_validator

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


class MLFlowConfig(BaseModel):
    """MLFlow configuration subset"""

    use_mlflow: bool | None = None
    mlflow_tracking_uri: str | None = Field(
        default=None, json_schema_extra={"description": "URI to mlflow"}
    )
    mlflow_experiment_name: str | None = Field(
        default=None, json_schema_extra={"description": "Your experiment name"}
    )
    mlflow_run_name: str | None = Field(
        default=None, json_schema_extra={"description": "Your run name"}
    )
    hf_mlflow_log_artifacts: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "set to true to copy each saved checkpoint on each save to mlflow artifact registry"
        },
    )


class LISAConfig(BaseModel):
    """LISA configuration subset"""

    lisa_n_layers: int | None = Field(
        default=None,
        json_schema_extra={"description": "the number of activate layers in LISA"},
    )
    lisa_step_interval: int | None = Field(
        default=None,
        json_schema_extra={"description": "how often to switch layers in LISA"},
    )
    lisa_layers_attribute: str | None = Field(
        default="model.layers",
        json_schema_extra={"description": "path under the model to access the layers"},
    )


class WandbConfig(BaseModel):
    """Wandb configuration subset"""

    use_wandb: bool | None = None
    wandb_name: str | None = Field(
        default=None,
        json_schema_extra={"description": "Set the name of your wandb run"},
    )
    wandb_run_id: str | None = Field(
        default=None, json_schema_extra={"description": "Set the ID of your wandb run"}
    )
    wandb_mode: str | None = Field(
        default=None,
        json_schema_extra={
            "description": '"offline" to save run metadata locally and not sync to the server, "disabled" to turn off wandb'
        },
    )
    wandb_project: str | None = Field(
        default=None, json_schema_extra={"description": "Your wandb project name"}
    )
    wandb_entity: str | None = Field(
        default=None,
        json_schema_extra={"description": "A wandb Team name if using a Team"},
    )
    wandb_watch: str | None = None
    wandb_log_model: str | None = Field(
        default=None,
        json_schema_extra={
            "description": '"checkpoint" to log model to wandb Artifacts every `save_steps` or "end" to log only at the end of training'
        },
    )

    @model_validator(mode="before")
    @classmethod
    def check_wandb_run(cls, data):
        if data.get("wandb_run_id") and not data.get("wandb_name"):
            data["wandb_name"] = data.get("wandb_run_id")

            LOG.warning(
                "wandb_run_id sets the ID of the run. If you would like to set the name, please use wandb_name instead."
            )

        return data


class CometConfig(BaseModel):
    """Comet configuration subset"""

    use_comet: bool | None = Field(
        default=None,
        json_schema_extra={"description": "Enable or disable Comet integration."},
    )
    comet_api_key: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "API key for Comet. Recommended to set via `comet login`."
        },
    )
    comet_workspace: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Workspace name in Comet. Defaults to the user's default workspace."
        },
    )
    comet_project_name: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Project name in Comet. Defaults to Uncategorized."
        },
    )
    comet_experiment_key: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Identifier for the experiment. Used to append data to an existing experiment or control the key of new experiments. Default to a random key."
        },
    )
    comet_mode: str | None = Field(
        default=None,
        json_schema_extra={
            "description": 'Create a new experiment ("create") or log to an existing one ("get"). Default ("get_or_create") auto-selects based on configuration.'
        },
    )
    comet_online: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Set to True to log data to Comet server, or False for offline storage. Default is True."
        },
    )
    comet_experiment_config: dict[str, Any] | None = Field(
        default=None,
        json_schema_extra={
            "description": "Dictionary for additional configuration settings, see the doc for more details."
        },
    )


class GradioConfig(BaseModel):
    """Gradio configuration subset"""

    gradio_title: str | None = None
    gradio_share: bool | None = None
    gradio_server_name: str | None = None
    gradio_server_port: int | None = None
    gradio_max_new_tokens: int | None = None
    gradio_temperature: float | None = None


class RayConfig(BaseModel):
    """Ray launcher configuration subset"""

    use_ray: bool = Field(default=False)
    ray_run_name: str | None = Field(
        default=None,
        json_schema_extra={
            "help": "The training results will be saved at `saves/ray_run_name`."
        },
    )
    ray_num_workers: int = Field(
        default=1,
        json_schema_extra={
            "help": "The number of workers for Ray training. Default is 1 worker."
        },
    )
    resources_per_worker: dict = Field(
        default_factory=lambda: {"GPU": 1},
        json_schema_extra={
            "help": "The resources per worker for Ray training. Default is to use 1 GPU per worker."
        },
    )


class OpenTelemetryConfig(BaseModel):
    """OpenTelemetry configuration subset"""

    use_otel_metrics: bool | None = Field(
        default=False,
        json_schema_extra={
            "description": "Enable OpenTelemetry metrics collection and Prometheus export"
        },
    )
    otel_metrics_host: str | None = Field(
        default="localhost",
        json_schema_extra={
            "title": "OpenTelemetry Metrics Host",
            "description": "Host to bind the OpenTelemetry metrics server to",
        },
    )
    otel_metrics_port: int | None = Field(
        default=8000,
        json_schema_extra={
            "description": "Port for the Prometheus metrics HTTP server"
        },
    )


class TrackioConfig(BaseModel):
    """Trackio configuration subset"""

    use_trackio: bool | None = None
    trackio_project_name: str | None = Field(
        default=None,
        json_schema_extra={"description": "Your trackio project name"},
    )
    trackio_run_name: str | None = Field(
        default=None,
        json_schema_extra={"description": "Set the name of your trackio run"},
    )
    trackio_space_id: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Hugging Face Space ID to sync dashboard to (optional, runs locally if not provided)"
        },
    )


================================================
FILE: src/axolotl/utils/schemas/internal/__init__.py
================================================
"""module for gpu capabilities"""

from typing import Optional

from pydantic import BaseModel, Field


class GPUCapabilities(BaseModel):
    """model to manage the gpu capabilities statically"""

    bf16: bool = Field(default=False)
    fp8: bool = Field(default=False)
    tf32: bool = Field(default=False)
    n_gpu: int = Field(default=1)
    n_node: int = Field(default=1)
    compute_capability: Optional[str] = Field(default=None)


class EnvCapabilities(BaseModel):
    """model to manage the environment capabilities statically"""

    torch_version: Optional[str] = Field(default=None)


================================================
FILE: src/axolotl/utils/schemas/model.py
================================================
"""Pydantic models for model input / output, etc. configuration"""

from typing import Any, Literal

from pydantic import BaseModel, Field, field_validator

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


class ModelInputConfig(BaseModel):
    """Model configuration subset"""

    model_config = {"protected_namespaces": ()}

    base_model: str = Field(
        json_schema_extra={
            "description": "This is the huggingface model that contains *.pt, *.safetensors, or *.bin files. This can also be a relative path to a model on disk"
        }
    )
    base_model_config: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "If the base_model repo on hf hub doesn't include configuration .json files, You can set that here, or leave this empty to default to base_model"
        },
    )
    cls_model_config: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "transformers config class (e.g., 'LlamaConfig', 'MistralConfig'). Defaults to AutoConfig."
        },
    )
    tokenizer_config: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Optional tokenizer configuration path in case you want to use a different tokenizer than the one defined in the base model"
        },
    )
    tokenizer_use_fast: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "use_fast option for tokenizer loading from_pretrained, default to True"
        },
    )
    tokenizer_legacy: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to use the legacy tokenizer setting, defaults to True"
        },
    )
    tokenizer_use_mistral_common: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to use mistral-common tokenizer. If set to True, it will use the mistral-common tokenizer."
        },
    )
    tokenizer_type: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Corresponding tokenizer for the model AutoTokenizer is a good choice"
        },
    )
    processor_type: str | None = Field(
        default=None, json_schema_extra={"description": "transformers processor class"}
    )
    tokenizer_save_jinja_files: bool | None = Field(
        default=True,  # match the default behavior from transformers
        json_schema_extra={
            "description": "Whether to save jinja files for tokenizer, transformers default is True"
        },
    )
    trust_remote_code: bool | None = Field(
        default=None,
        json_schema_extra={"description": "Trust remote code for untrusted source"},
    )

    experimental_skip_move_to_device: bool | None = Field(
        default=True,
        json_schema_extra={
            "description": "Don't move the model to the device before sharding. Set to `false` to revert to legacy behavior."
        },
    )

    use_kernels: bool | None = Field(
        default=None,
        json_schema_extra={"description": "Use custom kernels, e.g. MegaBlocks."},
    )

    model_quantization_config: Literal["Mxfp4Config"] | None = Field(
        default=None,
        json_schema_extra={"description": "Model loading quantization config"},
    )
    model_quantization_config_kwargs: dict[str, Any] | None = Field(
        default=None,
        json_schema_extra={"description": "kwargs for model quantization config"},
    )

    @field_validator("trust_remote_code")
    @classmethod
    def hint_trust_remote_code(cls, trust_remote_code):
        if trust_remote_code:
            LOG.warning(
                "`trust_remote_code` is set to true. Please make sure that you reviewed the remote code/model."
            )
        return trust_remote_code


class ModelOutputConfig(BaseModel):
    """model save configuration subset"""

    output_dir: str = Field(
        default="./model-out",
        json_schema_extra={"description": "Where to save the full-finetuned model to"},
    )
    hub_model_id: str | None = Field(
        default=None, json_schema_extra={"description": "push checkpoints to hub"}
    )
    hub_strategy: str | None = Field(
        default=None,
        json_schema_extra={"description": "how to push checkpoints to hub"},
    )
    hub_revision: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "branch/revision to push to on hub (default: main)"
        },
    )
    save_safetensors: bool | None = Field(
        default=True,
        json_schema_extra={
            "description": "Whether to save the model using safetensors format. Defaults to True."
        },
    )

    @field_validator("save_safetensors")
    @classmethod
    def validate_save_safetensors(cls, v):
        if v is False:
            raise ValueError(
                "save_safetensors=False is not supported in Transformers V5. "
                "Transformers V5 always uses safetensors format for model serialization. "
                "This field is deprecated and will be removed in a future version."
            )
        # Allow None and True, will default to True if None
        return True if v is None else v


class SpecialTokensConfig(BaseModel):
    """Special tokens configuration subset"""

    bos_token: str | None = None
    eos_token: str | None = None
    pad_token: str | None = None
    unk_token: str | None = None
    additional_special_tokens: list[str] | None = None


================================================
FILE: src/axolotl/utils/schemas/multimodal.py
================================================
"""Pydantic models for multimodal-related configuration"""

from typing import Literal

from PIL.Image import Resampling
from pydantic import BaseModel, Field, field_validator


class MultiModalConfig(BaseModel):
    """Multi-modal configuration subset"""

    image_size: int | tuple[int, int] | None = Field(
        default=None,
        json_schema_extra={
            "description": (
                "The size of the image to resize to. It can be an integer (resized into padded-square image) or a tuple (width, height)."
                "If not provided, we will attempt to load from preprocessor.size, otherwise, images won't be resized."
            )
        },
    )
    image_resize_algorithm: (
        Literal["bilinear", "bicubic", "lanczos"] | Resampling | None
    ) = Field(
        default=None,
        json_schema_extra={
            "description": "The resampling algorithm to use for image resizing. Default is bilinear. Please refer to PIL.Image.Resampling for more details."
        },
    )

    @field_validator("image_resize_algorithm", mode="before")
    @classmethod
    def convert_image_resize_algorithm(cls, image_resize_algorithm):
        """
        Convert the image resize algorithm to a PIL.Image.Resampling enum.
        """
        if isinstance(image_resize_algorithm, str):
            image_resize_algorithm = image_resize_algorithm.lower()
            if image_resize_algorithm == "bilinear":
                image_resize_algorithm = Resampling.BILINEAR
            elif image_resize_algorithm == "bicubic":
                image_resize_algorithm = Resampling.BICUBIC
            elif image_resize_algorithm == "lanczos":
                image_resize_algorithm = Resampling.LANCZOS
            else:
                raise ValueError(
                    f"Invalid image resize algorithm: {image_resize_algorithm}"
                )
        return image_resize_algorithm


================================================
FILE: src/axolotl/utils/schemas/peft.py
================================================
"""Pydantic models for PEFT-related configuration"""

from typing import Any, Literal

from pydantic import BaseModel, Field, field_validator, model_validator


class LoftQConfig(BaseModel):
    """LoftQ configuration subset"""

    loftq_bits: int = Field(
        default=4, json_schema_extra={"description": "typically 4 bits"}
    )
    # loftq_iter: int = Field(default=1, json_schema_extra={"description": "Alternating iterations for LoftQ"})


class PeftConfig(BaseModel):
    """peftq configuration subset"""

    loftq_config: LoftQConfig | None = Field(
        default=None,
        json_schema_extra={
            "description": "Configuration options for loftq initialization for LoRA"
        },
    )


class LoraConfig(BaseModel):
    """Peft / LoRA configuration subset"""

    load_in_8bit: bool | None = Field(
        default=False,
        json_schema_extra={
            "description": "This will attempt to quantize the model down to 8 bits and use adam 8 bit optimizer"
        },
    )
    load_in_4bit: bool | None = Field(
        default=False, json_schema_extra={"description": "Use bitsandbytes 4 bit"}
    )

    adapter: Literal["lora", "qlora", "llama-adapter"] | None = Field(
        default=None,
        json_schema_extra={
            "description": "If you want to use 'lora', 'qlora', or 'llama-adapter', or leave blank to train all parameters in original model"
        },
    )
    lora_model_dir: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "If you already have a lora model trained that you want to load, put that here. This means after training, if you want to test the model, you should set this to the value of `output_dir`. Note that if you merge an adapter to the base model, a new subdirectory `merged` will be created under the `output_dir`."
        },
    )
    lora_r: int | None = None
    lora_alpha: int | None = None
    lora_fan_in_fan_out: bool | None = None
    lora_target_modules: str | list[str] | None = None
    lora_target_parameters: str | list[str] | None = None
    lora_target_linear: bool | None = Field(
        default=None,
        json_schema_extra={"description": "If true, will target all linear modules"},
    )
    lora_modules_to_save: list[str] | None = Field(
        default=None,
        json_schema_extra={
            "description": "If you added new tokens to the tokenizer, you may need to save some LoRA modules because they need to know the new tokens. For LLaMA and Mistral, you need to save `embed_tokens` and `lm_head`. It may vary for other models. `embed_tokens` converts tokens to embeddings, and `lm_head` converts embeddings to token probabilities."
        },
    )
    lora_dropout: float | None = 0.0
    peft_layers_to_transform: list[int] | None = Field(
        default=None,
        json_schema_extra={
            "description": "The layer indices to transform, otherwise, apply to all layers"
        },
    )
    peft_layers_pattern: list[str] | None = None
    peft: PeftConfig | None = None
    peft_use_dora: bool | None = Field(
        default=None, json_schema_extra={"description": "Whether to use DoRA."}
    )
    peft_use_rslora: bool | None = Field(
        default=None, json_schema_extra={"description": "Whether to use RSLoRA."}
    )
    peft_layer_replication: list[tuple[int, int]] | None = Field(
        default=None,
        json_schema_extra={"description": "List of layer indices to replicate."},
    )
    peft_init_lora_weights: bool | str | None = Field(
        default=None,
        json_schema_extra={
            "description": "How to initialize LoRA weights. Default to True which is MS original implementation."
        },
    )
    peft_trainable_token_indices: list[int] | dict[str, list[int]] | None = Field(
        default=None,
        json_schema_extra={
            "description": (
                "A list of token indices to fine-tune on the `embed_tokens` layer.\n"
                "Otherwise, a dict mapping an embedding layer name to its trainable token indices.\n"
                "See https://huggingface.co/docs/peft/v0.17.0/en/developer_guides/lora#efficiently-train-tokens-alongside-lora"
            )
        },
    )
    peft_ensure_weight_tying: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": (
                "Whether to tie adapter weights for tied model weights. "
                "See https://github.com/huggingface/peft/issues/2864"
            )
        },
    )
    peft_autocast_adapter_dtype: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to upcast the LoRA adapter to fp32. This is enabled by default in PEFT."
        },
    )

    qlora_sharded_model_loading: bool | None = Field(
        default=False,
        json_schema_extra={
            "description": "load qlora model in sharded format for FSDP using answer.ai technique."
        },
    )
    lora_on_cpu: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Do the LoRA/PEFT loading on CPU -- this is required if the base model is so large it takes up most or all of the available GPU VRAM, e.g. during a model and LoRA merge"
        },
    )
    gptq: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether you are training a 4-bit GPTQ quantized model"
        },
    )
    bnb_config_kwargs: dict[str, Any] | None = Field(
        default=None,
        json_schema_extra={
            "description": "optional overrides to the bnb 4bit quantization configuration"
        },
    )

    loraplus_lr_ratio: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "loraplus learning rate ratio lr_B / lr_A. Recommended value is 2^4."
        },
    )
    loraplus_lr_embedding: float | None = Field(
        default=1e-6,
        json_schema_extra={
            "description": "loraplus learning rate for lora embedding layers. Default value is 1e-6."
        },
    )

    merge_lora: bool | None = None

    @model_validator(mode="before")
    @classmethod
    def validate_adapter(cls, data):
        if (
            not data.get("adapter")
            and not data.get("inference")
            and (data.get("load_in_8bit") or data.get("load_in_4bit"))
        ):
            raise ValueError(
                "load_in_8bit and load_in_4bit are not supported without setting an adapter for training."
                "If you want to full finetune, please turn off load_in_8bit and load_in_4bit."
            )
        return data

    @model_validator(mode="after")
    def validate_qlora(self):
        if self.adapter == "qlora":
            if self.merge_lora:
                # can't merge qlora if loaded in 8bit or 4bit
                if self.load_in_8bit:
                    raise ValueError("Can't merge qlora if loaded in 8bit")

                if self.gptq:
                    raise ValueError("Can't merge qlora if gptq")

                if self.load_in_4bit:
                    raise ValueError("Can't merge qlora if loaded in 4bit")

            else:
                if self.load_in_8bit:
                    raise ValueError("Can't load qlora in 8bit")

                if self.gptq:
                    raise ValueError("Can't load qlora if gptq")

                if not self.load_in_4bit:
                    raise ValueError("Require cfg.load_in_4bit to be True for qlora")
        return self

    @field_validator("loraplus_lr_embedding")
    @classmethod
    def convert_loraplus_lr_embedding(cls, loraplus_lr_embedding):
        if loraplus_lr_embedding and isinstance(loraplus_lr_embedding, str):
            loraplus_lr_embedding = float(loraplus_lr_embedding)
        return loraplus_lr_embedding

    @model_validator(mode="before")
    @classmethod
    def validate_lora_dropout(cls, data):
        if data.get("adapter") is not None and data.get("lora_dropout") is None:
            data["lora_dropout"] = 0.0
        return data

    @model_validator(mode="after")
    def validate_lora_target_parameters_dropout(self):
        if (
            self.lora_target_parameters
            and self.lora_dropout
            and self.lora_dropout != 0.0
        ):
            raise ValueError(
                "lora_dropout must be 0 when lora_target_parameters is set. "
                "PEFT's ParamWrapper does not support lora_dropout != 0."
            )
        return self


class ReLoRAConfig(BaseModel):
    """ReLoRA configuration subset"""

    relora: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Whether to use ReLoRA. Use with jagged_restart_*steps options."
        },
    )
    relora_prune_ratio: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "threshold for optimizer magnitude when pruning"
        },
    )
    relora_cpu_offload: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "True to perform lora weight merges on cpu during restarts, for modest gpu memory savings"
        },
    )


================================================
FILE: src/axolotl/utils/schemas/quantization.py
================================================
"""
QAT Config Schema
"""

from typing import Any

from pydantic import BaseModel, Field, field_validator

from axolotl.utils.schemas.enums import TorchAOQuantDType


def validate_ao_dtype(v: Any) -> TorchAOQuantDType | None:
    if v is None:
        return None
    if v == "int4":
        return TorchAOQuantDType.int4
    if v == "int8":
        return TorchAOQuantDType.int8
    if v in ["float8_e4m3fn", "fp8", "float8"]:
        return TorchAOQuantDType.float8_e4m3fn
    if v == "nvfp4":
        return TorchAOQuantDType.nvfp4
    if v == "mxfp4":
        return TorchAOQuantDType.mxfp4

    raise ValueError(
        f"Invalid dtype: '{v}'. Must be one of: {[e.name for e in TorchAOQuantDType] + ['fp8', 'float8']}"
    )


class QATConfig(BaseModel):
    """
    QAT Config Schema
    """

    activation_dtype: TorchAOQuantDType | None = Field(
        default=None,
        description="Fake quantization layout to use for activation quantization.",
    )
    weight_dtype: TorchAOQuantDType = Field(
        default=TorchAOQuantDType.int8,
        description="Fake quantization layout to use for weight quantization.",
    )
    quantize_embedding: bool | None = Field(
        default=False, description="Quantize embedding"
    )
    group_size: int | None = Field(
        default=32,
        description="The number of elements in each group for per-group fake quantization",
    )
    fake_quant_after_n_steps: int | None = Field(
        default=None, description="The number of steps to apply fake quantization after"
    )

    @field_validator("activation_dtype", "weight_dtype", mode="before")
    @classmethod
    def validate_dtype(cls, v: Any) -> TorchAOQuantDType | None:
        return validate_ao_dtype(v)


class PTQConfig(BaseModel):
    """
    PTQ Config Schema
    """

    weight_dtype: TorchAOQuantDType = Field(
        default=TorchAOQuantDType.int8,
        description="Fake quantization layout to use for weight quantization.",
    )
    activation_dtype: TorchAOQuantDType | None = Field(
        default=None,
        description="Fake quantization layout to use for activation quantization.",
    )
    quantize_embedding: bool | None = Field(
        default=None, description="Whether to quantize the embedding layer."
    )
    group_size: int | None = Field(
        default=32,
        description="The number of elements in each group for per-group fake quantization",
    )

    @field_validator("activation_dtype", "weight_dtype", mode="before")
    @classmethod
    def validate_dtype(cls, v: Any) -> TorchAOQuantDType | None:
        return validate_ao_dtype(v)


================================================
FILE: src/axolotl/utils/schemas/training.py
================================================
"""Pydantic models for training hyperparameters"""

from typing import Any, Literal

from pydantic import BaseModel, Field, field_validator
from transformers import SchedulerType
from transformers.training_args import OptimizerNames

from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.enums import CustomSupportedOptimizers

LOG = get_logger(__name__)


class LrGroup(BaseModel):
    """Custom learning rate group configuration"""

    name: str
    modules: list[str]
    lr: float


class HyperparametersConfig(BaseModel):
    """Training hyperparams configuration subset"""

    gradient_accumulation_steps: int | None = Field(
        default=1,
        json_schema_extra={
            "description": "If greater than 1, backpropagation will be skipped and the gradients will be accumulated for the given number of steps."
        },
    )
    micro_batch_size: int | None = Field(
        default=1,
        json_schema_extra={
            "description": "The number of samples to include in each batch. This is the number of samples sent to each GPU. Batch size per gpu = micro_batch_size * gradient_accumulation_steps"
        },
    )
    batch_size: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Total batch size, we do not recommended setting this manually"
        },
    )
    eval_batch_size: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "per gpu micro batch size for evals, defaults to value of micro_batch_size"
        },
    )

    auto_find_batch_size: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "whether to find batch size that fits in memory. Passed to underlying transformers Trainer"
        },
    )

    train_on_inputs: bool | None = Field(
        default=False,
        json_schema_extra={
            "description": "Whether to mask out or include the human's prompt from the training labels"
        },
    )
    group_by_length: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Group similarly sized data to minimize padding. May be slower to start, as it must download and sort the entire dataset. Note that training loss may have an oscillating pattern with this enabled."
        },
    )

    learning_rate: str | float
    embedding_lr: float | None = None
    embedding_lr_scale: float | None = None
    weight_decay: float | None = Field(
        default=0.0, json_schema_extra={"description": "Specify weight decay"}
    )
    optimizer: (OptimizerNames | CustomSupportedOptimizers) | None = Field(
        default=OptimizerNames.ADAMW_TORCH_FUSED,
        json_schema_extra={"description": "Specify optimizer"},
    )
    optim_args: (str | dict[str, Any]) | None = Field(
        default=None,
        json_schema_extra={
            "description": "Dictionary of arguments to pass to the optimizer"
        },
    )
    optim_target_modules: (list[str] | Literal["all_linear"]) | None = Field(
        default=None,
        json_schema_extra={
            "description": "The target modules to optimize, i.e. the module names that you would like to train, right now this is used only for GaLore algorithm"
        },
    )
    torchdistx_path: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Path to torch distx for optim 'adamw_anyprecision'"
        },
    )
    lr_scheduler: (
        SchedulerType | Literal["one_cycle"] | Literal["rex"]
    ) | None = SchedulerType.COSINE
    lr_scheduler_kwargs: dict[str, Any] | None = Field(
        default=None,
        json_schema_extra={
            "description": "Specify a scheduler and kwargs to use with the optimizer"
        },
    )
    lr_quadratic_warmup: bool | None = None
    cosine_min_lr_ratio: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "decay lr to some percentage of the peak lr, e.g. cosine_min_lr_ratio=0.1 for 10% of peak lr"
        },
    )
    cosine_constant_lr_ratio: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "freeze lr at some percentage of the step, e.g. cosine_constant_lr_ratio=0.8 means start cosine_min_lr at 80% of training step"
        },
    )
    lr_div_factor: float | None = Field(
        default=None, json_schema_extra={"description": "Learning rate div factor"}
    )
    lr_groups: list[LrGroup] | None = None

    adam_epsilon: float | None = Field(
        default=None, json_schema_extra={"description": "adamw hyperparams"}
    )
    adam_epsilon2: float | None = Field(
        default=None, json_schema_extra={"description": "only used for CAME Optimizer"}
    )
    adam_beta1: float | None = Field(
        default=None, json_schema_extra={"description": "adamw hyperparams"}
    )
    adam_beta2: float | None = Field(
        default=None, json_schema_extra={"description": "adamw hyperparams"}
    )
    adam_beta3: float | None = Field(
        default=None, json_schema_extra={"description": "only used for CAME Optimizer"}
    )

    dion_lr: float | None = Field(
        default=None, json_schema_extra={"description": "Dion Optimizer learning rate"}
    )
    dion_momentum: float | None = Field(
        default=None, json_schema_extra={"description": "Dion Optimizer momentum"}
    )
    dion_rank_fraction: float | None = Field(
        default=1.0,
        json_schema_extra={
            "description": "Dion Optimizer: r/d fraction for low-rank approximation. Used to compute the low-rank dimension."
        },
    )
    dion_rank_multiple_of: int | None = Field(
        default=1,
        json_schema_extra={
            "description": "Dion Optimizer: Round up the low-rank dimension to a multiple of this number. This may be useful to ensure even sharding."
        },
    )

    max_grad_norm: float | None = Field(
        default=None, json_schema_extra={"description": "Gradient clipping max norm"}
    )
    num_epochs: float = Field(default=1.0)

    @field_validator("batch_size")
    @classmethod
    def hint_batch_size_set(cls, batch_size):
        if batch_size:
            LOG.warning(
                "%s\n%s",
                "batch_size is not recommended. Please use gradient_accumulation_steps instead.",
                "To calculate the equivalent gradient_accumulation_steps, divide batch_size / micro_batch_size / number of gpus.",
            )
        return batch_size

    @field_validator("learning_rate")
    @classmethod
    def convert_learning_rate(cls, learning_rate):
        if learning_rate and isinstance(learning_rate, str):
            learning_rate = float(learning_rate)
        return learning_rate


class JaggedLRConfig(BaseModel):
    """JaggedLR configuration subset, can be used w/ ReLoRA training"""

    jagged_restart_steps: int | None = Field(
        default=None,
        json_schema_extra={"description": "how often to reset for jagged restarts"},
    )
    jagged_restart_warmup_steps: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "how many warmup steps to take after reset for jagged restarts"
        },
    )
    jagged_restart_anneal_steps: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "how many anneal steps to take before reset for jagged restarts"
        },
    )


================================================
FILE: src/axolotl/utils/schemas/trl.py
================================================
"""Pydantic models for TRL trainer configuration"""

from typing import Literal

from pydantic import BaseModel, Field


class TRLConfig(BaseModel):
    """
    Input args for TRL.
    """

    beta: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Beta parameter for the RL training. Same as `rl_beta`. Use"
        },
    )
    max_completion_length: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Maximum length of the completion for RL training."
        },
    )

    # GRPO specific args
    # Ref: https://github.com/huggingface/trl/blob/26d86757a7c7e24e397ea44f57ecce6031dfac01/trl/trainer/grpo_config.py#L23
    use_vllm: bool = Field(
        default=False,
        json_schema_extra={"description": "Whether to use VLLM for RL training."},
    )
    vllm_mode: Literal["server", "colocate"] | None = Field(
        default=None,
        json_schema_extra={
            "description": "VLLM mode to use, one of 'server' or 'colocate'"
        },
    )
    vllm_server_host: str | None = Field(
        default="0.0.0.0",  # nosec B104
        json_schema_extra={"description": "Host of the vLLM server to connect to."},
    )
    vllm_server_port: int | None = Field(
        default=8000,
        json_schema_extra={"description": "Port of the vLLM server to connect to."},
    )
    vllm_server_timeout: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Total timeout (in seconds) to wait for the vLLM server to respond."
        },
    )
    vllm_guided_decoding_regex: str | None = Field(
        default=None,
        json_schema_extra={"description": "Regex for vLLM guided decoding."},
    )

    reward_funcs: list[str] | None = Field(
        default=None,
        json_schema_extra={
            "description": "List of reward functions to load. Paths must be importable from current dir."
        },
    )
    reward_weights: list[float] | None = Field(
        default=None,
        json_schema_extra={
            "description": "List of reward weights for the reward functions."
        },
    )
    num_generations: int | None = Field(
        default=None,
        json_schema_extra={"description": "Number of generations to sample."},
    )
    log_completions: bool | None = Field(
        default=False,
        json_schema_extra={"description": "Whether to log completions."},
    )
    num_completions_to_print: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Number of completions to print when log_completions is True."
        },
    )
    importance_sampling_level: Literal["sequence", "token"] | None = Field(
        default=None,
        json_schema_extra={
            "description": "Controls whether importance sampling ratios are computed at the `'token'` or `'sequence'` level. "
            "For GSPO, use `sequence`, default is None which corresponds to the original GRPO paper."
        },
    )

    sync_ref_model: bool | None = Field(
        default=False,
        json_schema_extra={"description": "Whether to sync the reference model."},
    )
    ref_model_mixup_alpha: float | None = Field(
        default=0.9,
        json_schema_extra={"description": "Mixup alpha for the reference model."},
    )
    ref_model_sync_steps: int | None = Field(
        default=64,
        json_schema_extra={"description": "Sync steps for the reference model."},
    )
    scale_rewards: bool = Field(
        default=True,
        json_schema_extra={
            "description": "Whether to scale rewards by their standard deviation."
        },
    )

    temperature: float | None = Field(
        default=None,
        json_schema_extra={"description": "Sampling temperature for the GRPO policy."},
    )
    top_p: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Top-p sampling probability for the generation policy."
        },
    )
    top_k: int | None = Field(
        default=None,
        json_schema_extra={"description": "Top-k sampling for the generation policy."},
    )
    min_p: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Minimum probability for the generation policy."
        },
    )
    repetition_penalty: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Penalty for tokens that appear in prompt and generated text."
        },
    )
    num_iterations: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Number of iterations per batch (μ) for GRPO."
        },
    )
    epsilon: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Epsilon value for clipping in the GRPO algorithm."
        },
    )
    epsilon_high: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "Upper-bound epsilon value for clipping in the GRPO algorithm."
        },
    )
    use_liger_loss: bool | None = Field(
        default=None,
        json_schema_extra={"description": "Whether to use Liger loss for GRPO."},
    )
    loss_type: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Loss formulation to use. Supported values: grpo, bnpo, dr_grpo."
        },
    )
    mask_truncated_completions: bool = Field(
        default=False,
        json_schema_extra={
            "description": "Whether to exclude truncated completions from loss calculation."
        },
    )
    vllm_enable_sleep_mode: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Enable sleep mode for vLLM to offload VRAM when idle"
        },
    )
    rollout_func: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Path to custom rollout function. Must be importable from current dir."
        },
    )
    multi_objective_aggregation: (
        Literal["sum_then_normalize", "normalize_then_sum"] | None
    ) = Field(
        default=None,
        json_schema_extra={
            "description": "Multi-objective reward aggregation strategy. "
            "'sum_then_normalize' (GRPO default): weights and sums rewards first, then normalizes. "
            "'normalize_then_sum' (GDPO): normalizes each reward independently, then sums."
        },
    )

    # Async GRPO fields
    use_data_producer: bool = Field(
        default=False,
        json_schema_extra={
            "description": "Use the GRPODataProducer protocol for online data generation."
        },
    )
    async_prefetch: bool = Field(
        default=False,
        json_schema_extra={
            "description": "Generate rollouts in a background thread while training on the previous rollout."
        },
    )
    prefetch_depth: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Number of rollouts to prefetch ahead of training."
        },
    )
    vllm_sync_interval: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Sync model weights to vLLM every N optimizer steps (async mode only)."
        },
    )
    streaming_partial_batch: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Score prompt groups incrementally instead of the full batch at once."
        },
    )
    streaming_min_groups: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Minimum prompt groups to score per streaming chunk."
        },
    )
    vllm_importance_sampling_correction: bool | None = Field(
        default=None,
        json_schema_extra={
            "description": "Apply IS correction for distribution mismatch between vLLM and training model."
        },
    )
    vllm_importance_sampling_mode: (
        Literal["token_truncate", "token_mask", "sequence_truncate", "sequence_mask"]
        | None
    ) = Field(
        default=None,
        json_schema_extra={
            "description": "IS mode: token_truncate, token_mask, sequence_truncate, or sequence_mask."
        },
    )
    vllm_importance_sampling_cap: float | None = Field(
        default=None,
        json_schema_extra={"description": "Cap C for IS ratio clipping/masking."},
    )
    off_policy_mask_threshold: float | None = Field(
        default=None,
        json_schema_extra={
            "description": "KL threshold for off-policy sequence masking (OPSM). None = disabled."
        },
    )
    use_bias_correction_kl: bool | None = Field(
        default=None,
        json_schema_extra={"description": "Apply IS correction to KL divergence term."},
    )

    reward_num_workers: int = Field(
        default=1,
        json_schema_extra={
            "description": "Number of persistent subprocess workers for parallel reward computation. Each worker has its "
            "own main thread so signal.alarm() (used by math_verify) works correctly. Work is sharded across "
            "workers by prompt groups. Only used with use_data_producer=True and non-nn.Module reward functions."
        },
    )
    replay_buffer_size: int = Field(
        default=0,
        json_schema_extra={
            "description": "[Experimental, disabled by default] Size of the replay buffer for storing high-signal rollout "
            "groups. When > 0, groups with reward variance are cached and used to replace zero-signal groups "
            "(where all rewards are identical). Set to 0 to disable. Only used with use_data_producer=True."
        },
    )
    replay_recompute_logps: bool = Field(
        default=True,
        json_schema_extra={
            "description": "When True (default), recompute old_per_token_logps for replayed groups using the current "
            "training model. This fixes the importance sampling mismatch that occurs when replaying stale data. "
            "Only relevant when replay_buffer_size > 0."
        },
    )
    reroll_start_fraction: float = Field(
        default=1.0,
        json_schema_extra={
            "description": "Fraction of total training steps after which deferred re-rolling begins. Zero-signal prompts "
            "(where all rewards in a group are identical) are buffered and re-injected into later batches when the "
            "model is more likely to solve them. Set to 1.0 to disable. Only used with use_data_producer=True."
        },
    )
    reroll_max_groups: int = Field(
        default=1,
        json_schema_extra={
            "description": "Maximum number of prompt groups to replace with re-roll candidates per batch. Higher values "
            "increase data utilization but reduce prompt diversity. Only used with use_data_producer=True."
        },
    )
    skip_zero_advantage_batches: bool = Field(
        default=True,
        json_schema_extra={
            "description": "When True, skip gradient computation for micro-batches where all advantages are zero (no learning "
            "signal). This avoids the forward/backward pass entirely when no learning signal is present. The step is "
            "logged with skipped_zero_adv_batches=1 for monitoring."
        },
    )
    vllm_lora_sync: bool = Field(
        default=False,
        json_schema_extra={
            "description": "Sync LoRA adapter to vLLM via filesystem instead of merging + NCCL broadcast. "
            "Auto-selects vllm_serve_lora serve module. Syncs only LoRA adapter weights vs full merged model."
        },
    )


================================================
FILE: src/axolotl/utils/schemas/utils.py
================================================
"""Utilities for Axolotl Pydantic models"""

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def handle_legacy_message_fields_logic(data: dict) -> dict:
    """
    Handle backwards compatibility between legacy message field mapping and new property mapping system.

    Previously, the config only supported mapping 'role' and 'content' fields via dedicated config options:
    - message_field_role: Mapped to the role field
    - message_field_content: Mapped to the content field

    The new system uses message_property_mappings to support arbitrary field mappings:
    message_property_mappings:
        role: source_role_field
        content: source_content_field
        additional_field: source_field

    Args:
        data: Dictionary containing configuration data

    Returns:
        Updated dictionary with message field mappings consolidated

    Raises:
        ValueError: If there are conflicts between legacy and new mappings
    """
    data = data.copy()  # Create a copy to avoid modifying the original

    if data.get("message_property_mappings") is None:
        data["message_property_mappings"] = {}

    # Check for conflicts and handle role
    if "message_field_role" in data:
        LOG.warning(
            "message_field_role is deprecated, use message_property_mappings instead. "
            f"Example: message_property_mappings: {{role: {data['message_field_role']}}}"
        )
        if (
            "role" in data["message_property_mappings"]
            and data["message_property_mappings"]["role"] != data["message_field_role"]
        ):
            raise ValueError(
                f"Conflicting message role fields: message_field_role='{data['message_field_role']}' "
                f"conflicts with message_property_mappings.role='{data['message_property_mappings']['role']}'"
            )
        data["message_property_mappings"]["role"] = data["message_field_role"] or "role"

        del data["message_field_role"]
    elif "role" not in data["message_property_mappings"]:
        data["message_property_mappings"]["role"] = "role"

    # Check for conflicts and handle content
    if "message_field_content" in data:
        LOG.warning(
            "message_field_content is deprecated, use message_property_mappings instead. "
            f"Example: message_property_mappings: {{content: {data['message_field_content']}}}"
        )
        if (
            "content" in data["message_property_mappings"]
            and data["message_property_mappings"]["content"]
            != data["message_field_content"]
        ):
            raise ValueError(
                f"Conflicting message content fields: message_field_content='{data['message_field_content']}' "
                f"conflicts with message_property_mappings.content='{data['message_property_mappings']['content']}'"
            )
        data["message_property_mappings"]["content"] = (
            data["message_field_content"] or "content"
        )

        del data["message_field_content"]
    elif "content" not in data["message_property_mappings"]:
        data["message_property_mappings"]["content"] = "content"

    return data


================================================
FILE: src/axolotl/utils/schemas/validation.py
================================================
"""Module with validation methods for config pydantic model."""

import json
import sys
import tempfile
from pathlib import Path

from pydantic import (
    field_validator,
    model_validator,
)
from transformers.utils.import_utils import is_torch_npu_available

from axolotl.utils.logging import get_logger
from axolotl.utils.schemas.enums import ChatTemplate, RingAttnFunc, RLType

LOG = get_logger(__name__)

SUPPORTED_METRICS = {"sacrebleu", "comet", "ter", "chrf", "perplexity"}


class DatasetValidationMixin:
    """Validation methods related to dataset configuration."""

    @field_validator("seed", mode="after")
    @classmethod
    def set_default_seed(cls, seed):
        if seed is None:
            LOG.info("`seed` not set in config; setting to 42")
            seed = 42
        return seed

    @field_validator("datasets", mode="before")
    @classmethod
    def deprecate_sharegpt_datasets(cls, datasets):
        for _, ds_cfg in enumerate(datasets):
            ds_type = (
                ds_cfg.get("type")
                if isinstance(ds_cfg, dict)
                else getattr(ds_cfg, "type", None)
            )
            if not ds_type:
                continue

            if isinstance(ds_type, dict):
                continue

            if isinstance(ds_type, str) and ds_type.startswith("sharegpt"):
                raise ValueError(
                    "`type: sharegpt.*` is deprecated. Please use `type: chat_template` instead."
                )

        return datasets

    @model_validator(mode="before")
    @classmethod
    def check_dataset_or_pretraining_dataset(cls, data):
        if data.get("datasets") is None and data.get("pretraining_dataset") is None:
            raise ValueError("either datasets or pretraining_dataset is required")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_pretraining_streaming_deprecation(cls, data):
        # TODO(djsaunde): remove this check + implement change for 0.13.0 release
        if data.get("pretraining_dataset") and not data.get("streaming"):
            LOG.warning(
                "Setting `pretraining_dataset` without explicitly setting `streaming: "
                "true` is deprecated. In a future release, streaming will not be "
                "automatically enabled when using pretraining_dataset. Please "
                "explicitly set `streaming: true` in your configuration to maintain "
                "current behavior."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_push_ds_auth(cls, data):
        if (
            data.get("push_dataset_to_hub")
            and data.get("hf_use_auth_token") is not True
        ):
            raise ValueError(
                "Require cfg.hf_use_auth_token to be True for push_dataset_to_hub"
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_val_w_test_datasets(cls, data):
        if data.get("test_datasets") and data.get("val_set_size"):
            raise ValueError(
                "non-zero val_set_size should not be used with test_datasets configuration"
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_test_datasets_bench(cls, data):
        if (
            data.get("do_bench_eval")
            and not data.get("test_datasets")
            and not data.get("val_set_size")
        ):
            LOG.warning(
                "`do_bench_eval` needs a test dataset to run evals, adding an empty test_dataset."
            )
            data["test_datasets"] = [{"path": "axolotl-ai-co/empty-test-ds"}]
        return data

    @model_validator(mode="before")
    @classmethod
    def check_eval_packing(cls, data):
        # TODO also should check test_datasets and val_set_size as we can skip
        # if there are no eval datasets/splits
        if (
            data.get("sample_packing")
            and data.get("eval_table_size")
            and data.get("eval_sample_packing") is not False
        ):
            raise ValueError(
                "eval_table_size and eval_sample_packing are not supported together with sample_packing. Please set 'eval_sample_packing' to false."
            )
        if (
            data.get("sample_packing")
            and data.get("eval_sample_packing") is None
            and not data.get("eval_table_size")
        ):
            LOG.info(
                "explicitly setting `eval_sample_packing` to match `sample_packing`",
            )
            data["eval_sample_packing"] = True

        if (
            data.get("sample_packing")
            and data.get("eval_sample_packing") is False
            and data.get("remove_unused_columns") is None
        ):
            LOG.info(
                "setting `remove_unused_columns: false` for when sample_packing and eval_sample_packing don't match"
            )
            data["remove_unused_columns"] = False

        return data

    @model_validator(mode="before")
    @classmethod
    def check_mm_prepare(cls, data):
        if data.get("skip_prepare_dataset"):
            if data.get("remove_unused_columns") is None:
                LOG.info(
                    "setting `remove_unused_columns: false` for skip_prepare_dataset"
                )
                data["remove_unused_columns"] = False

        return data


class AttentionValidationMixin:
    """Validation methods related to attention mechanisms."""

    @model_validator(mode="before")
    @classmethod
    def check_attention_fields(cls, data):
        fields = (
            "xformers_attention",
            "sdp_attention",
            # "s2_attention",  # requires both FA and this to be enabled
            "flash_attention",
            "flex_attention",
            "sage_attention",
        )
        non_empty_count = sum(1 for field in fields if data.get(field))

        if non_empty_count > 1:
            raise ValueError(f"Only one of {', '.join(fields)} must be set")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_sample_packing_without_attention(cls, data):
        if (
            data.get("sample_packing")
            and not data.get("flash_attention")
            and not data.get("sdp_attention")
            and not data.get("flex_attention")
            and not data.get("xformers_attention")
            and not data.get("sage_attention")
        ):
            LOG.warning(
                "sample_packing without flash, sdp, xformers, sage, or flex attention does not handle cross sample decontamination."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_sample_packing_with_s2attn(cls, data):
        if data.get("sample_packing") and data.get("s2_attention"):
            raise ValueError(
                "Received `sample_packing=true` and `s2_attention=true`; however, \
                shifted-sparse attention does not currently support sample packing."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_scaling_softmax_requires_flex(cls, data):
        if data.get("scaling_softmax") and not data.get("flex_attention"):
            raise ValueError(
                "scaling_softmax requires flex_attention: true\n"
                "Add 'flex_attention: true' to your config file.\n"
            )
        return data


class TrainingValidationMixin:
    """Validation methods related to training configuration."""

    @model_validator(mode="before")
    @classmethod
    def check_batch_size_fields(cls, data):
        fields = ("micro_batch_size", "gradient_accumulation_steps", "batch_size")
        non_empty_count = sum(1 for field in fields if data.get(field))

        if non_empty_count < 2:
            raise ValueError(f"At least two of {', '.join(fields)} must be set")
        return data

    @model_validator(mode="before")
    @classmethod
    def hint_sample_packing_padding(cls, data):
        if data.get("sample_packing"):
            pad_to_sequence_len = data.get("pad_to_sequence_len")
            if pad_to_sequence_len is False:
                LOG.warning(
                    "`pad_to_sequence_len: true` is recommended when using sample_packing"
                )
            elif pad_to_sequence_len is None:
                LOG.info(
                    "Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing"
                )
                data["pad_to_sequence_len"] = True
        return data

    @model_validator(mode="before")
    @classmethod
    def hint_reward_model_pad(cls, data):
        if data.get("reward_model") and not data.get("pad_to_sequence_len"):
            LOG.warning(
                "`pad_to_sequence_len: true` is recommended when using reward_model"
            )
            if data.get("pad_to_sequence_len") is None:
                data["pad_to_sequence_len"] = True
        return data

    @model_validator(mode="before")
    @classmethod
    def set_reward_model_defaults(cls, data):
        if data.get("reward_model"):
            if data.get("num_labels") is None:
                data["num_labels"] = 1
            if not (data.get("type_of_model") or data.get("model_type")):
                data["model_type"] = "AutoModelForSequenceClassification"

        if data.get("process_reward_model"):
            if data.get("num_labels") is None:
                data["num_labels"] = 2
            if not (data.get("type_of_model") or data.get("model_type")):
                data["model_type"] = "AutoModelForTokenClassification"

        return data

    @model_validator(mode="before")
    @classmethod
    def check_gas_bsz(cls, data):
        if data.get("gradient_accumulation_steps") and data.get("batch_size"):
            raise ValueError(
                "please set only one of gradient_accumulation_steps or batch_size"
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def hint_eval_train_mbsz(cls, data):
        if (
            data.get("eval_batch_size")
            and data.get("micro_batch_size")
            and data.get("eval_batch_size") != data.get("micro_batch_size")
        ):
            LOG.warning(
                "eval_batch_size != micro_batch_size. This can lead to VRAM instability."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_warmup(cls, data):
        if data.get("warmup_steps") and data.get("warmup_ratio"):
            raise ValueError("warmup_steps and warmup_ratio are mutually exclusive")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_saves(cls, data):
        if (
            data.get("save_strategy")
            and data.get("save_steps")
            and data.get("save_strategy") != "steps"
        ):
            raise ValueError(
                "save_strategy and save_steps mismatch. Please set save_strategy to 'steps' or remove save_steps."
            )
        if data.get("saves_per_epoch") and data.get("save_steps"):
            raise ValueError(
                "save_steps and saves_per_epoch are mutually exclusive and cannot be used together."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_push_save(cls, data):
        if data.get("hub_model_id") and (
            data.get("save_strategy") not in ["steps", "epoch", None]
        ):
            LOG.warning(
                "hub_model_id is set without any models being saved. To save a model, set save_strategy."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_evals(cls, data):
        if (
            data.get("eval_strategy")
            and data.get("eval_steps")
            and data.get("eval_strategy") != "steps"
        ):
            raise ValueError(
                "eval_strategy and eval_steps mismatch. Please set eval_strategy to 'steps' or remove eval_steps."
            )

        if (
            data.get("val_set_size") == 0
            and (data.get("eval_steps") or data.get("eval_strategy"))
            and not data.get("test_datasets")
            and data.get("eval_strategy") != "no"
        ):
            raise ValueError(
                "eval_steps and eval_strategy are not supported with val_set_size == 0"
            )
        if data.get("evals_per_epoch") and data.get("eval_steps"):
            raise ValueError(
                "eval_steps and evals_per_epoch are mutually exclusive and cannot be used together."
            )
        if (
            data.get("evals_per_epoch")
            and data.get("eval_strategy")
            and data.get("eval_strategy") != "steps"
        ):
            raise ValueError(
                "eval_strategy must be empty or set to `steps` when used with evals_per_epoch."
            )

        if data.get("do_bench_eval") and not (
            data.get("evals_per_epoch") or data.get("eval_steps")
        ):
            raise ValueError(
                "do_bench_eval requires evals_per_epoch or eval_steps to be set."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_neftune(cls, data):
        if data.get("noisy_embedding_alpha") and not data.get("neftune_noise_alpha"):
            data["neftune_noise_alpha"] = data["noisy_embedding_alpha"]
            del data["noisy_embedding_alpha"]
        elif data.get("noisy_embedding_alpha") and data.get("neftune_noise_alpha"):
            raise ValueError(
                "noisy_embedding_alpha is deprecated, use neftune_noise_alpha; both are set, please remove the deprecated noisy_embedding_alpha setting"
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_multipack_buffer_size(cls, data):
        if data.get("pretrain_multipack_buffer_size") and not data.get(
            "streaming_multipack_buffer_size"
        ):
            LOG.warning(
                "`pretrain_multipack_buffer_size` is deprecated in v0.13.0, will be "
                "removed in v0.14.0. Use `streaming_multipack_buffer_size` instead."
            )
            data["streaming_multipack_buffer_size"] = data[
                "pretrain_multipack_buffer_size"
            ]
            del data["pretrain_multipack_buffer_size"]
        elif data.get("pretrain_multipack_buffer_size") and data.get(
            "streaming_multipack_buffer_size"
        ):
            raise ValueError(
                "pretrain_multipack_buffer_size is deprecated, use "
                "streaming_multipack_buffer_size; both are set, please remove the "
                "deprecated pretrain_multipack_buffer_size setting"
            )
        return data

    @model_validator(mode="after")
    def check_fft_possible_bad_config(self):
        if (
            not (self.bf16 or self.bfloat16)
            and (self.fp16 or self.float16)
            and not self.adapter
            and not self.flash_attention
            and self.sample_packing
        ):
            LOG.warning(
                "Full fine tune w/o FA2 w/ sample packing and fp16/float16 is likely to raise errors. Try LoRA."
            )
            # ValueError: Attempting to unscale FP16 gradients.
            # OR
            # RuntimeError: expected mat1 and mat2 to have the same dtype, but got: float != c10::Half
        return self

    @model_validator(mode="before")
    @classmethod
    def check_fp8_config(cls, data):
        if data.get("fp8") and not data.get("torch_compile"):
            LOG.warning(
                "torch_compile is strongly recommended for FP8 training in order to "
                "see speed improvements. Please consider setting `torch_compile: "
                "true` in your config."
            )
        fsdp_config = data.get("fsdp_config") or {}
        if data.get("fp8") and (
            fsdp_config.get("activation_checkpointing", False) is True
            or fsdp_config.get("fsdp_activation_checkpointing", False) is True
        ):
            LOG.warning(
                "FP8 + FSDP2 + activation checkpointing may be slower than BF16 "
                "training. Please considering setting `activation_checkpointing: false` "
                "in your FSDP config."
            )
        if (
            data.get("fp8_enable_fsdp_float8_all_gather")
            and not data.get("fsdp_version", None) == 2
        ):
            raise ValueError(
                "fp8_enable_fsdp_float8_all_gather requires FSDP2 (fsdp_version: 2) "
                "to be used."
            )

        return data

    @model_validator(mode="before")
    @classmethod
    def check_use_reentrant_mismatch(cls, data):
        if (
            data.get("unfrozen_parameters")
            and data.get("gradient_checkpointing_kwargs")
            and data.get("gradient_checkpointing_kwargs", {}).get("use_reentrant")
            is True
        ):
            # https://github.com/huggingface/transformers/issues/21381
            raise ValueError(
                "`use_reentrant` must be false when used with partially frozen model."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_eval_strategy(cls, data):
        if (
            data.get("evaluation_strategy") is not None
            and data.get("eval_strategy") is None
        ):
            LOG.info(
                "explicitly setting `eval_strategy` from the `evaluation_strategy`"
            )
            data["eval_strategy"] = data.get("evaluation_strategy")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_causal_lm_evals(cls, data):
        if data.get("do_causal_lm_eval") and data.get("eval_sample_packing"):
            raise ValueError(
                "do_causal_lm_eval is enabled, eval_sample_packing must be set to False"
            )

        if data.get("eval_causal_lm_metrics"):
            if not isinstance(data.get("eval_causal_lm_metrics"), list):
                raise ValueError("eval_causal_lm_metrics must be a list")
            # only ["sacrebleu", "comet", "ter", "chrf"] supported
            if set(data.get("eval_causal_lm_metrics")) - SUPPORTED_METRICS:
                raise ValueError(
                    f"eval_causal_lm_metrics must be one of {SUPPORTED_METRICS}"
                )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_tokenizer_use_mistral_common(cls, data):
        if data.get("tokenizer_use_mistral_common") is None:
            if any(
                "magistral" in name.lower()
                for name in [
                    data.get("base_model", ""),
                    data.get("base_model_config", ""),
                    data.get("tokenizer_config", ""),
                ]
            ):
                LOG.warning(
                    "tokenizer_use_mistral_common auto inferred to True for Magistral models. Please set it to True explicitly if you want to use mistral-common tokenizer."
                )
                data["tokenizer_use_mistral_common"] = True

        return data

    @field_validator("tokenizer_use_mistral_common", mode="after")
    @classmethod
    def check_mistral_common_import(cls, tokenizer_use_mistral_common):
        if tokenizer_use_mistral_common:
            import importlib.util

            if importlib.util.find_spec("mistral_common") is None:
                raise ImportError(
                    "mistral-common is required for mistral models. Please install it with `pip install axolotl` or `pip install -e .`."
                )

        return tokenizer_use_mistral_common

    @model_validator(mode="before")
    @classmethod
    def check_mistral_common_incompatible_options(cls, data):
        if not data.get("tokenizer_use_mistral_common"):
            return data

        # NOTE: mistral-common tokenizer is not compatible with editing tokenizer at the moment

        if data.get("added_tokens_overrides"):
            raise ValueError(
                "added_tokens_overrides is not supported with mistral-common tokenizer"
            )

        if data.get("special_tokens"):
            raise ValueError(
                "special_tokens override is not supported with mistral-common tokenizer"
            )

        if data.get("tokens"):
            raise ValueError(
                "tokens override is not supported with mistral-common tokenizer"
            )

        if data.get("chat_template"):
            raise ValueError(
                "Setting chat_template is not supported with mistral-common tokenizer"
            )

        return data

    @model_validator(mode="before")
    @classmethod
    def pretrain_with_tps(cls, data):
        if data.get("pretraining_dataset") and data.get(
            "include_tokens_per_second", False
        ):
            # combining these would raise `TypeError: cannot pickle 'dict_keys' object`
            # due to trying to count the number of tokens total in the dataset
            raise ValueError(
                "pretraining_dataset and include_tokens_per_second cannot be used together."
            )

        return data


class LoRAValidationMixin:
    """Validation methods related to LoRA/QLoRA configuration."""

    @model_validator(mode="before")
    @classmethod
    def check_lr_groups(cls, data):
        if data.get("lr_groups") and data.get("loraplus_lr_ratio"):
            raise ValueError("lr_groups and loraplus_lr_ratio cannot be used together.")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_frozen(cls, data):
        if (
            data.get("adapter")
            and data.get("peft_layers_to_transform")
            and data.get("unfrozen_parameters")
        ):
            raise ValueError(
                "`unfrozen_parameters` used with `peft_layers_to_transform` can have unexpected behavior."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_peft_layers_pattern(cls, data):
        if data.get("peft_layers_pattern") and not data.get("peft_layers_to_transform"):
            raise ValueError(
                "peft_layers_pattern requires peft_layers_to_transform to be set"
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_qlora_unsloth(cls, data):
        if (
            data.get("unsloth_lora_mlp")
            or data.get("unsloth_lora_qkv")
            or data.get("unsloth_lora_o")
        ):
            if data.get("adapter") == "lora" and data.get("load_in_8bit"):
                raise ValueError(
                    "unsloth_lora_mlp, unsloth_lora_qkv, and unsloth_lora_o are not compatible with 8-bit LoRA"
                )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_lora_axolotl_unsloth(cls, data):
        is_lora_kernel = any(
            data.get(k) for k in ["lora_mlp_kernel", "lora_qkv_kernel", "lora_o_kernel"]
        )
        is_unsloth_lora = any(
            data.get(k)
            for k in ["unsloth_lora_mlp", "unsloth_lora_qkv", "unsloth_lora_o"]
        )
        if is_lora_kernel and is_unsloth_lora:
            raise ValueError(
                "both lora_mlp_kernel and unsloth_lora_mlp cannot be true (similarly for lora_qkv_kernel, lora_o_kernel)"
            )
        return data

    @model_validator(mode="after")
    def check_fused_lora(self):
        if self.adapter in ["lora", "qlora"] and self.flash_attn_fuse_mlp:
            raise ValueError("Fused modules are not supported with LoRA/QLoRA")
        return self

    @model_validator(mode="before")
    @classmethod
    def warn_qlora_zero3_w_use_reentrant(cls, data):
        if (
            data.get("adapter") == "qlora"
            and data.get("gradient_checkpointing_kwargs", {})
            and data.get("gradient_checkpointing_kwargs", {}).get("use_reentrant")
            is False
            and data.get("deepspeed", "") is not None
            and "zero3" in data.get("deepspeed", "")
        ):
            # may result in:
            # torch.utils.checkpoint.CheckpointError: torch.utils.checkpoint:
            # Recomputed values for the following tensors have different metadata
            # than during the forward pass.
            LOG.warning(
                "qlora + zero3 with use_reentrant: false may result in a CheckpointError about recomputed values"
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_lora_kernels_8bit(cls, data):
        if (
            data.get("lora_mlp_kernel")
            or data.get("lora_qkv_kernel")
            or data.get("lora_o_kernel")
        ):
            if data.get("adapter") == "lora" and data.get("load_in_8bit"):
                raise ValueError(
                    "lora_mlp_kernel, lora_qkv_kernel, and lora_o_kernel are not "
                    "compatible with 8-bit LoRA a the moment."
                )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_lora_kernels_dora(cls, data):
        if (
            data.get("lora_mlp_kernel")
            or data.get("lora_qkv_kernel")
            or data.get("lora_o_kernel")
        ) and data.get("peft_use_dora"):
            raise ValueError(
                "lora_mlp_kernel, lora_qkv_kernel, and lora_o_kernel are not "
                "compatible with DoRA at the moment."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_lora_kernels_trust_remote_code(cls, data):
        if (
            data.get("lora_mlp_kernel")
            or data.get("lora_qkv_kernel")
            or data.get("lora_o_kernel")
        ) and data.get("trust_remote_code"):
            raise ValueError(
                "lora_mlp_kernel, lora_qkv_kernel, and lora_o_kernel are not "
                "compatible with trust_remote_code. Please disable trust_remote_code "
                "or explicitly set lora_*_kernel to false."
            )
        return data


class RLValidationMixin:
    """Validation methods related to RL training configuration."""

    @model_validator(mode="before")
    @classmethod
    def check_sample_packing_w_rl(cls, data):
        if data.get("sample_packing") and data.get("rl"):
            raise ValueError("`sample_packing: true` does not work with RLHF training")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_kto_config(cls, data):
        if data.get("rl") == "kto":
            if data.get("sample_packing") or data.get("eval_sample_packing"):
                raise ValueError("sample_packing is not supported with kto")

            if data.get("remove_unused_columns") is not False:
                raise ValueError("Set `remove_unused_columns: False` when using kto")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_grpo_liger_sequence_parallel(cls, data):
        if (
            data.get("rl") == "grpo"
            and data.get("trl", {})
            and data.get("trl").get("use_liger_loss")
            and data.get("context_parallel_size", 1) > 1
        ):
            raise ValueError("GRPO + SP + Liger not currently supported")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_rl_config_gradient_checkpointing(cls, data):
        # TODO: SalmanMohammadi
        # Distributed RL with QLoRA + gradient checkpointing
        # and use_reentrant = True is broken upstream in TRL

        if (
            data.get("rl")
            and data.get("gradient_checkpointing")
            and data.get("gradient_checkpointing_kwargs")
            and data.get("gradient_checkpointing_kwargs").get("use_reentrant")
            and data.get("load_in_4bit")
            and data.get("adapter") == "qlora"
            and data.get("capabilities")
            and data.get("capabilities").get("n_gpu", 1) > 1
        ):
            raise ValueError(
                "The `use_reentrant: True` implementation of gradient checkpointing "
                "is not supported for distributed RL training with QLoRA. Please set "
                "`use_reentrant: False` in `gradient_checkpointing_kwargs`."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_gdpo(cls, data):
        if (
            data.get("rl") == "gdpo"
            and data.get("trl", {}).get("multi_objective_aggregation")
            == "sum_then_normalize"
        ):
            raise ValueError(
                "`multi_objective_aggregation` value set as `sum_then_normalize` => GRPO, but GDPO was selected"
            )
        return data


class OptimizationValidationMixin:
    """Validation methods related to optimization and performance."""

    @model_validator(mode="after")
    def check_adamw_optimizer_params(self):
        if any([self.adam_beta1, self.adam_beta2, self.adam_epsilon]) and (
            not self.optimizer or "adamw" not in str(self.optimizer).lower()
        ):
            LOG.warning("adamw hyperparameters found, but no adamw optimizer set")
        return self

    @staticmethod
    def _resolve_fsdp_version(data):
        """Resolve FSDP version from top-level fsdp_version or fsdp_config.fsdp_version."""
        fsdp_version = data.get("fsdp_version")
        if fsdp_version is None:
            fsdp_version = data.get("fsdp_config", {}).get("fsdp_version", 1)
        return fsdp_version

    @model_validator(mode="before")
    @classmethod
    def check_muon_deepspeed_fsdp(cls, data):
        if data.get("optimizer") == "muon":
            if data.get("deepspeed"):
                raise ValueError(
                    "Muon optimizer is currently incompatible with DeepSpeed"
                )
            if data.get("fsdp") or data.get("fsdp_config"):
                fsdp_version = cls._resolve_fsdp_version(data)
                if str(fsdp_version) != "2":
                    raise ValueError(
                        "Muon optimizer is only compatible with FSDP2. Set fsdp_version: 2 to use Muon with FSDP."
                    )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_flashoptim_deepspeed_fsdp(cls, data):
        optimizer = data.get("optimizer") or ""
        if str(optimizer).startswith("flash_"):
            if data.get("deepspeed"):
                raise ValueError(
                    f"{optimizer} optimizer is incompatible with DeepSpeed. "
                    "Flash optimizers only support DDP and FSDP2."
                )
            if data.get("fsdp") or data.get("fsdp_config"):
                fsdp_version = cls._resolve_fsdp_version(data)
                if str(fsdp_version) != "2":
                    raise ValueError(
                        f"{optimizer} optimizer is only compatible with FSDP2. "
                        "Set fsdp_version: 2 to use flash optimizers with FSDP."
                    )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_batch_flattening_fa(cls, data):
        if data.get("batch_flattening"):
            batch_flattening_auto = data.get("batch_flattening") == "auto"
            if not data.get("flash_attention") and not batch_flattening_auto:
                raise ValueError("batch_flattening requires flash attention")
            if data.get("sample_packing") and not batch_flattening_auto:
                raise ValueError("batch_flattening not compatible with sample_packing")
            if data.get("micro_batch_size") == 1 and not batch_flattening_auto:
                LOG.warning("batch_flattening has no effect with micro_batch_size == 1")

            if (
                batch_flattening_auto
                and data.get("flash_attention")
                and not data.get("sample_packing")
                and data.get("micro_batch_size") > 1
            ):
                data["batch_flattening"] = True
            elif batch_flattening_auto:
                data["batch_flattening"] = False

        return data

    @model_validator(mode="before")
    @classmethod
    def check_xentropy_patch_conflicts(cls, data):
        if data.get("flash_attn_cross_entropy") and data.get(
            "unsloth_cross_entropy_loss"
        ):
            raise ValueError(
                "flash_attn_cross_entropy and unsloth_cross_entropy_loss cannot be both enabled"
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_cross_entropy_conflicts(cls, data):
        """Check for mutual exclusivity between cross entropy patch options.

        Only one of the following can be enabled at a time:
        - cut_cross_entropy (CutCrossEntropyPlugin)
        - chunked_cross_entropy
        - liger_cross_entropy (LigerPlugin)
        - liger_fused_linear_cross_entropy (LigerPlugin)
        """
        ce_options = {
            "cut_cross_entropy": data.get("cut_cross_entropy"),
            "chunked_cross_entropy": data.get("chunked_cross_entropy"),
            "liger_cross_entropy": data.get("liger_cross_entropy"),
            "liger_fused_linear_cross_entropy": data.get(
                "liger_fused_linear_cross_entropy"
            ),
        }

        enabled_options = [k for k, v in ce_options.items() if v]

        if len(enabled_options) > 1:
            raise ValueError(
                f"Only one cross entropy optimization can be enabled at a time. "
                f"Found {len(enabled_options)} enabled: {', '.join(enabled_options)}. "
                "Please disable all but one."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_fsdp_version(cls, data):
        fsdp_config = data.get("fsdp_config", {})
        if fsdp_config and str(data.get("fsdp_version")) != "2":
            LOG.info(
                "FSDP1 will be deprecated in an upcoming release of Axolotl."
                "We recommend that you use FSDP version 2 for better performance and compatibility. "
                "Please see this link for more details: https://docs.axolotl.ai/docs/multi-gpu.html#sec-fsdp "
                "For more details on migrating your config. "
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_fsdp2_cpu_offload_pin_memory(cls, data):
        if not (fsdp_config := data.get("fsdp_config")):
            return data

        if fsdp_config.get("cpu_offload_pin_memory") is False:
            if str(data.get("fsdp_version")) != "2":
                raise ValueError(
                    "FSDP1 does not support disabling cpu_offload_pin_memory, please set `fsdp_version` to 2"
                )
            if not fsdp_config.get("offload_params"):
                raise ValueError(
                    "disabling cpu_offload_pin_memory requires enabling offload_params"
                )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_fsdp2_base_model_quant_rl(cls, data):
        if data.get("fsdp_version") == 2 and data.get("rl") in [
            RLType.DPO,
            RLType.KTO,
            RLType.ORPO,
            RLType.IPO,
        ]:
            if data.get("load_in_8bit") or data.get("load_in_4bit"):
                raise ValueError(
                    f"FSDP2 does not support load_in_8bit or load_in_4bit with {data.get('rl')}. Please use DeepSpeed or set `fsdp_version` to 1."
                )

        return data

    @model_validator(mode="before")
    @classmethod
    def check_fsdp_config_kwargs_prefix(cls, data):
        if fsdp_config := data.get("fsdp_config"):
            should_fix = False
            for key, _ in fsdp_config.items():
                if key.startswith("fsdp_"):
                    should_fix = True
                    LOG.warning_once(
                        "Configuring FSDP fields with the `fsdp_` prefix is deprecated. "
                        "Please omit the `fsdp_` prefix from the any fields in `fsdp_config`."
                    )
            if should_fix:
                update_fsdp_config = {}
                for key, value in fsdp_config.items():
                    if key.startswith("fsdp_") and key != "fsdp_version":
                        update_fsdp_config[key.replace("fsdp_", "")] = value
                    else:
                        update_fsdp_config[key] = value
                data["fsdp_config"] = update_fsdp_config
        return data

    @model_validator(mode="before")
    @classmethod
    def check_fsdp_version_in_fsdp_config(cls, data):
        fsdp_config = data.get("fsdp_config") or {}
        fsdp_version = data.get("fsdp_version", None)
        if not fsdp_version and fsdp_config and fsdp_config.get("version"):
            fsdp_cfg_version = fsdp_config.pop("version")
            data["fsdp_version"] = fsdp_cfg_version
            data["fsdp_config"]["fsdp_version"] = fsdp_cfg_version
        elif not fsdp_version and fsdp_config and fsdp_config.get("fsdp_version"):
            data["fsdp_version"] = fsdp_config.get("fsdp_version")
        if fsdp_version and fsdp_config and not fsdp_config.get("fsdp_version"):
            data["fsdp_config"]["fsdp_version"] = fsdp_version
        return data

    @model_validator(mode="after")
    def check_fsdp_offload_w_8bit_optimizer(self):
        if (
            hasattr(self, "fsdp_config")
            and self.fsdp_config
            and self.optimizer
            and "8bit" in self.optimizer.value
            and self.fsdp_config.offload_params
            and str(self.fsdp_version) != "2"
        ):
            raise ValueError(
                f"FSDP Offload not compatible with {str(self.optimizer.value)}"
            )
        return self

    @model_validator(mode="after")
    def check_fsdp2_w_8bit_optimizer(self):
        if (
            hasattr(self, "fsdp_config")
            and self.fsdp_config
            and self.optimizer
            and "8bit" in self.optimizer.value
            and str(self.fsdp_version) == "2"
        ):
            if self.optimizer in ["adamw_8bit", "adamw_bnb_8bit"]:
                # CUDA ops errors with bnb 8bit optimizer + FSDP2
                raise ValueError(
                    f"FSDP2 not compatible with {self.optimizer.value}, use `adamw_torch_8bit` instead"
                )

        return self

    @model_validator(mode="before")
    @classmethod
    def check_tensor_parallel_size_update_ds_json(cls, data):
        tensor_parallel_size = data.get("tensor_parallel_size")
        if tensor_parallel_size is not None and tensor_parallel_size > 1:
            if data.get("deepspeed"):
                with open(data.get("deepspeed"), "r", encoding="utf-8") as ds_fin:
                    ds_config = json.load(ds_fin)
                    should_save = False
                    if "tensor_parallel" not in ds_config:
                        ds_config["tensor_parallel"] = {
                            "autotp_size": tensor_parallel_size
                        }
                        should_save = True
                    if (
                        "gather_16bit_weights_on_model_save"
                        not in ds_config["zero_optimization"]
                    ):
                        ds_config["zero_optimization"][
                            "gather_16bit_weights_on_model_save"
                        ] = True
                        should_save = True
                    if should_save:
                        temp_dir = tempfile.mkdtemp()
                        with open(
                            Path(temp_dir) / "autotp_ds.json", "w", encoding="utf-8"
                        ) as ds_fout:
                            json.dump(ds_config, ds_fout, indent=4)
                        data["deepspeed"] = str(Path(temp_dir) / "autotp_ds.json")

        return data

    @model_validator(mode="before")
    @classmethod
    def check_deepcompile(cls, data):
        deepcompile = data.get("deepcompile")
        if deepcompile:
            if not data.get("deepspeed"):
                raise ValueError("DeepCompile is only supported with DeepSpeed")
            with open(data.get("deepspeed"), "r", encoding="utf-8") as ds_fin:
                ds_config = json.load(ds_fin)
                if "compile" not in ds_config:
                    ds_config["compile"] = {"deepcompile": True}
                    temp_dir = tempfile.mkdtemp()
                    with open(
                        Path(temp_dir) / "deepcompile_ds.json", "w", encoding="utf-8"
                    ) as ds_fout:
                        json.dump(ds_config, ds_fout, indent=4)
                    data["deepspeed"] = str(Path(temp_dir) / "deepcompile_ds.json")

        return data


class SystemValidationMixin:
    """Validation methods related to system and hardware configuration."""

    @model_validator(mode="before")
    @classmethod
    def check_mem_mismatch(cls, data):
        if (
            data.get("max_memory") is not None
            and data.get("gpu_memory_limit") is not None
        ):
            raise ValueError(
                "max_memory and gpu_memory_limit are mutually exclusive and cannot be used together."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_fsdp_deepspeed(cls, data):
        if data.get("deepspeed") and data.get("fsdp"):
            raise ValueError("deepspeed and fsdp cannot be used together.")
        return data

    @model_validator(mode="before")
    @classmethod
    def check_model_quantization_config_vs_bnb(cls, data):
        if data.get("model_quantization_config"):
            if data.get("load_in_8bit") or data.get("load_in_4bit"):
                raise ValueError(
                    "model_quantization_config and load_in_8bit or load_in_4bit cannot be used together."
                )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_npu_config(cls, data):
        if is_torch_npu_available():
            # check attention config
            attn_list = ["flash_attention", "sdp_attention", "s2_attention"]
            for attn in attn_list:
                if data.get(attn):
                    raise NotImplementedError(
                        f"{attn} is currently not supported in Ascend npu, please disable this configuration."
                    )

            # check quant config
            if data.get("optimizer") is not None and "bit" in data.get("optimizer"):
                optimizer = data.get("optimizer")
                raise NotImplementedError(
                    f"{optimizer} is currently not supported in Ascend npu, choose another one please."
                )

            quant_list = ["load_in_8bit", "load_in_4bit"]
            for quant in quant_list:
                if data.get(quant):
                    raise NotImplementedError(
                        f"Quantification is currently not supported in Ascend npu, please disable {quant}."
                    )

            # check dtype config
            if data.get("tf32"):
                raise NotImplementedError(
                    "tf32 dtype is currently not supported in Ascend npu, please disable this configuration"
                )

        return data


class ChatTemplateValidationMixin:
    """Validation methods related to chat template configuration."""

    @model_validator(mode="before")
    @classmethod
    def check_chat_template_config(cls, data):
        # if chat_template is set to jinja, chat_template_jinja is required
        if data.get("chat_template") == ChatTemplate.jinja and not data.get(
            "chat_template_jinja"
        ):
            raise ValueError(
                "chat_template_jinja is required when chat_template is set to jinja"
            )

        # If chat_template_jinja is set, set chat_template to jinja
        if data.get("chat_template_jinja") and not data.get("chat_template"):
            data["chat_template"] = ChatTemplate.jinja

        return data


class PretrainingValidationMixin:
    """Validation methods related to pretraining configuration."""

    @model_validator(mode="before")
    @classmethod
    def check_pretraining_w_max_steps(cls, data):
        if data.get("pretraining_dataset") and not data.get("max_steps"):
            raise ValueError(
                "max_steps must be set when using iterable pretraining_dataset, Trainer can't infer length and schedule optimizer/learning rate without it!"
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_pretraining_w_group_by_length(cls, data):
        if data.get("pretraining_dataset") and data.get("group_by_length"):
            LOG.warning(
                "You probably want to disable group_by_length as it will force a streamed dataset to download completely."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_pretraining_split_batches_accelerate(cls, data):
        # alternatively set ACCELERATE_SPLIT_BATCHES=False
        if data.get("pretraining_dataset"):
            accelerator_config = data.get("accelerator_config", {})
            if not accelerator_config:
                data["accelerator_config"] = {
                    "split_batches": False,
                    "dispatch_batches": False,
                }
            else:
                if accelerator_config.get("split_batches") is None:
                    data["accelerator_config"]["split_batches"] = False
                if accelerator_config.get("dispatch_batches") is None:
                    data["accelerator_config"]["dispatch_batches"] = False
        return data

    @model_validator(mode="before")
    @classmethod
    def check_pretraining_w_val_set_size(cls, data):
        if data.get("pretraining_dataset") and data.get("val_set_size"):
            raise ValueError(
                "val_set_size is not supported with pretraining_dataset. "
                "Use test_datasets to specify evaluation datasets for pretraining."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_streaming_w_val_set_size(cls, data):
        if data.get("streaming") and data.get("val_set_size"):
            raise ValueError(
                "val_set_size is not supported with streaming datasets. "
                "Use test_datasets to specify evaluation datasets when streaming is enabled."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_streaming_w_max_steps(cls, data):
        if data.get("streaming") and not data.get("max_steps"):
            raise ValueError(
                "max_steps must be set when using streaming datasets. "
                "Trainer cannot infer dataset length for iterable datasets."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_streaming_w_multiple_datasets(cls, data):
        if (
            data.get("streaming")
            and data.get("sample_packing")
            and data.get("datasets")
            and len(data.get("datasets")) > 1
        ):
            raise NotImplementedError(
                "Sample packing with multiple streaming datasets is not yet supported"
            )
        return data


class ModelCompatibilityValidationMixin:
    """Validation methods for specific model compatibility."""

    @model_validator(mode="after")
    def check_falcon_fsdp(self):
        if (self.base_model and "falcon" in self.base_model.lower()) and self.fsdp:
            raise ValueError("FSDP is not supported for falcon models")
        return self

    @model_validator(mode="after")
    def check_mpt_checkpointing(self):
        if (
            self.base_model and "mpt" in self.base_model.lower()
        ) and self.gradient_checkpointing:
            raise ValueError("gradient_checkpointing is not supported for MPT models")
        return self

    @model_validator(mode="after")
    def check_gradient_checkpointing_w_offload(self):
        if self.gradient_checkpointing == "offload":
            LOG.warning(
                "`offload` is deprecated for gradient_checkpointing, use `activation_offloading: true` or `activation_offloading: legacy`"
            )
            self.gradient_checkpointing = True
            LOG.warning(
                "`offload` now uses a new stream implementation; to use the previous implementation, use `activation_offloading: legacy`"
            )
            self.activation_offloading = True
        if self.gradient_checkpointing == "offload_disk":
            LOG.warning(
                "`offload_disk` is deprecated for gradient_checkpointing, use `activation_offloading: disk`"
            )
            self.gradient_checkpointing = True
            self.activation_offloading = "disk"
        return self

    @model_validator(mode="after")
    def check_activation_offloading_wo_gc(self):
        if self.activation_offloading and not self.gradient_checkpointing:
            raise ValueError("activation_offloading requires gradient_checkpointing")
        return self

    @model_validator(mode="after")
    def check_better_transformers(self):
        if self.flash_optimum is True:
            if self.adapter:
                LOG.warning(
                    "BetterTransformers probably doesn't work with PEFT adapters"
                )
            if self.fp16 or self.bf16:
                raise ValueError("AMP is not supported with BetterTransformer")
            if self.float16 is not True and self.bfloat16 is not True:
                LOG.warning(
                    "You should probably set bfloat16 or float16 to true to "
                    "load the model in float16 for BetterTransformers"
                )
        return self

    @model_validator(mode="before")
    @classmethod
    def check_gptq_w_revision(cls, data):
        if data.get("gptq") and data.get("revision_of_model"):
            raise ValueError(
                "revision_of_model is not supported for GPTQ models. "
                + "Please download the model from HuggingFace Hub manually for correct branch, "
                + "point to its path, and remove revision_of_model from the config."
            )
        return data

    @model_validator(mode="before")
    @classmethod
    def check_gpt_oss_fsdp_loading(cls, data):
        if data.get("model_quantization_config", "") == "Mxfp4Config":
            fsdp_config = data.get("fsdp_config") or {}
            if fsdp_config.get("cpu_ram_efficient_loading", False) is True:
                raise ValueError(
                    "FSDP cpu_ram_efficient_loading is not supported for Mxfp4Config model quantization."
                )
        return data


class ComplexValidationMixin:
    """Complex validation methods that involve multiple systems."""

    @field_validator("neftune_noise_alpha")
    @classmethod
    def validate_neftune_noise_alpha(cls, neftune_noise_alpha):
        if neftune_noise_alpha is not None and neftune_noise_alpha <= 0.0:
            raise ValueError("neftune_noise_alpha must be > 0.0")
        return neftune_noise_alpha

    @model_validator(mode="after")
    def check_rl_beta(self):
        if self.dpo_beta and not self.rl_beta:
            self.rl_beta = self.dpo_beta
            del self.dpo_beta
        return self

    @model_validator(mode="after")
    def check_simpo_warmup(self):
        if self.rl is RLType.SIMPO and self.warmup_ratio:
            raise ValueError(
                "warmup_ratio is not supported with the simpo trainer. Please use `warmup_steps` instead"
            )
        return self

    @model_validator(mode="after")
    def check_relora(self):
        if self.relora:
            if not self.jagged_restart_steps:
                raise ValueError("jagged_restart_steps must be set to use ReLoRA")
            if self.adapter not in ("lora", "qlora"):
                raise ValueError("cfg.adapter must be lora or qlora to use ReLoRA")

            if self.fsdp or self.fsdp_config:
                raise ValueError("fsdp not supported with ReLoRA")

            if self.deepspeed:
                raise ValueError("deepspeed not supported with ReLoRA")

            if self.lr_scheduler == "one_cycle":
                raise ValueError(
                    "ReLoRA is not compatible with the one_cycle scheduler"
                )

            if self.flash_attn_fuse_mlp:
                raise ValueError("Fused modules are not supported with ReLoRA")
        return self

    @model_validator(mode="after")
    def check_early_stopping(self):
        if self.early_stopping_patience:
            if not self.save_steps or not self.eval_steps:
                raise ValueError(
                    "`early_stopping_patience` requires save_steps and eval_steps to be set. eval_steps should evenly divide save_steps."
                )
            if self.save_steps % self.eval_steps != 0:
                raise ValueError(
                    "`early_stopping_patience` requires that eval_steps should evenly divide save_steps."
                )
        return self

    @model_validator(mode="after")
    def check_tensor_parallel_size(self):
        if not self.tensor_parallel_size:
            self.tensor_parallel_size = 1
        return self

    @model_validator(mode="after")
    def check_context_parallel_size(self):
        if self.sequence_parallel_degree and not self.context_parallel_size:
            LOG.warning(
                "`sequence_parallel_degree` is deprecated, use `context_parallel_size`"
            )
            self.context_parallel_size = self.sequence_parallel_degree
        if not self.context_parallel_size:
            self.context_parallel_size = 1
        elif self.context_parallel_size > 1:
            if not self.flash_attention:
                raise ValueError(
                    "flash_attention: true must be set with context_parallel_size > 1"
                )

            if self.sample_packing and self.micro_batch_size > 1:
                raise ValueError(
                    "micro_batch_size must be set to 1 when sample_packing is enabled "
                    "due to a `ring-flash-attn` requirement"
                )

            try:
                import transformers.modeling_flash_attention_utils
                from transformers.utils import is_flash_attn_greater_or_equal

                transformers.modeling_flash_attention_utils._flash_supports_window = (
                    True
                )
                sys.modules[
                    "transformers.modeling_flash_attention_utils"
                ]._flash_supports_window = True
                sys.modules[
                    "transformers.modeling_flash_attention_utils"
                ]._flash_supports_window_size = True
                sys.modules[
                    "transformers.modeling_flash_attention_utils"
                ].is_flash_attn_greater_or_equal = is_flash_attn_greater_or_equal
                import ring_flash_attn  # noqa: F401  # Required after monkey-patching
            except ImportError as exception:
                raise ImportError(
                    "context_parallel_size > 1 but ring_flash_attn is not installed. "
                    "Please install it with `pip install axolotl[ring-flash-attn] "
                    "or `pip install ring-flash-attn>=0.1.4`."
                ) from exception

            LOG.warning(
                "Sequence parallelism (SP) is enabled with "
                f"context_parallel_size={self.context_parallel_size}. "
                "Please note that logged losses may differ slightly to the non-SP "
                "losses due to transformers Trainer implementation details. "
                "Please see https://github.com/axolotl-ai-cloud/axolotl/pull/2495#issuecomment-2784022042 "
                "for more details."
            )

        return self

    @model_validator(mode="after")
    def validate_ring_attn_func(self):
        if getattr(self, "context_parallel_size", 1) == 1:
            return self

        if self.ring_attn_func is not None:
            self.ring_attn_func = RingAttnFunc(self.ring_attn_func)
        else:
            # Default ring attention function selection
            sample_packing = getattr(self, "sample_packing", False)
            self.ring_attn_func = (
                RingAttnFunc.VARLEN_LLAMA3
                if sample_packing
                else RingAttnFunc.BATCH_RING
            )

        return self

    def hint_gradient_checkpointing_dpo_lora_ddp(self):
        if (
            (self.gradient_checkpointing is True or self.gradient_checkpointing is None)
            and self.capabilities
            and self.capabilities.get("n_gpu", 1) > 1
            and self.adapter in ("lora", "qlora")
            and self.rl == RLType.DPO
            and not self.fsdp
            and not self.deepspeed
        ):
            LOG.warning(
                "gradient_checkpointing with DPO + DDP + LoRA is not recommended."
            )
        return self


class DistributedValidationMixin:
    """validation for distributed training."""

    @model_validator(mode="after")
    def check_tensor_parallel_optimizer(self):
        if self.tensor_parallel_size > 1:
            if self.optimizer in ["paged_adamw_8bit", "adamw_8bit", "adamw_bnb_8bit"]:
                raise ValueError(
                    "tensor_parallel_size is not supported with paged_adamw_8bit, adamw_8bit, and adamw_bnb_8bit optimizers"
                )

        return self


class GRPOVllmValidationMixin:
    """Validation mixin for vllm when using GRPO."""

    @model_validator(mode="after")
    def check_vllm_mode_set(self):
        if self.trl and self.trl.use_vllm and not self.trl.vllm_mode:
            LOG.warning(
                "vllm_mode must be set to either `server` or `colocate` when using vllm, using default value `server`"
            )
            self.trl.vllm_mode = "server"
        return self


class ValidationMixin(
    DatasetValidationMixin,
    AttentionValidationMixin,
    TrainingValidationMixin,
    LoRAValidationMixin,
    RLValidationMixin,
    OptimizationValidationMixin,
    SystemValidationMixin,
    ChatTemplateValidationMixin,
    PretrainingValidationMixin,
    ModelCompatibilityValidationMixin,
    ComplexValidationMixin,
    GRPOVllmValidationMixin,
):
    """Full validation mixin for Axolotl configuration."""


================================================
FILE: src/axolotl/utils/schemas/vllm.py
================================================
"""
Pydantic models for VLLM configuration, used primarily for RL training with TRL + grpo
"""

from pydantic import BaseModel, Field


class VllmConfig(BaseModel):
    """
    Configuration for VLLM server
    """

    device: str | None = Field(
        default="auto",
        json_schema_extra={"description": "Device to use for VLLM"},
    )
    tensor_parallel_size: int | None = Field(
        default=None,
        json_schema_extra={"description": "Tensor parallel size for VLLM"},
    )
    data_parallel_size: int | None = Field(
        default=None,
        json_schema_extra={"description": "Data parallel size for VLLM"},
    )
    gpu_memory_utilization: float | None = Field(
        default=0.9,
        json_schema_extra={"description": "GPU memory utilization for VLLM"},
    )
    dtype: str | None = Field(
        default="auto",
        json_schema_extra={"description": "Data type for VLLM"},
    )
    max_model_len: int | None = Field(
        default=None,
        json_schema_extra={
            "description": "Maximum length of the model context for VLLM"
        },
    )
    enable_prefix_caching: bool | None = Field(
        default=None,
        json_schema_extra={"description": "Enable prefix caching for VLLM"},
    )
    host: str | None = Field(
        default="0.0.0.0",  # nosec B104
        json_schema_extra={"description": "Host for the vLLM server to start on"},
    )
    port: int | None = Field(
        default=8000,
        json_schema_extra={"description": "Port of the vLLM server to start on"},
    )

    enable_reasoning: bool | None = Field(
        default=None,
        json_schema_extra={"description": "Enable reasoning for VLLM"},
    )
    reasoning_parser: str | None = Field(
        default=None,
        json_schema_extra={"description": "Reasoning parser for VLLM"},
    )
    serve_module: str | None = Field(
        default=None,
        json_schema_extra={
            "description": "Python module for vLLM serve script. Set to 'axolotl.scripts.vllm_serve_lora' "
            "for native LoRA support, or leave None for default TRL serve."
        },
    )


================================================
FILE: src/axolotl/utils/tee.py
================================================
"""
Utilities for managing the debug log file and providing a file-only stream for logging
handlers.
"""

from __future__ import annotations

import io
import os
import sys
import threading
from pathlib import Path
from typing import TextIO, cast

_lock = threading.Lock()
_file_handle: io.TextIOWrapper | None = None
_log_path: str | None = None
_tee_installed: bool = False
_orig_stdout: TextIO | None = None
_orig_stderr: TextIO | None = None


class _FileOnlyWriter(io.TextIOBase):
    """A stream-like object that writes only to the tee file.

    Before the file is prepared, writes are dropped (no-op).
    """

    def write(self, s: str) -> int:  # type: ignore[override]
        with _lock:
            if _file_handle is not None:
                _file_handle.write(s)
                return len(s)
            return len(s)

    def flush(self) -> None:  # type: ignore[override]
        with _lock:
            if _file_handle is not None:
                try:
                    _file_handle.flush()
                except Exception:
                    pass


file_only_stream: io.TextIOBase = _FileOnlyWriter()


class _StreamTee(io.TextIOBase):
    """A minimal tee that mirrors writes to the debug log file.

    Installed only after the debug log is prepared; no buffering.
    """

    def __init__(self, stream: io.TextIOBase):
        self._stream = stream

    def write(self, s: str) -> int:  # type: ignore[override]
        with _lock:
            n = self._stream.write(s)
            if _file_handle is not None:
                _file_handle.write(s)
            return n

    def flush(self) -> None:  # type: ignore[override]
        with _lock:
            self._stream.flush()
            if _file_handle is not None:
                try:
                    _file_handle.flush()
                except Exception:
                    pass

    @property
    def encoding(self):  # type: ignore[override]
        return getattr(self._stream, "encoding", None)

    @property
    def errors(self):  # type: ignore[override]
        return getattr(self._stream, "errors", None)

    def isatty(self):  # type: ignore[override]
        return getattr(self._stream, "isatty", lambda: False)()

    def fileno(self):  # type: ignore[override]
        if hasattr(self._stream, "fileno"):
            return self._stream.fileno()
        raise OSError("Underlying stream has no fileno")


def prepare_debug_log(cfg, filename: str = "debug.log") -> str:
    """
    Prepare the debug log.

    Creates the output directory, handles append/truncate logic based on cfg, and opens
    the debug log file for subsequent writes via file-only handlers.
    """
    global _file_handle, _log_path, _tee_installed

    with _lock:
        # If already initialized, reuse existing path
        if _log_path is not None:
            return _log_path

        output_dir = cfg.output_dir
        os.makedirs(output_dir, exist_ok=True)

        log_path = Path(output_dir) / filename
        append = bool(
            cfg.get("resume_from_checkpoint") or cfg.get("auto_resume_from_checkpoints")
        )

        if not append:
            log_path.unlink(missing_ok=True)

        fh = open(log_path, "a", encoding="utf-8")
        fh.flush()

        _file_handle = fh
        _log_path = str(log_path)

        # Install a tee so stdout/stderr are mirrored to the debug file
        # Allow disabling via env for testing or advanced usage.
        tee_enabled = os.getenv("AXOLOTL_TEE_STDOUT", "1").lower() not in {
            "0",
            "false",
            "no",
        }
        if tee_enabled and not _tee_installed:
            # Save originals so we can restore later (e.g., tests)
            global _orig_stdout, _orig_stderr
            _orig_stdout = sys.stdout
            _orig_stderr = sys.stderr
            sys.stdout = _StreamTee(cast(io.TextIOBase, sys.stdout))
            sys.stderr = _StreamTee(cast(io.TextIOBase, sys.stderr))
            _tee_installed = True

        return _log_path


def close_debug_log() -> None:
    """Flush and close the debug log and uninstall the stdout/stderr tee.

    Safe to call even if not initialized.
    """
    global _file_handle, _log_path, _tee_installed, _orig_stdout, _orig_stderr
    with _lock:
        # Restore original stdout/stderr if we installed a tee
        if _tee_installed:
            if _orig_stdout is not None:
                sys.stdout = _orig_stdout
            if _orig_stderr is not None:
                sys.stderr = _orig_stderr
            _tee_installed = False
            _orig_stdout = None
            _orig_stderr = None

        # Close the file handle if open
        if _file_handle is not None:
            try:
                _file_handle.flush()
                _file_handle.close()
            except Exception:
                pass
            finally:
                _file_handle = None
        _log_path = None


================================================
FILE: src/axolotl/utils/tokenization.py
================================================
"""Module for tokenization utilities"""

from termcolor import colored

from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def check_dataset_labels(
    dataset,
    tokenizer,
    num_examples=5,
    text_only=False,
    rl_mode=False,
):
    # the dataset is already shuffled, so let's just check the first 5 elements
    for idx in range(num_examples):
        if not rl_mode:
            check_example_labels(dataset[idx], tokenizer, text_only=text_only)
        else:
            check_rl_example_labels(dataset[idx], tokenizer, text_only=text_only)


def check_example_labels(example, tokenizer, text_only=False):
    # Get the input_ids, labels, and attention_mask from the dataset
    input_ids = example["input_ids"]
    labels = example["labels"]
    target_mask = example.pop("target_mask", None)

    # You can compare the input_ids and labels element-wise
    # Remember to ignore positions with IGNORE_TOKEN_ID (if you use it) or attention_mask equal to 0
    colored_tokens = []
    for _, (input_id, label_id) in enumerate(zip(input_ids, labels, strict=False)):
        decoded_input_token = tokenizer.decode(input_id)
        # Choose the color based on whether the label has the ignore value or not
        color = "red" if label_id == -100 else ("yellow" if label_id == 0 else "green")
        colored_token = colored(decoded_input_token, color) + (
            not text_only and colored(f"({label_id}, {input_id})", "white") or ""
        )
        colored_tokens.append(colored_token)

    delimiter = "" if text_only else " "
    LOG.info(delimiter.join(colored_tokens))
    LOG.info("\n\n\n")
    target_labels_count = sum(label_id != -100 for label_id in labels)
    total_len = len(input_ids)
    LOG.info(f"Total input len: {total_len}")
    LOG.info(f"Count of labels: {target_labels_count}")
    if target_mask:
        target_mask_positions = sum(m[0] for m in target_mask)
        LOG.info(f"Number of positions in target_mask: {target_mask_positions}")

    return " ".join(colored_tokens)


def color_token_for_rl_debug(decoded_token, encoded_token, color, text_only):
    """Helper function to color tokens based on their type."""
    colored_text = colored(decoded_token, color)
    return (
        colored_text
        if text_only
        else f"{colored_text}{colored(f'({encoded_token})', 'white')}"
    )


def process_tokens_for_rl_debug(tokens, color, tokenizer, text_only):
    """Helper function to process and color tokens."""
    colored_tokens = [
        color_token_for_rl_debug(tokenizer.decode(token), token, color, text_only)
        for token in tokenizer.encode(tokens, add_special_tokens=False)
    ]
    return colored_tokens


def check_rl_example_labels(example, tokenizer, text_only=False):
    field_prompt, field_chosen, field_rejected, field_completion = (
        "prompt",
        "chosen",
        "rejected",
        "completion",
    )

    input_tokens = example[field_prompt]

    labels_chosen = example.get(field_chosen)
    labels_rejected = example.get(field_rejected)
    labels_completion = example.get(field_completion)

    # Create a delimiter based on text_only flag
    delimiter = "" if text_only else " "

    # Process and color each type of token
    colored_tokens = process_tokens_for_rl_debug(
        input_tokens, "yellow", tokenizer, text_only
    )

    # Process tokens
    if labels_completion is None:
        colored_chosens = process_tokens_for_rl_debug(
            labels_chosen, "green", tokenizer, text_only
        )
        colored_rejecteds = process_tokens_for_rl_debug(
            labels_rejected, "red", tokenizer, text_only
        )
    else:
        colored_completion = process_tokens_for_rl_debug(
            labels_completion, "green", tokenizer, text_only
        )

    # Logging information
    LOG.info(f"INPUT PROMPT: {delimiter.join(colored_tokens)}\n\n")

    if labels_completion is None:
        LOG.info(f"CHOSEN RESPONSE: {delimiter.join(colored_chosens)}\n\n")
        LOG.info(f"REJECTED RESPONSE: {delimiter.join(colored_rejecteds)}\n\n\n")
    else:
        LOG.info(f"COMPLETION RESPONSE: {delimiter.join(colored_completion)}\n\n\n")

    return delimiter.join(colored_tokens)


================================================
FILE: src/axolotl/utils/trackio_.py
================================================
"""Module for trackio utilities"""

import os

from axolotl.utils.dict import DictDefault


def setup_trackio_env_vars(cfg: DictDefault):
    for key in cfg.keys():
        if key.startswith("trackio_"):
            value = cfg.get(key, "")

            if value and isinstance(value, str) and len(value) > 0:
                os.environ[key.upper()] = value

    if cfg.trackio_project_name and len(cfg.trackio_project_name) > 0:
        cfg.use_trackio = True


================================================
FILE: src/axolotl/utils/train.py
================================================
"""Training utils for checkpoints"""

from pathlib import Path

from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


def determine_last_checkpoint(cfg: DictDefault, update: bool = True) -> str | None:
    """
    Determine the checkpoint to resume from based on configuration.

    Args:
        cfg: Dictionary mapping `axolotl` config keys to values.
        update: Whether to update the config with the determined checkpoint

    Returns:
        Path to the checkpoint to resume from, or `None` if not resuming.
    """
    last_checkpoint = None
    checkpoints = sorted(
        (
            p
            for p in Path(cfg.output_dir).glob("checkpoint-*")
            if p.name.split("-")[-1].isdigit()
        ),
        key=lambda p: int(p.name.split("-")[-1]),
    )
    if checkpoints:
        last_checkpoint = str(checkpoints[-1])
        if not update:
            LOG.info(f"Resuming from last checkpoint at {last_checkpoint}")
            return last_checkpoint

    if (
        cfg.resume_from_checkpoint is None
        and cfg.auto_resume_from_checkpoints
        and last_checkpoint is not None
    ):
        cfg.resume_from_checkpoint = last_checkpoint
        LOG.info(
            "Using auto-resume functionality to resume from checkpoint at "
            f"{cfg.resume_from_checkpoint}"
        )
    return cfg.resume_from_checkpoint


================================================
FILE: src/axolotl/utils/trainer.py
================================================
"""Module containing the Trainer class and related functions"""

import json
import math
import os
import random
from contextlib import contextmanager
from functools import partial
from tempfile import NamedTemporaryFile
from typing import List, Optional

import numpy as np
import torch
import torch.cuda
from datasets import IterableDataset, disable_caching, enable_caching
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from transformers.utils import is_torch_bf16_gpu_available

from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import init_distributed_state, reduce_and_broadcast
from axolotl.utils.environment import check_cuda_p2p_ib_support
from axolotl.utils.logging import get_logger
from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths

LOG = get_logger(__name__)


@torch.jit.script
def weighted_cross_entropy(
    logits: torch.Tensor, labels: torch.Tensor, weights: torch.Tensor
):
    # Flatten the logits, labels, and weights tensors
    logits = logits.view(
        -1, logits.size(-1)
    )  # logits becomes of shape [batch_size*sequence_length, vocab_size]
    labels = labels.view(-1)  # labels becomes of shape [batch_size*sequence_length]
    weights = weights.view(-1)  # weights becomes of shape [batch_size*sequence_length]

    # Compute the unweighted cross entropy loss
    losses = torch.nn.functional.cross_entropy(logits, labels, reduction="none")

    # Apply the weights to the losses and compute their sum
    return (weights * losses).sum()


@torch.jit.script
def create_weighted_mask(labels: torch.Tensor):
    # Check if the tensor is 2D. If not, unsqueeze it to make it 2D
    if len(labels.shape) == 1:
        labels = labels.unsqueeze(0)

    weights = torch.zeros_like(labels).float()
    for i in range(labels.shape[0]):
        mask = labels[i] != -100

        # Create a tensor to track group ids
        group_ids = torch.zeros_like(labels[i]).int()
        curr_group_id = 0

        for j in range(1, len(labels[i])):
            if mask[j] and not mask[j - 1]:  # switch from masked to unmasked label
                curr_group_id += 1  # start new group
            group_ids[j] = (
                curr_group_id if mask[j] else 0
            )  # assign group id if unmasked label

        # Count only unmasked labels in each group
        group_counts = torch.bincount(group_ids[mask])

        mask_weights = torch.zeros_like(labels[i]).float()
        mask_weights[mask] = 1.0 / group_counts[group_ids[mask]]

        weights[i] = mask_weights

    return weights.squeeze()  # squeeze the output to match the input dimension


def trainer_weighted_loss(model_output, labels, shift_labels=True):
    logits = (
        model_output["logits"] if isinstance(model_output, dict) else model_output[0]
    )
    if shift_labels:
        logits = logits[..., :-1, :].contiguous()
        labels = labels[..., 1:].contiguous()

    weights = create_weighted_mask(labels)
    return weighted_cross_entropy(logits, labels, weights)


@contextmanager
def disable_datasets_caching():
    try:
        disable_caching()
        yield
    finally:
        enable_caching()


def add_position_ids(sample):
    """
    Handle both single-example and batched data.
    - single example: sample['input_ids'] is a list[int]
    - batched data: sample['input_ids'] is a list[list[int]]
    """
    # Return sample unchanged if "input_ids" is not present, or is empty
    if "input_ids" not in sample or not sample["input_ids"]:
        return sample

    input_ids = sample["input_ids"]

    # If first element is an int, it’s a single example
    # If first element is a list, it’s a batch
    if isinstance(input_ids[0], int):
        # ---- SINGLE EXAMPLE ----
        seq_len = len(input_ids)
        # Position IDs for a single example
        # As a list
        sample["position_ids"] = list(range(seq_len))
        sample["length"] = seq_len

    else:
        # ---- BATCHED EXAMPLES ----
        # input_ids is a list of lists
        position_ids_batch = []
        lengths_batch = []
        for seq in input_ids:
            seq_len = len(seq)
            position_ids_batch.append(list(range(seq_len)))
            lengths_batch.append(seq_len)

        # Now store them back
        sample["position_ids"] = position_ids_batch
        sample["length"] = lengths_batch

    return sample


def add_pose_position_ids(
    sample,
    max_context_len=32768,
    split_on_token_ids: Optional[List[int]] = None,
    chunks: int = 2,
):
    """
    use the PoSE technique to extend the context length by randomly skipping
    positions in the context. We only want to skip right before tokens in
    the split_on_token_ids list. We should attempt to randomly distribute
    the skips, but we don't need the final position_ids to be the full
    context_len. There may be multiple turns in the context, so we want to
    make sure we take into account the maximum possible number of skips
    remaining in each sample.
    """

    input_ids = sample["input_ids"]
    sample_len = len(input_ids)
    max_skips = max_context_len - sample_len

    if split_on_token_ids is None:
        split_on_token_ids = []

    if split_on_token_ids:
        split_indices = [
            i for i, token_id in enumerate(input_ids) if token_id in split_on_token_ids
        ]
    else:
        chunk_len = sample_len // chunks
        split_indices = [i * chunk_len for i in range(1, chunks)]
    split_indices.append(len(input_ids))  # make sure we go to the end of the sample
    if split_indices[0] < 2:
        # drop the first split index if it's too close to the beginning
        split_indices = split_indices[1:]

    position_ids = []
    prev_index = 0
    total_skips = 0

    for split_index in split_indices:
        num_skips = (
            random.randint(0, max_skips)  # nosec B311
            if prev_index != 0 and max_skips
            else 0
        )
        max_skips -= num_skips
        total_skips += num_skips

        segment_position_ids = list(
            range(prev_index + total_skips, split_index + total_skips)
        )

        position_ids.extend(segment_position_ids)
        prev_index = split_index

    sample["sequence_len"] = position_ids[-1]
    position_ids = torch.tensor(position_ids)

    sample["position_ids"] = position_ids
    sample["length"] = len(position_ids)
    assert len(position_ids) == len(input_ids)

    return sample


def add_length(sample):
    sample["length"] = len(sample["input_ids"])
    return sample


def filter_sequences_by_length(
    sample, sequence_len=2048, min_sequence_len=2, raise_on_drop=False
):
    """
    Filter sequences outside valid length range [min_sequence_len, sequence_len].

    Drops samples that are either too short (< min_sequence_len) or too long (> sequence_len).

    Works for both single-example (list[int]) or batched (list[list[int]]).

    If raise_on_drop is set, the code raises a ValueError if a sample is
    encountered that is too long and would have been dropped.
    """
    min_sequence_len = min_sequence_len or 2

    input_ids = sample["input_ids"]

    # Edge case: if input_ids is empty
    if not input_ids:
        # Decide if you want to drop or keep empty. Let's drop.
        return False

    # Check if single example or batched by looking at the first element
    if isinstance(input_ids[0], int):
        # Single example (input_ids is a list of int)
        length = len(input_ids)
        if raise_on_drop and length > sequence_len:
            raise ValueError(
                f"Sequence encountered with {length} tokens, which exceeds the maximum {sequence_len}."
            )
        return min_sequence_len <= length <= sequence_len

    # Batched (input_ids is a list of lists)
    results = []
    for seq in input_ids:
        length = len(seq)
        if raise_on_drop and length > sequence_len:
            raise ValueError(
                f"Sequence encountered with {length} tokens, which exceeds the maximum {sequence_len}."
            )
        results.append(min_sequence_len <= length <= sequence_len)
    return results


def process_datasets_for_packing(cfg, train_dataset, eval_dataset):
    drop_attn_mask = cfg.model_config_type in ["mamba", "gemma3"]
    if drop_attn_mask:
        LOG.info("dropping attention_mask column")
        train_dataset = train_dataset.remove_columns("attention_mask")
        if eval_dataset:
            eval_dataset = eval_dataset.remove_columns("attention_mask")

    if cfg.model_config_type in ["falcon", "mistral"]:
        LOG.info("dropping token_type_ids column if it exists")
        if "token_type_ids" in train_dataset.column_names:
            train_dataset = train_dataset.remove_columns("token_type_ids")
        if eval_dataset and "token_type_ids" in eval_dataset.column_names:
            eval_dataset = eval_dataset.remove_columns("token_type_ids")

    def drop_no_trainable_tokens(sample):
        """
        Drop samples if all labels are -100 (i.e., zero trainable tokens).
        Works for both single-example or batched input.
        """
        labels = sample["labels"]
        if not labels:
            return True

        # Check if single example or batch
        # If first element is an int, we assume a single example
        # If it's a list, we assume we're dealing with a batch
        if isinstance(labels[0], int):
            # Single example: return a single bool
            return np.any(labels != -100)

        # Batched: 'labels' is a list of lists
        # Return a list of booleans, one per sub-list
        results = [np.any(row_labels != -100) for row_labels in labels]
        return results

    try:
        prior_len = len(train_dataset)
    except TypeError:
        # handle iterable datasets case
        prior_len = None
    filter_map_kwargs = {}
    if not isinstance(train_dataset, IterableDataset):
        filter_map_kwargs["num_proc"] = cfg.dataset_num_proc
        filter_map_kwargs["load_from_cache_file"] = not cfg.is_preprocess

    drop_long_kwargs = {}
    if filter_map_kwargs:
        drop_long_kwargs["desc"] = "Drop Samples with Zero Trainable Tokens"
    train_dataset = train_dataset.filter(
        drop_no_trainable_tokens,
        batched=True,
        **filter_map_kwargs,
        **drop_long_kwargs,
    )
    if prior_len:
        dropped = prior_len - len(train_dataset)
        if dropped:
            LOG.warning(
                f"Dropped {dropped} samples with no trainable tokens from train dataset"
            )

    if eval_dataset:
        try:
            prior_len = len(eval_dataset)
        except TypeError:
            # handle iterable datasets case
            prior_len = None
        eval_dataset = eval_dataset.filter(
            drop_no_trainable_tokens,
            **filter_map_kwargs,
            **drop_long_kwargs,
        )
        if prior_len:
            dropped = prior_len - len(eval_dataset)
            if dropped:
                LOG.warning(
                    f"Dropped {dropped} samples with no trainable tokens from eval dataset"
                )

    if cfg.group_by_length:
        train_dataset = train_dataset.map(
            add_length,
            num_proc=cfg.dataset_num_proc,
            load_from_cache_file=not cfg.is_preprocess,
            desc="Group By Length",
        )

    if cfg.use_pose:
        pose_kwargs = {}
        if cfg.pose_num_chunks is not None:
            pose_kwargs["chunks"] = cfg.pose_num_chunks
        pose_fn = partial(
            add_pose_position_ids,
            max_context_len=cfg.pose_max_context_len,
            split_on_token_ids=cfg.pose_split_on_token_ids,
            **pose_kwargs,
        )
        train_dataset = train_dataset.map(
            pose_fn,
            num_proc=cfg.dataset_num_proc,
            load_from_cache_file=not cfg.is_preprocess,
            desc="Add position_id column (PoSE)",
        )
        train_dataset = train_dataset.sort("sequence_len")
        if cfg.eval_sample_packing is not False:
            if eval_dataset:
                eval_dataset = eval_dataset.map(
                    pose_fn,
                    num_proc=cfg.dataset_num_proc,
                    load_from_cache_file=not cfg.is_preprocess,
                    desc="Add position_id column (PoSE)",
                )
    elif cfg.sample_packing:
        drop_long_kwargs = {}
        if filter_map_kwargs:
            drop_long_kwargs["desc"] = "Add position_id column (Sample Packing)"
        train_dataset = train_dataset.map(
            add_position_ids,
            batched=True,
            **filter_map_kwargs,
            **drop_long_kwargs,
        )
        if cfg.eval_sample_packing:
            if eval_dataset:
                eval_dataset = eval_dataset.map(
                    add_position_ids,
                    **filter_map_kwargs,
                    **drop_long_kwargs,
                )

    return train_dataset, eval_dataset


def process_pretraining_datasets_for_packing(
    train_dataset, sequence_len, skip_position_ids=True, drop_attention_mask=False
):
    drop_outside_range = partial(filter_sequences_by_length, sequence_len=sequence_len)

    train_dataset = train_dataset.filter(
        drop_outside_range,
        desc="Dropping Long Sequences",
        load_from_cache_file=False,
    )
    if not skip_position_ids:
        train_dataset = train_dataset.map(
            add_position_ids,
            batched=True,
            desc="Add position_id column (Pretraining Sample Packing)",
        )
    if drop_attention_mask:
        train_dataset = train_dataset.remove_columns("attention_mask")

    return train_dataset


def calculate_total_num_steps(cfg, train_dataset, update=True):
    if (
        not cfg.total_num_tokens
        and not cfg.skip_prepare_dataset
        and not cfg.reward_model
    ):
        total_num_tokens = np.sum(
            train_dataset.select_columns("input_ids")
            .to_pandas()["input_ids"]
            .apply(len)
            .values
        )
        LOG.debug(f"total_num_tokens: {total_num_tokens:_}")
        if update:
            cfg.total_num_tokens = total_num_tokens

    skip_estimates = cfg.model_config_type == "mamba"

    if (
        not skip_estimates
        and not cfg.total_supervised_tokens
        and not cfg.skip_prepare_dataset
        and not cfg.reward_model
    ):
        total_supervised_tokens = (
            train_dataset.data.column("labels")
            .to_pandas()
            .apply(lambda x: np.sum(np.array(x) != -100))
            .sum()
        )
        LOG.debug(f"`total_supervised_tokens: {total_supervised_tokens:_}`")
        if update:
            cfg.total_supervised_tokens = total_supervised_tokens

    if not skip_estimates and cfg.sample_packing:
        # we have to drop anything longer then sequence len otherwise
        # flash attention with position ids fails

        if cfg.sample_packing_eff_est:
            total_num_steps = (
                # match count to len est in dataloader
                int(
                    math.floor(
                        0.99
                        * cfg.total_num_tokens
                        / cfg.sample_packing_eff_est
                        / cfg.sequence_len
                        // cfg.batch_size
                    )
                    - 1
                )
                * cfg.num_epochs
            )
            LOG.debug(
                f"total_num_tokens: {cfg.total_num_tokens:_}, total_num_steps: {total_num_steps:_}"
            )
        else:
            if cfg.flash_attention and not cfg.multipack_real_batches:
                sampler_batch_size = 1
                batch_max_len = cfg.micro_batch_size * cfg.sequence_len
            else:
                sampler_batch_size = cfg.micro_batch_size
                batch_max_len = cfg.sequence_len
            if cfg.curriculum_sampling:
                sampler = SequentialSampler(train_dataset)
            else:
                sampler = RandomSampler(train_dataset)
            sampler = MultipackBatchSampler(
                sampler=sampler,
                lengths=get_dataset_lengths(train_dataset),
                batch_size=sampler_batch_size,
                batch_max_len=batch_max_len,
                group_size=cfg.sample_packing_group_size,
                bin_size=cfg.sample_packing_bin_size,
                sequential=cfg.sample_packing_sequentially,
                drop_last=True,
                num_processes=cfg.dataset_num_proc,
                mp_start_method=cfg.sample_packing_mp_start_method or "fork",
            )

            data_loader = DataLoader(
                train_dataset.remove_columns(["length"]),
                batch_sampler=sampler,
            )
            data_loader_len = max(
                1, len(data_loader) * cfg.micro_batch_size // cfg.batch_size
            )
            LOG.debug(f"data_loader_len: {data_loader_len}")
            # FIXME: is there a bug here somewhere? the total num steps depends
            # on the agreed on value for sample_packing_eff_est
            total_num_steps = int(math.floor(data_loader_len * cfg.num_epochs))
            if cfg.dataloader_drop_last:
                # drop the last batch for each epoch
                total_num_steps -= int(math.ceil(cfg.num_epochs))

            def calc_sample_packing_eff_est(estimates: List[float]):
                LOG.info(f"sample_packing_eff_est across ranks: {repr(estimates)}")
                return max(estimates)

            sample_packing_actual_eff_all = reduce_and_broadcast(
                lambda: sampler.efficiency(),
                calc_sample_packing_eff_est,
            )
            sample_packing_eff_est = (
                math.ceil(sample_packing_actual_eff_all * 100.0) / 100.0
            )
            if update:
                cfg.sample_packing_eff_est = sample_packing_eff_est
            LOG.debug(f"sample_packing_eff_est: {cfg.sample_packing_eff_est}")
    else:
        total_num_steps = int(
            math.ceil(len(train_dataset) * cfg.num_epochs / cfg.batch_size)
        )
    LOG.debug(f"total_num_steps: {total_num_steps}")
    return total_num_steps


def setup_torch_compile_env(cfg):
    if cfg.torch_compile:
        if not cfg.torch_compile_backend:
            os.environ["ACCELERATE_DYNAMO_BACKEND"] = "INDUCTOR"
        else:
            os.environ["ACCELERATE_DYNAMO_BACKEND"] = cfg.torch_compile_backend.upper()


def setup_deepspeed_env(cfg, stage=None):
    from transformers.integrations.deepspeed import HfTrainerDeepSpeedConfig

    from axolotl.utils.distributed import distributed_state

    if distributed_state and distributed_state.initialized:
        raise RuntimeError(
            "Distributed State already initialized before Deepspeed setup"
        )

    os.environ["ACCELERATE_USE_DEEPSPEED"] = "true"
    if isinstance(cfg.deepspeed, DictDefault):
        with NamedTemporaryFile(
            mode="w", delete=False, suffix=".json", prefix="deepspeed_config_"
        ) as temp_file:
            temp_file.write(json.dumps(cfg.deepspeed.to_dict(), indent=4))
            temp_file.close()
            cfg.deepspeed = str(temp_file.name)
    os.environ["ACCELERATE_DEEPSPEED_CONFIG_FILE"] = cfg.deepspeed
    os.environ["ACCELERATE_GRADIENT_ACCUMULATION_STEPS"] = str(
        cfg.gradient_accumulation_steps
    )
    if stage:
        os.environ["ACCELERATE_DEEPSPEED_ZERO_STAGE"] = str(stage)
        if stage == 3:
            os.environ["ACCELERATE_DEEPSPEED_ZERO3_INIT"] = "true"

    device_count = torch.cuda.device_count()
    if device_count == 1:
        os.environ.setdefault("WORLD_SIZE", "1")
        os.environ.setdefault("LOCAL_RANK", "0")
        os.environ.setdefault("MASTER_ADDR", "0.0.0.0")  # nosec B104
        os.environ.setdefault("MASTER_PORT", "29500")

    # NOTE(djsaunde): The distribued state cannot be initialized prior to the
    # ACCELERATE_USE_DEEPSPEED assignment, but it must be initialized some time prior
    # to model load.
    if (
        int(os.environ.get("WORLD_SIZE", "1")) == 1
        and os.environ.get("AXOLOTL_IS_PREPROCESS", "0") != "1"
        and cfg.use_ray is not True
    ):
        os.environ["WORLD_SIZE"] = "1"  # force it in case not set
        os.environ["LOCAL_RANK"] = "0"  # force it in case not set
        os.environ["RANK"] = os.environ.get("LOCAL_RANK", "0")
        import deepspeed.comm as dist

        dist.init_distributed(
            dist_backend="nccl", auto_mpi_discovery=False, dist_init_required=True
        )
    init_distributed_state()

    # If we don't assign this, it doesn't actually get set in the accelerate weakref
    _ = HfTrainerDeepSpeedConfig(cfg.deepspeed)


def setup_fsdp_envs(cfg):
    os.environ["ACCELERATE_USE_FSDP"] = "true"

    # TODO @SalmanMohammadi remove FSDP1 args in 0.12
    if str(cfg.fsdp_version) == "2":
        os.environ["FSDP_VERSION"] = "2"
    if cfg.fsdp_config.activation_checkpointing:
        os.environ["FSDP_ACTIVATION_CHECKPOINTING"] = "true"
    if cfg.fsdp_config.offload_params:
        os.environ["FSDP_OFFLOAD_PARAMS"] = "true"
    if cfg.fsdp_config.sync_module_states:
        os.environ["FSDP_SYNC_MODULE_STATES"] = "true"
    if cfg.fsdp_config.cpu_ram_efficient_loading:
        os.environ["FSDP_CPU_RAM_EFFICIENT_LOADING"] = "true"
    if cfg.fsdp_config.use_orig_params:
        os.environ["FSDP_USE_ORIG_PARAMS"] = "true"
    if cfg.fsdp_config.state_dict_type:
        os.environ["FSDP_STATE_DICT_TYPE"] = cfg.fsdp_config.state_dict_type
    if cfg.fsdp_config.cpu_offload_pin_memory is not None:
        os.environ["FSDP_CPU_OFFLOAD_PIN_MEMORY"] = str(
            cfg.fsdp_config.cpu_offload_pin_memory
        ).lower()
    if cfg.fsdp_config.auto_wrap_policy:
        os.environ["FSDP_AUTO_WRAP_POLICY"] = cfg.fsdp_config.auto_wrap_policy
    if cfg.fsdp_config.transformer_layer_cls_to_wrap:
        os.environ["FSDP_TRANSFORMER_CLS_TO_WRAP"] = (
            cfg.fsdp_config.transformer_layer_cls_to_wrap
        )
    if cfg.fsdp_config.reshard_after_forward:
        os.environ["FSDP_RESHARD_AFTER_FORWARD"] = "true"


def setup_parallelism_envs(cfg):
    set_accelerate_parallelism_config = False
    if cfg.tensor_parallel_size and cfg.tensor_parallel_size > 1:
        set_accelerate_parallelism_config = True
        os.environ["PARALLELISM_CONFIG_TP_SIZE"] = str(cfg.tensor_parallel_size)
    if cfg.dp_shard_size and cfg.dp_shard_size > 1:
        set_accelerate_parallelism_config = True
        os.environ["PARALLELISM_CONFIG_DP_SHARD_SIZE"] = str(cfg.dp_shard_size)
    if cfg.dp_replicate_size and cfg.dp_replicate_size > 1:
        set_accelerate_parallelism_config = True
        os.environ["PARALLELISM_CONFIG_DP_REPLICATE_SIZE"] = str(cfg.dp_replicate_size)
    if cfg.context_parallel_size and cfg.context_parallel_size > 1:
        set_accelerate_parallelism_config = True
        os.environ["PARALLELISM_CONFIG_CP_SIZE"] = str(cfg.context_parallel_size)
        os.environ["ACCELERATE_ALLOW_CP_STANDALONE"] = "true"
        from axolotl.monkeypatch.accelerate.parallelism_config import patch_prepare_cp

        patch_prepare_cp()
    if set_accelerate_parallelism_config:
        os.environ["ACCELERATE_USE_PARALLELISM_CONFIG"] = "true"


def prepare_optim_env(cfg):
    if not check_cuda_p2p_ib_support():
        if os.getenv("NCCL_P2P_DISABLE") is None:
            LOG.warning("P2P support not detected, setting `NCCL_P2P_DISABLE=1`")
            os.environ["NCCL_P2P_DISABLE"] = "1"
    # TODO @SalmanMohammadi remove the cfg.fsdp check in 0.12
    if cfg.fsdp or cfg.fsdp_config:
        cfg.fsdp = True if not cfg.fsdp else cfg.fsdp
        setup_fsdp_envs(cfg)
    elif cfg.deepspeed:
        stage = None
        deepspeed_config = None
        # check if the cfg.deepspeed is a file
        if isinstance(cfg.deepspeed, DictDefault):
            deepspeed_config = cfg.deepspeed
        elif os.path.isfile(cfg.deepspeed):
            # parse with json
            with open(cfg.deepspeed, "r", encoding="utf-8") as fin:
                deepspeed_config = json.load(fin)
        if deepspeed_config:
            stage = deepspeed_config.get("zero_optimization", {}).get("stage", None)
        setup_deepspeed_env(cfg, stage=stage)

    setup_parallelism_envs(cfg)
    setup_torch_compile_env(cfg)

    if cfg.fp8:
        os.environ["ACCELERATE_MIXED_PRECISION"] = "fp8"
    elif (cfg.bf16 == "auto" and is_torch_bf16_gpu_available()) or cfg.bf16 is True:
        os.environ["ACCELERATE_MIXED_PRECISION"] = "bf16"
    elif cfg.fp16:
        os.environ["ACCELERATE_MIXED_PRECISION"] = "fp16"
    else:
        os.environ["ACCELERATE_MIXED_PRECISION"] = "no"


def setup_trainer(
    cfg,
    train_dataset,
    eval_dataset,
    model,
    tokenizer,
    processor,
    total_num_steps,
    model_ref=None,
    peft_config=None,
):
    """
    Helper method for instantiating and building a (causal or RLHF) trainer.

    Args:
        cfg: Axolotl config object containing training parameters.
        train_dataset: Dataset to use for training.
        eval_dataset: Dataset to use for evaluation.
        model: The model to train.
        tokenizer: Tokenizer for processing text input.
        processor: Processor for data preparation.
        total_num_steps: The total number of training steps.
        model_ref: Optional reference model for RLHF training. Default is None.
        peft_config: Optional PEFT (Parameter-Efficient Fine-Tuning) configuration. Default is None.

    Returns:
        A trainer instance (either `HFRLTrainer` or `HFCausalTrainer`) configured based
            on the provided parameters.
    """
    from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder

    if cfg.rl:
        trainer_builder = HFRLTrainerBuilder(cfg, model, tokenizer, processor)
        trainer_builder.model_ref = model_ref
        trainer_builder.peft_config = peft_config
    else:
        trainer_builder = HFCausalTrainerBuilder(cfg, model, tokenizer, processor)

    trainer_builder.train_dataset = train_dataset
    trainer_builder.eval_dataset = eval_dataset

    return trainer_builder.build(total_num_steps)


================================================
FILE: src/axolotl/utils/wandb_.py
================================================
"""Module for wandb utilities"""

import os

from axolotl.utils.dict import DictDefault


def setup_wandb_env_vars(cfg: DictDefault):
    for key in cfg.keys():
        if key.startswith("wandb_"):
            value = cfg.get(key, "")

            if value and isinstance(value, str) and len(value) > 0:
                os.environ[key.upper()] = value

    # Enable wandb if project name is present
    if cfg.wandb_project and len(cfg.wandb_project) > 0:
        cfg.use_wandb = True


================================================
FILE: src/setuptools_axolotl_dynamic_dependencies.py
================================================
"""
dynamic requirements for axolotl
"""

import platform
import re
from importlib.metadata import PackageNotFoundError, version

from setuptools.command.build_py import build_py as _build_py


def parse_requirements():
    _install_requires = []
    _dependency_links = []
    with open("./requirements.txt", encoding="utf-8") as requirements_file:
        lines = [r.strip() for r in requirements_file.readlines()]
        for line in lines:
            is_extras = (
                "flash-attn" in line
                or "flash-attention" in line
                or "deepspeed" in line
                or "mamba-ssm" in line
                or "lion-pytorch" in line
            )
            if line.startswith("--extra-index-url"):
                # Handle custom index URLs
                _, url = line.split()
                _dependency_links.append(url)
            elif not is_extras and line and line[0] != "#":
                # Handle standard packages
                _install_requires.append(line)

    try:
        xformers_version = [req for req in _install_requires if "xformers" in req][0]
        torchao_version = [req for req in _install_requires if "torchao" in req][0]

        if "Darwin" in platform.system():
            # don't install xformers on MacOS
            _install_requires.pop(_install_requires.index(xformers_version))
        else:
            # detect the version of torch already installed
            # and set it so dependencies don't clobber the torch version
            try:
                torch_version = version("torch")
            except PackageNotFoundError:
                torch_version = "2.5.1"
            _install_requires.append(f"torch=={torch_version}")

            version_match = re.match(r"^(\d+)\.(\d+)(?:\.(\d+))?", torch_version)
            if version_match:
                major, minor, patch = version_match.groups()
                major, minor = int(major), int(minor)
                patch = (
                    int(patch) if patch is not None else 0
                )  # Default patch to 0 if not present
            else:
                raise ValueError("Invalid version format")

            if (major, minor) >= (2, 5):
                _install_requires.pop(_install_requires.index(xformers_version))
                if patch == 0:
                    _install_requires.append("xformers==0.0.28.post2")
                else:
                    _install_requires.append("xformers==0.0.28.post3")
            elif (major, minor) >= (2, 4):
                if patch == 0:
                    _install_requires.pop(_install_requires.index(xformers_version))
                    _install_requires.append("xformers>=0.0.27")
                else:
                    _install_requires.pop(_install_requires.index(xformers_version))
                    _install_requires.append("xformers==0.0.28.post1")
            elif (major, minor) >= (2, 3):
                _install_requires.pop(_install_requires.index(torchao_version))
                if patch == 0:
                    _install_requires.pop(_install_requires.index(xformers_version))
                    _install_requires.append("xformers>=0.0.26.post1")
                else:
                    _install_requires.pop(_install_requires.index(xformers_version))
                    _install_requires.append("xformers>=0.0.27")
            elif (major, minor) >= (2, 2):
                _install_requires.pop(_install_requires.index(torchao_version))
                _install_requires.pop(_install_requires.index(xformers_version))
                _install_requires.append("xformers>=0.0.25.post1")
            else:
                _install_requires.pop(_install_requires.index(torchao_version))
                _install_requires.pop(_install_requires.index(xformers_version))
                _install_requires.append("xformers>=0.0.23.post1")

    except PackageNotFoundError:
        pass
    return _install_requires, _dependency_links


class BuildPyCommand(_build_py):
    """
    custom build_py command to parse dynamic requirements
    """

    def finalize_options(self):
        super().finalize_options()
        install_requires, _ = parse_requirements()
        self.distribution.install_requires = install_requires


================================================
FILE: styles.css
================================================
/* TYPOGRAPHY SECTION */

/* Import fonts */
@import url('https://fonts.googleapis.com/css2?family=Be+Vietnam+Pro:wght@400;500&display=swap');
@import url('https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400&display=swap');

/* Typography hierarchy */
:root {
    --font-title: 'Be Vietnam Pro', sans-serif;
    --font-body: 'JetBrains Mono', monospace;
}

/* Title (h1) */
h1 {
    font-family: var(--font-title);
    font-weight: 400;
    font-size: 3rem;
    line-height: 1.1;
    letter-spacing: -0.05em;
    font-feature-settings: "ss01" on;
}

/* Heading (h2) */
h2 {
    font-family: var(--font-title);
    font-weight: 500;
    font-size: 1.5rem;
    line-height: 1.2;
    letter-spacing: -0.03em;
    font-feature-settings: "ss01" on;
}

/* Subtitle/Preamble */
h3,
h4 {
    font-family: var(--font-body);
    font-weight: 400;
    font-size: 1.25rem;
    line-height: 1.5;
    letter-spacing: -0.02em;
}

/* Body text */
body {
    font-family: var(--font-body);
    font-weight: 400;
    font-size: 1rem;
    line-height: 1.5;
    letter-spacing: -0.02em;
}

/* Links */
a {
    font-family: var(--font-body);
    font-weight: 400;
    font-size: 0.875rem;
    line-height: 1;
    letter-spacing: -0.02em;
}

/* NAV BAR SECTION */

/* Navbar logo styling */
.navbar-brand img {
    height: 32px;
    margin-right: 10px;
}

/* COLORS SECTION */

/* Brand colors */
:root {
    --white: #ffffff;
    --greige-300: #EEEEE7;
    --greige-600: #CCCAC0;
    --black: #141310;
    --lime: #E3F8A8;
    --cyan: #A0F4EA;
    --purple: #C8D0F8;
}

/* Base styles */
body {
    background-color: var(--black);
    color: var(--greige-300);
}

/* Navigation */
.navbar {
    background-color: var(--black) !important;
}

.navbar-dark .navbar-nav .nav-link {
    color: var(--greige-300);
}

.navbar-dark .navbar-nav .nav-link:hover {
    color: var(--lime);
}

/* Sidebar */
.sidebar-navigation {
    background-color: var(--black);
    border-right: 1px solid var(--greige-600);
}

.sidebar nav[role="doc-toc"] ul>li>a {
    color: var(--greige-300);
}

.sidebar nav[role="doc-toc"] ul>li>a:hover {
    color: var(--lime);
}

/* Links */
a {
    color: var(--lime);
}

a:hover {
    color: var(--cyan);
}

/* Headers */
h1,
h2,
h3,
h4,
h5,
h6 {
    color: var(--white);
}

/* Code blocks */
pre {
    background-color: #1a1a1a !important;
    border: 1px solid var(--greige-600);
}

/* Tables */
.table {
    color: var(--greige-300);
}

/* TOC */
#toc-title {
    color: var(--white);
}

.toc-active {
    color: var(--lime) !important;
}

/* Buttons */
.btn-primary {
    background-color: var(--lime);
    color: var(--black);
    border: none;
}

.btn-primary:hover {
    background-color: var(--cyan);
    color: var(--black);
}

/* For inline code (single backtick) */
code {
    background-color: #1a1a1a !important;
    color: var(--lime) !important;
    padding: 2px 4px;
    border-radius: 4px;
}

/* For inline code that is also a link */
a code {
    color: var(--cyan) !important;
}

/* For code blocks (triple backtick) */
pre.sourceCode {
    background-color: #1a1a1a !important;
}

/* Make comments in bash/shell scripts green */
code span.co {
    color: #5cb85c !important;
}

/* Remove underlines from JSON comments and make them green */
code span.er {
    color: #5cb85c !important;
    text-decoration: none !important;
}

/* API Documentation Styling */

/* Improve docstring section rendering */
.level3 p {
    white-space: pre-line !important;
}

/* Format docstring sections */
.level3 p strong {
    display: block;
    margin-top: 1em;
    font-weight: bold;
    color: var(--cyan);
}

/* Add spacing after sections */
.level3 p:has(strong) {
    margin-bottom: 0.5em;
}

/* Format Args and Returns sections */
p:has(code) {
    line-height: 1.6;
}

/* Function signatures */
.sourceCode {
    margin-bottom: 1.5em;
}

/* Parameter tables */
.doc-section-parameters table,
.doc-section-returns table {
    margin-top: 1em;
    margin-bottom: 1.5em;
}

/* Make parameter and returns headers smaller */
h2.anchored[data-anchor-id="parameters"],
h2.anchored[data-anchor-id="returns"],
.doc-section-parameters h4,
.doc-section-returns h4 {
    font-size: 1.25rem;
    margin-top: 2rem;
    margin-bottom: 1rem;
    color: var(--lime);
    border-bottom: 1px solid var(--lime);
    padding-bottom: 0.3rem;
    font-family: var(--font-body);
    font-weight: 500;
    letter-spacing: normal;
}

/* Style documentation tables */
table {
    width: 100%;
    margin-bottom: 1.5rem;
    border-collapse: collapse;
}

table th {
    background-color: #1a1a1a;
    padding: 0.5rem 1rem;
    border-bottom: 2px solid var(--greige-600);
    text-align: left;
}

table td {
    padding: 0.5rem 1rem;
    border-bottom: 1px solid var(--greige-600);
}

/* Code in table cells */
table td code {
    background-color: transparent !important;
    padding: 0;
}

/* Improve spacing in parameter and return tables */
.doc-section-parameters,
.doc-section-returns {
    margin-top: 1rem;
}


================================================
FILE: tests/__init__.py
================================================


================================================
FILE: tests/cli/__init__.py
================================================


================================================
FILE: tests/cli/conftest.py
================================================
"""Shared pytest fixtures for cli module."""

import pytest
from click.testing import CliRunner

VALID_TEST_CONFIG = """
base_model: HuggingFaceTB/SmolLM2-135M
datasets:
  - path: mhenrichsen/alpaca_2k_test
    type: alpaca
sequence_len: 2048
max_steps: 1
micro_batch_size: 1
gradient_accumulation_steps: 1
learning_rate: 1e-3
special_tokens:
  pad_token: <|endoftext|>
"""


@pytest.fixture
def cli_runner():
    return CliRunner()


@pytest.fixture
def valid_test_config():
    return VALID_TEST_CONFIG


@pytest.fixture
def config_path(tmp_path):
    """Creates a temporary config file"""
    path = tmp_path / "config.yml"
    path.write_text(VALID_TEST_CONFIG)

    return path


================================================
FILE: tests/cli/test_cli_base.py
================================================
"""Base test class for CLI commands."""

from pathlib import Path
from unittest.mock import patch

from axolotl.cli.main import cli


class BaseCliTest:
    """Base class for CLI command tests."""

    def _test_cli_validation(self, cli_runner, command: str):
        """Test CLI validation for a command.

        Args:
            cli_runner: CLI runner fixture
            command: Command to test (train/evaluate)
        """
        # Test missing config file
        result = cli_runner.invoke(cli, [command, "--launcher", "python"])
        assert result.exit_code != 0

        # Test non-existent config file
        result = cli_runner.invoke(
            cli, [command, "nonexistent.yml", "--launcher", "python"]
        )
        assert result.exit_code != 0
        assert "Error: Invalid value for 'CONFIG'" in result.output

    def _test_basic_execution(
        self,
        cli_runner,
        tmp_path: Path,
        valid_test_config: str,
        command: str,
        train: bool = True,
    ):
        """Test basic execution with accelerate.

        Args:
            cli_runner: CLI runner fixture
            tmp_path: Temporary path fixture
            valid_test_config: Valid config fixture
            command: Command to test (train/evaluate)
            train: Whether to test training (default) or evaluation
        """
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        mock_fn = "os.execvpe" if command == "train" else "subprocess.run"

        with patch(mock_fn) as mock:
            result = cli_runner.invoke(cli, [command, str(config_path)])

            assert mock.called

            expected = [
                "accelerate",
                "launch",
                "-m",
                f"axolotl.cli.{command}",
                str(config_path),
                "--debug=False",
                "--debug-text-only=False",
                "--debug-num-examples=0",
            ]
            if train:
                expected.append("--shard=False")

            if command == "train":
                assert mock.call_args.args[0] == "accelerate"
                assert mock.call_args.args[1] == expected
            else:
                assert mock.call_args.args[0] == expected
                assert mock.call_args.kwargs == {"check": True}
            assert result.exit_code == 0

    def _test_cli_overrides(self, tmp_path: Path, valid_test_config: str):
        """Test CLI argument overrides.

        Args:
            tmp_path: Temporary path fixture
            valid_test_config: Valid config fixture
            command: Command to test (train/evaluate)
        """
        config_path = tmp_path / "config.yml"
        output_dir = tmp_path / "model-out"

        test_config = valid_test_config.replace(
            "output_dir: model-out", f"output_dir: {output_dir}"
        )
        config_path.write_text(test_config)
        return config_path


================================================
FILE: tests/cli/test_cli_evaluate.py
================================================
"""Tests for evaluate CLI command."""

from unittest.mock import patch

from axolotl.cli.main import cli

from .test_cli_base import BaseCliTest


class TestEvaluateCommand(BaseCliTest):
    """Test cases for evaluate command."""

    cli = cli

    def test_evaluate_cli_validation(self, cli_runner):
        """Test CLI validation"""
        self._test_cli_validation(cli_runner, "evaluate")

    def test_evaluate_basic_execution(self, cli_runner, tmp_path, valid_test_config):
        """Test basic successful execution"""
        self._test_basic_execution(
            cli_runner, tmp_path, valid_test_config, "evaluate", train=False
        )

    def test_evaluate_basic_execution_no_accelerate(
        self, cli_runner, tmp_path, valid_test_config
    ):
        """Test basic successful execution without accelerate"""
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        with patch("axolotl.cli.evaluate.do_evaluate") as mock_evaluate:
            result = cli_runner.invoke(
                cli,
                [
                    "evaluate",
                    str(config_path),
                    "--launcher",
                    "python",
                ],
                catch_exceptions=False,
            )

            assert result.exit_code == 0
            mock_evaluate.assert_called_once()

    def test_evaluate_cli_overrides(self, cli_runner, tmp_path, valid_test_config):
        """Test CLI arguments properly override config values"""
        config_path = self._test_cli_overrides(tmp_path, valid_test_config)

        with patch("axolotl.cli.evaluate.do_evaluate") as mock_evaluate:
            result = cli_runner.invoke(
                cli,
                [
                    "evaluate",
                    str(config_path),
                    "--micro-batch-size",
                    "2",
                    "--sequence-len",
                    "128",
                    "--launcher",
                    "python",
                ],
                catch_exceptions=False,
            )

            assert result.exit_code == 0
            mock_evaluate.assert_called_once()
            cfg = mock_evaluate.call_args[0][0]
            assert cfg.micro_batch_size == 2
            assert cfg.sequence_len == 128

    def test_evaluate_with_launcher_args_torchrun(
        self, cli_runner, tmp_path, valid_test_config
    ):
        """Test evaluate with torchrun launcher arguments"""
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        with patch("subprocess.run") as mock_subprocess:
            result = cli_runner.invoke(
                cli,
                [
                    "evaluate",
                    str(config_path),
                    "--launcher",
                    "torchrun",
                    "--",
                    "--nproc_per_node=2",
                    "--nnodes=1",
                ],
                catch_exceptions=False,
            )

            assert result.exit_code == 0
            mock_subprocess.assert_called_once()

            # Verify launcher args are passed to torchrun
            called_cmd = mock_subprocess.call_args.args[0]
            assert called_cmd[0] == "torchrun"
            assert "--nproc_per_node=2" in called_cmd
            assert "--nnodes=1" in called_cmd
            assert "-m" in called_cmd
            assert "axolotl.cli.evaluate" in called_cmd

    def test_evaluate_with_launcher_args_accelerate(
        self, cli_runner, tmp_path, valid_test_config
    ):
        """Test evaluate with accelerate launcher arguments"""
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        with patch("subprocess.run") as mock_subprocess:
            result = cli_runner.invoke(
                cli,
                [
                    "evaluate",
                    str(config_path),
                    "--launcher",
                    "accelerate",
                    "--",
                    "--config_file=accelerate_config.yml",
                    "--num_processes=4",
                ],
                catch_exceptions=False,
            )

            assert result.exit_code == 0
            mock_subprocess.assert_called_once()

            # Verify launcher args are passed to accelerate
            called_cmd = mock_subprocess.call_args.args[0]
            assert called_cmd[0] == "accelerate"
            assert called_cmd[1] == "launch"
            assert "--config_file=accelerate_config.yml" in called_cmd
            assert "--num_processes=4" in called_cmd
            assert "-m" in called_cmd
            assert "axolotl.cli.evaluate" in called_cmd

    def test_evaluate_backward_compatibility_no_launcher_args(
        self, cli_runner, tmp_path, valid_test_config
    ):
        """Test that existing evaluate commands work without launcher args"""
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        with patch("subprocess.run") as mock_subprocess:
            result = cli_runner.invoke(
                cli,
                [
                    "evaluate",
                    str(config_path),
                    "--launcher",
                    "accelerate",
                    "--micro-batch-size",
                    "2",
                ],
                catch_exceptions=False,
            )

            assert result.exit_code == 0
            mock_subprocess.assert_called_once()

            # Verify no launcher args contamination
            called_cmd = mock_subprocess.call_args.args[0]
            assert called_cmd[0] == "accelerate"
            assert called_cmd[1] == "launch"
            # Should not contain any extra launcher args
            launcher_section = called_cmd[2 : called_cmd.index("-m")]
            assert (
                len(launcher_section) == 0
            )  # No launcher args between 'launch' and '-m'


================================================
FILE: tests/cli/test_cli_fetch.py
================================================
"""pytest tests for axolotl CLI fetch command."""

from unittest.mock import patch

from axolotl.cli.main import fetch


def test_fetch_cli_examples(cli_runner):
    """Test fetch command with examples directory"""
    with patch("axolotl.cli.main.fetch_from_github") as mock_fetch:
        result = cli_runner.invoke(fetch, ["examples"])

        assert result.exit_code == 0
        mock_fetch.assert_called_once_with("examples/", None)


def test_fetch_cli_deepspeed(cli_runner):
    """Test fetch command with deepspeed_configs directory"""
    with patch("axolotl.cli.main.fetch_from_github") as mock_fetch:
        result = cli_runner.invoke(fetch, ["deepspeed_configs"])

        assert result.exit_code == 0
        mock_fetch.assert_called_once_with("deepspeed_configs/", None)


def test_fetch_cli_with_dest(cli_runner, tmp_path):
    """Test fetch command with custom destination"""
    with patch("axolotl.cli.main.fetch_from_github") as mock_fetch:
        custom_dir = tmp_path / "tmp_examples"
        result = cli_runner.invoke(fetch, ["examples", "--dest", str(custom_dir)])

        assert result.exit_code == 0
        mock_fetch.assert_called_once_with("examples/", str(custom_dir))


def test_fetch_cli_invalid_directory(cli_runner):
    """Test fetch command with invalid directory choice"""
    result = cli_runner.invoke(fetch, ["invalid"])
    assert result.exit_code != 0


================================================
FILE: tests/cli/test_cli_inference.py
================================================
"""pytest tests for axolotl CLI inference command."""

from unittest.mock import patch

from axolotl.cli.main import cli


def test_inference_basic(cli_runner, config_path):
    """Test basic inference"""
    with patch("axolotl.cli.inference.do_inference") as mock:
        result = cli_runner.invoke(
            cli,
            ["inference", str(config_path), "--launcher", "python"],
            catch_exceptions=False,
        )

        assert mock.called
        assert result.exit_code == 0


def test_inference_gradio(cli_runner, config_path):
    """Test basic inference (gradio path)"""
    with patch("axolotl.cli.inference.do_inference_gradio") as mock:
        result = cli_runner.invoke(
            cli,
            ["inference", str(config_path), "--launcher", "python", "--gradio"],
            catch_exceptions=False,
        )

        assert mock.called
        assert result.exit_code == 0


def test_inference_with_launcher_args_torchrun(cli_runner, config_path):
    """Test inference with torchrun launcher arguments"""
    with patch("subprocess.run") as mock_subprocess:
        result = cli_runner.invoke(
            cli,
            [
                "inference",
                str(config_path),
                "--launcher",
                "torchrun",
                "--",
                "--nproc_per_node=2",
                "--nnodes=1",
            ],
            catch_exceptions=False,
        )

        assert result.exit_code == 0
        mock_subprocess.assert_called_once()

        # Verify launcher args are passed to torchrun
        called_cmd = mock_subprocess.call_args.args[0]
        assert called_cmd[0] == "torchrun"
        assert "--nproc_per_node=2" in called_cmd
        assert "--nnodes=1" in called_cmd
        assert "-m" in called_cmd
        assert "axolotl.cli.inference" in called_cmd


def test_inference_with_launcher_args_accelerate(cli_runner, config_path):
    """Test inference with accelerate launcher arguments"""
    with patch("subprocess.run") as mock_subprocess:
        result = cli_runner.invoke(
            cli,
            [
                "inference",
                str(config_path),
                "--launcher",
                "accelerate",
                "--",
                "--config_file=accelerate_config.yml",
                "--num_processes=4",
            ],
            catch_exceptions=False,
        )

        assert result.exit_code == 0
        mock_subprocess.assert_called_once()

        # Verify launcher args are passed to accelerate
        called_cmd = mock_subprocess.call_args.args[0]
        assert called_cmd[0] == "accelerate"
        assert called_cmd[1] == "launch"
        assert "--config_file=accelerate_config.yml" in called_cmd
        assert "--num_processes=4" in called_cmd
        assert "-m" in called_cmd
        assert "axolotl.cli.inference" in called_cmd


def test_inference_gradio_with_launcher_args(cli_runner, config_path):
    """Test inference with gradio and launcher arguments"""
    with patch("subprocess.run") as mock_subprocess:
        result = cli_runner.invoke(
            cli,
            [
                "inference",
                str(config_path),
                "--launcher",
                "accelerate",
                "--gradio",
                "--",
                "--num_processes=2",
            ],
            catch_exceptions=False,
        )

        assert result.exit_code == 0
        mock_subprocess.assert_called_once()

        # Verify both gradio flag and launcher args are present
        called_cmd = mock_subprocess.call_args.args[0]
        assert called_cmd[0] == "accelerate"
        assert called_cmd[1] == "launch"
        assert "--num_processes=2" in called_cmd
        assert "--gradio" in called_cmd
        assert "-m" in called_cmd
        assert "axolotl.cli.inference" in called_cmd


def test_inference_backward_compatibility_no_launcher_args(cli_runner, config_path):
    """Test that existing inference commands work without launcher args"""
    with patch("subprocess.run") as mock_subprocess:
        result = cli_runner.invoke(
            cli,
            [
                "inference",
                str(config_path),
                "--launcher",
                "accelerate",
            ],
            catch_exceptions=False,
        )

        assert result.exit_code == 0
        mock_subprocess.assert_called_once()

        # Verify no launcher args contamination
        called_cmd = mock_subprocess.call_args.args[0]
        assert called_cmd[0] == "accelerate"
        assert called_cmd[1] == "launch"
        # Should not contain any extra launcher args
        launcher_section = called_cmd[2 : called_cmd.index("-m")]
        assert len(launcher_section) == 0  # No launcher args between 'launch' and '-m'


================================================
FILE: tests/cli/test_cli_interface.py
================================================
"""General pytest tests for axolotl.cli.main interface."""

from axolotl.cli.main import build_command, cli


def test_build_command():
    """Test converting dict of options to CLI arguments"""
    base_cmd = ["accelerate", "launch"]
    options = {
        "learning_rate": 1e-4,
        "batch_size": 8,
        "debug": True,
        "use_fp16": False,
        "null_value": None,
    }

    result = build_command(base_cmd, options)
    assert result == [
        "accelerate",
        "launch",
        "--learning-rate=0.0001",
        "--batch-size=8",
        "--debug=True",
        "--use-fp16=False",
    ]


def test_invalid_command_options(cli_runner):
    """Test handling of invalid command options"""
    result = cli_runner.invoke(
        cli,
        [
            "train",
            "config.yml",
            "--invalid-option",
            "value",
        ],
    )
    assert result.exit_code != 0
    assert "does not exist" in result.output


def test_required_config_argument(cli_runner):
    """Test commands fail properly when config argument is missing"""
    result = cli_runner.invoke(cli, ["train"])
    assert result.exit_code != 0
    assert "Missing argument 'CONFIG'" in result.output


================================================
FILE: tests/cli/test_cli_merge_lora.py
================================================
"""pytest tests for axolotl CLI merge_lora command."""

from unittest.mock import patch

from axolotl.cli.main import cli


def test_merge_lora_basic(cli_runner, config_path):
    """Test basic merge_lora command"""
    with patch("axolotl.cli.merge_lora.do_cli") as mock_do_cli:
        result = cli_runner.invoke(cli, ["merge-lora", str(config_path)])
        assert result.exit_code == 0

        mock_do_cli.assert_called_once()
        assert mock_do_cli.call_args.kwargs["config"] == str(config_path)


def test_merge_lora_with_dirs(cli_runner, config_path, tmp_path):
    """Test merge_lora with custom lora and output directories"""
    lora_dir = tmp_path / "lora"
    output_dir = tmp_path / "output"
    lora_dir.mkdir()

    with patch("axolotl.cli.merge_lora.do_cli") as mock_do_cli:
        result = cli_runner.invoke(
            cli,
            [
                "merge-lora",
                str(config_path),
                "--lora-model-dir",
                str(lora_dir),
                "--output-dir",
                str(output_dir),
            ],
        )
        assert result.exit_code == 0

        mock_do_cli.assert_called_once()
        assert mock_do_cli.call_args.kwargs["config"] == str(config_path)
        assert mock_do_cli.call_args.kwargs["lora_model_dir"] == str(lora_dir)
        assert mock_do_cli.call_args.kwargs["output_dir"] == str(output_dir)


def test_merge_lora_nonexistent_config(cli_runner, tmp_path):
    """Test merge_lora with nonexistent config"""
    config_path = tmp_path / "nonexistent.yml"
    result = cli_runner.invoke(cli, ["merge-lora", str(config_path)])
    assert result.exit_code != 0


def test_merge_lora_nonexistent_lora_dir(cli_runner, config_path, tmp_path):
    """Test merge_lora with nonexistent lora directory"""
    lora_dir = tmp_path / "nonexistent"
    result = cli_runner.invoke(
        cli, ["merge-lora", str(config_path), "--lora-model-dir", str(lora_dir)]
    )
    assert result.exit_code != 0


================================================
FILE: tests/cli/test_cli_merge_sharded_fsdp_weights.py
================================================
"""pytest tests for axolotl CLI merge_sharded_fsdp_weights command."""

from unittest.mock import patch

from axolotl.cli.main import cli


def test_merge_sharded_fsdp_weights_no_accelerate(cli_runner, config_path):
    """Test merge_sharded_fsdp_weights command without accelerate"""
    with patch("axolotl.cli.merge_sharded_fsdp_weights.do_cli") as mock:
        result = cli_runner.invoke(
            cli,
            ["merge-sharded-fsdp-weights", str(config_path), "--launcher", "python"],
        )

        assert mock.called
        assert mock.call_args.kwargs["config"] == str(config_path)
        assert result.exit_code == 0


def test_merge_sharded_fsdp_weights_with_launcher_args_torchrun(
    cli_runner, config_path
):
    """Test merge-sharded-fsdp-weights with torchrun launcher arguments"""
    with patch("subprocess.run") as mock_subprocess:
        result = cli_runner.invoke(
            cli,
            [
                "merge-sharded-fsdp-weights",
                str(config_path),
                "--launcher",
                "torchrun",
                "--",
                "--nproc_per_node=2",
                "--nnodes=1",
            ],
            catch_exceptions=False,
        )

        assert result.exit_code == 0
        mock_subprocess.assert_called_once()

        # Verify launcher args are passed to torchrun
        called_cmd = mock_subprocess.call_args.args[0]
        assert called_cmd[0] == "torchrun"
        assert "--nproc_per_node=2" in called_cmd
        assert "--nnodes=1" in called_cmd
        assert "-m" in called_cmd
        assert "axolotl.cli.merge_sharded_fsdp_weights" in called_cmd


def test_merge_sharded_fsdp_weights_with_launcher_args_accelerate(
    cli_runner, config_path
):
    """Test merge-sharded-fsdp-weights with accelerate launcher arguments"""
    with patch("subprocess.run") as mock_subprocess:
        result = cli_runner.invoke(
            cli,
            [
                "merge-sharded-fsdp-weights",
                str(config_path),
                "--launcher",
                "accelerate",
                "--",
                "--config_file=accelerate_config.yml",
                "--num_processes=4",
            ],
            catch_exceptions=False,
        )

        assert result.exit_code == 0
        mock_subprocess.assert_called_once()

        # Verify launcher args are passed to accelerate
        called_cmd = mock_subprocess.call_args.args[0]
        assert called_cmd[0] == "accelerate"
        assert called_cmd[1] == "launch"
        assert "--config_file=accelerate_config.yml" in called_cmd
        assert "--num_processes=4" in called_cmd
        assert "-m" in called_cmd
        assert "axolotl.cli.merge_sharded_fsdp_weights" in called_cmd


def test_merge_sharded_fsdp_weights_backward_compatibility_no_launcher_args(
    cli_runner, config_path
):
    """Test that existing merge-sharded-fsdp-weights commands work without launcher args"""
    with patch("subprocess.run") as mock_subprocess:
        result = cli_runner.invoke(
            cli,
            [
                "merge-sharded-fsdp-weights",
                str(config_path),
                "--launcher",
                "accelerate",
            ],
            catch_exceptions=False,
        )

        assert result.exit_code == 0
        mock_subprocess.assert_called_once()

        # Verify no launcher args contamination
        called_cmd = mock_subprocess.call_args.args[0]
        assert called_cmd[0] == "accelerate"
        assert called_cmd[1] == "launch"
        # Should not contain any extra launcher args
        launcher_section = called_cmd[2 : called_cmd.index("-m")]
        assert len(launcher_section) == 0  # No launcher args between 'launch' and '-m'


================================================
FILE: tests/cli/test_cli_preprocess.py
================================================
"""pytest tests for axolotl CLI preprocess command."""

import shutil
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from axolotl.cli.main import cli


@pytest.fixture(autouse=True)
def cleanup_last_run_prepared():
    yield

    if Path("last_run_prepared").exists():
        shutil.rmtree("last_run_prepared")


def test_preprocess_config_not_found(cli_runner):
    """Test preprocess fails when config not found"""
    result = cli_runner.invoke(cli, ["preprocess", "nonexistent.yml"])
    assert result.exit_code != 0


def test_preprocess_basic(cli_runner, config_path):
    """Test basic preprocessing with minimal config"""
    with patch("axolotl.cli.preprocess.do_cli") as mock_do_cli:
        with patch("axolotl.cli.preprocess.load_datasets") as mock_load_datasets:
            mock_load_datasets.return_value = MagicMock()

            result = cli_runner.invoke(cli, ["preprocess", str(config_path)])
            assert result.exit_code == 0

            mock_do_cli.assert_called_once()
            assert mock_do_cli.call_args.kwargs["config"] == str(config_path)
            assert mock_do_cli.call_args.kwargs["download"] is True


def test_preprocess_without_download(cli_runner, config_path):
    """Test preprocessing without model download"""
    with patch("axolotl.cli.preprocess.do_cli") as mock_do_cli:
        result = cli_runner.invoke(
            cli, ["preprocess", str(config_path), "--no-download"]
        )
        assert result.exit_code == 0

        mock_do_cli.assert_called_once()
        assert mock_do_cli.call_args.kwargs["config"] == str(config_path)
        assert mock_do_cli.call_args.kwargs["download"] is False


def test_preprocess_custom_path(cli_runner, tmp_path, valid_test_config):
    """Test preprocessing with custom dataset path"""
    config_path = tmp_path / "config.yml"
    custom_path = tmp_path / "custom_prepared"
    config_path.write_text(valid_test_config)

    with patch("axolotl.cli.preprocess.do_cli") as mock_do_cli:
        with patch("axolotl.cli.preprocess.load_datasets") as mock_load_datasets:
            mock_load_datasets.return_value = MagicMock()

            result = cli_runner.invoke(
                cli,
                [
                    "preprocess",
                    str(config_path),
                    "--dataset-prepared-path",
                    str(custom_path.absolute()),
                ],
            )
            assert result.exit_code == 0

            mock_do_cli.assert_called_once()
            assert mock_do_cli.call_args.kwargs["config"] == str(config_path)
            assert mock_do_cli.call_args.kwargs["dataset_prepared_path"] == str(
                custom_path.absolute()
            )


================================================
FILE: tests/cli/test_cli_sweeps.py
================================================
"""
unit tests for generating sweep configurations
"""

from axolotl.cli.utils import generate_sweep_configs


def test_generate_sweep_configs_no_pairs():
    base_config = {
        "learning_rate": 0.1,
        "micro_batch_size": 1,
        "sample_packing": True,
    }

    sweeps_config = {"micro_batch_size": [1, 2, 4], "weight_decay": [0.0, 0.1]}

    generate_sweep_configs(base_config, sweeps_config)

    assert len(generate_sweep_configs(base_config, sweeps_config)) == 6

    cfg_1 = {
        "learning_rate": 0.1,
        "micro_batch_size": 2,
        "weight_decay": 0.0,
        "sample_packing": True,
    }

    assert any(
        cfg_1 == cfg for cfg in generate_sweep_configs(base_config, sweeps_config)
    )


def test_generate_sweep_configs_with_pairs():
    base_config = {
        "learning_rate": 0.1,
        "micro_batch_size": 1,
        "sample_packing": True,
    }

    sweeps_config = {
        "_": [
            {
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 8,
            },
            {
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 4,
            },
            {
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
            },
            {
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
            },
        ],
        "weight_decay": [0.0, 0.1],
    }

    generate_sweep_configs(base_config, sweeps_config)

    assert len(generate_sweep_configs(base_config, sweeps_config)) == 8

    assert all(
        cfg["gradient_accumulation_steps"] * cfg["micro_batch_size"] == 8
        for cfg in generate_sweep_configs(base_config, sweeps_config)
    )


================================================
FILE: tests/cli/test_cli_train.py
================================================
"""Tests for train CLI command."""

from unittest.mock import MagicMock, patch

from axolotl.cli.main import cli

from .test_cli_base import BaseCliTest


class TestTrainCommand(BaseCliTest):
    """Test cases for train command."""

    cli = cli

    def test_train_cli_validation(self, cli_runner):
        """Test CLI validation"""
        self._test_cli_validation(cli_runner, "train")

    def test_train_basic_execution(self, cli_runner, tmp_path, valid_test_config):
        """Test basic successful execution"""
        self._test_basic_execution(
            cli_runner, tmp_path, valid_test_config, "train", train=True
        )

    def test_train_basic_execution_no_accelerate(
        self, cli_runner, tmp_path, valid_test_config
    ):
        """Test basic successful execution without accelerate"""
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        with patch("axolotl.cli.train.train") as mock_train:
            mock_train.return_value = (MagicMock(), MagicMock(), MagicMock())
            with patch("axolotl.cli.train.load_datasets") as mock_load_datasets:
                mock_load_datasets.return_value = MagicMock()

                result = cli_runner.invoke(
                    cli,
                    [
                        "train",
                        str(config_path),
                        "--launcher",
                        "python",
                    ],
                    catch_exceptions=False,
                )

                assert result.exit_code == 0
                mock_train.assert_called_once()

    def test_train_cli_overrides(self, cli_runner, tmp_path, valid_test_config):
        """Test CLI arguments properly override config values"""
        config_path = self._test_cli_overrides(tmp_path, valid_test_config)

        with patch("axolotl.cli.train.train") as mock_train:
            mock_train.return_value = (MagicMock(), MagicMock(), MagicMock())
            with patch("axolotl.cli.train.load_datasets") as mock_load_datasets:
                mock_load_datasets.return_value = MagicMock()

                result = cli_runner.invoke(
                    cli,
                    [
                        "train",
                        str(config_path),
                        "--learning-rate=1e-4",
                        "--micro-batch-size=2",
                        "--launcher",
                        "python",
                    ],
                    catch_exceptions=False,
                )

                assert result.exit_code == 0
                mock_train.assert_called_once()
                cfg = mock_train.call_args[1]["cfg"]
                assert cfg["learning_rate"] == 1e-4
                assert cfg["micro_batch_size"] == 2

    def test_train_with_launcher_args_torchrun(
        self, cli_runner, tmp_path, valid_test_config
    ):
        """Test train with torchrun launcher arguments"""
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        with patch("os.execvpe") as mock_subprocess:
            result = cli_runner.invoke(
                cli,
                [
                    "train",
                    str(config_path),
                    "--launcher",
                    "torchrun",
                    "--",
                    "--nproc_per_node=2",
                    "--nnodes=1",
                ],
                catch_exceptions=False,
            )

            assert result.exit_code == 0
            mock_subprocess.assert_called_once()

            # Verify launcher args are passed to torchrun
            called_cmd = mock_subprocess.call_args.args[1]
            assert called_cmd[0] == "torchrun"
            assert "--nproc_per_node=2" in called_cmd
            assert "--nnodes=1" in called_cmd
            assert "-m" in called_cmd
            assert "axolotl.cli.train" in called_cmd

    def test_train_with_launcher_args_accelerate(
        self, cli_runner, tmp_path, valid_test_config
    ):
        """Test train with accelerate launcher arguments"""
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        with patch("os.execvpe") as mock_subprocess:
            result = cli_runner.invoke(
                cli,
                [
                    "train",
                    str(config_path),
                    "--launcher",
                    "accelerate",
                    "--",
                    "--config_file=accelerate_config.yml",
                    "--num_processes=4",
                ],
                catch_exceptions=False,
            )

            assert result.exit_code == 0
            mock_subprocess.assert_called_once()

            # Verify launcher args are passed to accelerate
            assert mock_subprocess.call_args.args[0] == "accelerate"
            called_cmd = mock_subprocess.call_args.args[1]
            assert called_cmd[0] == "accelerate"
            assert called_cmd[1] == "launch"
            assert "--config_file=accelerate_config.yml" in called_cmd
            assert "--num_processes=4" in called_cmd
            assert "-m" in called_cmd
            assert "axolotl.cli.train" in called_cmd

    def test_train_backward_compatibility_no_launcher_args(
        self, cli_runner, tmp_path, valid_test_config
    ):
        """Test that existing train commands work without launcher args"""
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        with patch("os.execvpe") as mock_subprocess:
            result = cli_runner.invoke(
                cli,
                [
                    "train",
                    str(config_path),
                    "--launcher",
                    "accelerate",
                    "--learning-rate",
                    "1e-4",
                ],
                catch_exceptions=False,
            )

            assert result.exit_code == 0
            mock_subprocess.assert_called_once()

            # Verify no launcher args contamination
            assert mock_subprocess.call_args.args[0] == "accelerate"
            called_cmd = mock_subprocess.call_args.args[1]
            assert called_cmd[0] == "accelerate"
            assert called_cmd[1] == "launch"
            # Should not contain any extra launcher args
            launcher_section = called_cmd[2 : called_cmd.index("-m")]
            assert (
                len(launcher_section) == 0
            )  # No launcher args between 'launch' and '-m'

    def test_train_mixed_args_with_launcher_args(
        self, cli_runner, tmp_path, valid_test_config
    ):
        """Test train with both regular CLI args and launcher args"""
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        with patch("os.execvpe") as mock_subprocess:
            result = cli_runner.invoke(
                cli,
                [
                    "train",
                    str(config_path),
                    "--launcher",
                    "torchrun",
                    "--learning-rate",
                    "2e-4",
                    "--micro-batch-size",
                    "4",
                    "--",
                    "--nproc_per_node=8",
                ],
                catch_exceptions=False,
            )

            assert result.exit_code == 0
            mock_subprocess.assert_called_once()

            assert mock_subprocess.call_args.args[0] == "torchrun"
            called_cmd = mock_subprocess.call_args.args[1]
            # Verify launcher args
            assert "--nproc_per_node=8" in called_cmd
            # Verify axolotl args are also present
            assert "--learning-rate=2e-4" in called_cmd
            assert "--micro-batch-size=4" in called_cmd

    def test_train_cloud_with_launcher_args(
        self, cli_runner, tmp_path, valid_test_config
    ):
        """Test train with cloud and launcher arguments"""
        config_path = tmp_path / "config.yml"
        config_path.write_text(valid_test_config)

        cloud_path = tmp_path / "cloud.yml"
        cloud_path.write_text("provider: modal\ngpu: a100")

        with patch("axolotl.cli.cloud.do_cli_train") as mock_cloud_train:
            result = cli_runner.invoke(
                cli,
                [
                    "train",
                    str(config_path),
                    "--cloud",
                    str(cloud_path),
                    "--launcher",
                    "torchrun",
                    "--",
                    "--nproc_per_node=4",
                    "--nnodes=2",
                ],
                catch_exceptions=False,
            )

            assert result.exit_code == 0
            mock_cloud_train.assert_called_once()

            # Verify cloud training was called with launcher args
            call_kwargs = mock_cloud_train.call_args.kwargs
            assert call_kwargs["launcher"] == "torchrun"
            assert call_kwargs["launcher_args"] == ["--nproc_per_node=4", "--nnodes=2"]


================================================
FILE: tests/cli/test_cli_version.py
================================================
"""pytest tests for axolotl CLI --version"""

from axolotl.cli.main import cli


def test_print_version(cli_runner):
    """Test that version is printed when --version is used."""

    result = cli_runner.invoke(cli, ["--version"])
    assert result.exit_code == 0
    assert "axolotl, version " in result.output


================================================
FILE: tests/cli/test_nested_options.py
================================================
"""Tests for nested config option handling via CLI dot-notation."""

import click
from click.testing import CliRunner
from pydantic import BaseModel, Field

from axolotl.cli.utils.args import add_options_from_config, filter_none_kwargs


class InnerConfig(BaseModel):
    """A nested config model for testing."""

    beta: float | None = Field(
        default=None,
        description="Beta parameter.",
    )
    host: str | None = Field(
        default=None,
        description="Server host.",
    )
    use_feature: bool = Field(
        default=False,
        description="Whether to use the feature.",
    )


class OuterConfig(BaseModel):
    """A top-level config model for testing."""

    learning_rate: float | None = Field(
        default=None,
        description="Learning rate.",
    )
    inner: InnerConfig | None = Field(
        default=None,
        description="Inner config.",
    )
    name: str | None = Field(
        default=None,
        description="Model name.",
    )


class TestAddOptionsFromConfigNested:
    """Test that add_options_from_config handles nested BaseModel fields."""

    def setup_method(self):
        self.runner = CliRunner()

    def test_nested_dot_notation_options_are_registered(self):
        """Nested model fields should create --parent.child CLI options."""

        @click.command()
        @add_options_from_config(OuterConfig)
        @filter_none_kwargs
        def cmd(**kwargs):
            for k, v in sorted(kwargs.items()):
                click.echo(f"{k}={v}")

        result = self.runner.invoke(cmd, ["--inner.beta=0.5", "--inner.host=localhost"])
        assert result.exit_code == 0, result.output
        assert "inner__beta=0.5" in result.output
        assert "inner__host=localhost" in result.output

    def test_nested_bool_option(self):
        """Nested bool fields should support --parent.field/--no-parent.field."""

        @click.command()
        @add_options_from_config(OuterConfig)
        @filter_none_kwargs
        def cmd(**kwargs):
            for k, v in sorted(kwargs.items()):
                click.echo(f"{k}={v}")

        result = self.runner.invoke(cmd, ["--inner.use-feature"])
        assert result.exit_code == 0, result.output
        assert "inner__use_feature=True" in result.output

    def test_flat_and_nested_options_together(self):
        """Flat and nested options should work together."""

        @click.command()
        @add_options_from_config(OuterConfig)
        @filter_none_kwargs
        def cmd(**kwargs):
            for k, v in sorted(kwargs.items()):
                click.echo(f"{k}={v}")

        result = self.runner.invoke(
            cmd, ["--learning-rate=0.001", "--inner.beta=0.1", "--name=test"]
        )
        assert result.exit_code == 0, result.output
        assert "learning_rate=0.001" in result.output
        assert "inner__beta=0.1" in result.output
        assert "name=test" in result.output

    def test_no_nested_options_passed(self):
        """When no nested options are passed, they should not appear in kwargs."""

        @click.command()
        @add_options_from_config(OuterConfig)
        @filter_none_kwargs
        def cmd(**kwargs):
            click.echo(f"keys={sorted(kwargs.keys())}")

        result = self.runner.invoke(cmd, ["--learning-rate=0.01"])
        assert result.exit_code == 0, result.output
        assert "inner__" not in result.output


class TestLoadCfgNestedKwargs:
    """Test that load_cfg correctly applies nested (double-underscore) kwargs."""

    @staticmethod
    def _apply_nested_kwargs(cfg, kwargs):
        """Helper that mirrors the nested kwargs handling from load_cfg,
        including type coercion for string CLI values."""
        from axolotl.cli.config import _coerce_value

        nested_kwargs: dict = {}
        flat_kwargs: dict = {}
        for key, value in kwargs.items():
            if "__" in key:
                parent, child = key.split("__", 1)
                nested_kwargs.setdefault(parent, {})[child] = value
            else:
                flat_kwargs[key] = value

        cfg_keys = cfg.keys()
        for key, value in flat_kwargs.items():
            if key in cfg_keys:
                cfg[key] = _coerce_value(value, cfg.get(key))

        for parent, children in nested_kwargs.items():
            if cfg[parent] is None:
                cfg[parent] = {}
            if not isinstance(cfg[parent], dict):
                cfg[parent] = {}
            for child_key, child_value in children.items():
                existing = cfg[parent].get(child_key)
                cfg[parent][child_key] = _coerce_value(child_value, existing)

        return cfg

    def test_nested_kwargs_applied_to_cfg(self, tmp_path):
        """Double-underscore kwargs should set nested config values."""
        from axolotl.utils.dict import DictDefault

        cfg = DictDefault({"trl": {"beta": 0.1}, "learning_rate": 0.01})
        # CLI passes strings, so simulate that
        kwargs = {
            "trl__beta": "0.5",
            "trl__host": "192.168.1.1",
            "learning_rate": "0.02",
        }

        cfg = self._apply_nested_kwargs(cfg, kwargs)

        assert cfg["learning_rate"] == 0.02
        assert isinstance(cfg["learning_rate"], float)
        assert cfg["trl"]["beta"] == 0.5
        assert isinstance(cfg["trl"]["beta"], float)
        assert cfg["trl"]["host"] == "192.168.1.1"

    def test_nested_kwargs_creates_parent_if_none(self):
        """If the parent key is None, nested kwargs should create the dict."""
        from axolotl.utils.dict import DictDefault

        cfg = DictDefault({"trl": None, "learning_rate": 0.01})
        cfg = self._apply_nested_kwargs(cfg, {"trl__beta": "0.5"})

        # No existing value, YAML-style inference: "0.5" -> 0.5
        assert cfg["trl"]["beta"] == 0.5
        assert isinstance(cfg["trl"]["beta"], float)

    def test_nested_kwargs_overwrites_string_parent(self):
        """If the parent key is a string, it should be replaced with a dict."""
        from axolotl.utils.dict import DictDefault

        cfg = DictDefault({"trl": "some_string", "learning_rate": 0.01})
        cfg = self._apply_nested_kwargs(cfg, {"trl__beta": "0.5"})

        assert cfg["trl"]["beta"] == 0.5


class TestCoerceValue:
    """Test YAML-style type coercion for CLI string values."""

    def test_coerce_with_existing_float(self):
        from axolotl.cli.config import _coerce_value

        assert _coerce_value("0.5", 0.1) == 0.5
        assert isinstance(_coerce_value("0.5", 0.1), float)

    def test_coerce_with_existing_int(self):
        from axolotl.cli.config import _coerce_value

        assert _coerce_value("42", 10) == 42
        assert isinstance(_coerce_value("42", 10), int)

    def test_coerce_with_existing_bool(self):
        from axolotl.cli.config import _coerce_value

        assert _coerce_value("true", False) is True
        assert _coerce_value("false", True) is False
        assert _coerce_value("1", False) is True
        assert _coerce_value("0", True) is False

    def test_coerce_yaml_inference_no_existing(self):
        """Without an existing value, use YAML-style inference."""
        from axolotl.cli.config import _coerce_value

        assert _coerce_value("true", None) is True
        assert _coerce_value("false", None) is False
        assert _coerce_value("42", None) == 42
        assert isinstance(_coerce_value("42", None), int)
        assert _coerce_value("3.14", None) == 3.14
        assert isinstance(_coerce_value("3.14", None), float)
        assert _coerce_value("null", None) is None
        assert _coerce_value("hello", None) == "hello"

    def test_coerce_non_string_passthrough(self):
        """Non-string values should pass through unchanged."""
        from axolotl.cli.config import _coerce_value

        assert _coerce_value(0.5, 0.1) == 0.5
        assert _coerce_value(True, False) is True


================================================
FILE: tests/cli/test_utils.py
================================================
"""pytest tests for axolotl CLI utils."""

import json
from unittest.mock import Mock, patch

import click
import pytest
import requests

from axolotl.cli.utils import fetch_from_github

# Sample GitHub API response
MOCK_TREE_RESPONSE = {
    "tree": [
        {"path": "examples/config1.yml", "type": "blob", "sha": "abc123"},
        {"path": "examples/config2.yml", "type": "blob", "sha": "def456"},
        {"path": "other/file.txt", "type": "blob", "sha": "xyz789"},
    ]
}


@pytest.fixture
def mock_responses():
    """Mock responses for API and file downloads"""

    def mock_get(url, timeout=None):
        response = Mock()
        if "api.github.com" in url:
            response.text = json.dumps(MOCK_TREE_RESPONSE)
        else:
            response.content = b"file content"
        return response

    return mock_get


def test_fetch_from_github_new_files(tmp_path, mock_responses):
    """Test fetching new files"""
    with patch("requests.get", mock_responses):
        fetch_from_github("examples/", tmp_path)

        # Verify files were created
        assert (tmp_path / "config1.yml").exists()
        assert (tmp_path / "config2.yml").exists()
        assert not (tmp_path / "file.txt").exists()


def test_fetch_from_github_unchanged_files(tmp_path, mock_responses):
    """Test handling of unchanged files"""
    # Create existing file with matching SHA
    existing_file = tmp_path / "config1.yml"
    existing_file.write_bytes(b"file content")

    with patch("requests.get", mock_responses):
        fetch_from_github("examples/", tmp_path)

        # File should not be downloaded again
        assert existing_file.read_bytes() == b"file content"


def test_fetch_from_github_invalid_prefix(mock_responses):
    """Test error handling for invalid directory prefix"""
    with patch("requests.get", mock_responses):
        with pytest.raises(click.ClickException):
            fetch_from_github("nonexistent/", None)


def test_fetch_from_github_network_error():
    """Test handling of network errors"""
    with patch("requests.get", side_effect=requests.RequestException):
        with pytest.raises(requests.RequestException):
            fetch_from_github("examples/", None)


def assert_launcher_args_in_command(
    mock_subprocess_call,
    launcher: str,
    expected_launcher_args: list[str],
    command_module: str,
):
    """
    Helper function to verify launcher arguments are properly passed in subprocess calls.

    Args:
        mock_subprocess_call: The mock subprocess.run call
        launcher: Expected launcher ("accelerate", "torchrun", etc.)
        expected_launcher_args: List of expected launcher arguments
        command_module: Expected module name (e.g., "axolotl.cli.train")
    """
    assert mock_subprocess_call.called, "subprocess.run should have been called"
    called_cmd = mock_subprocess_call.call_args.args[0]

    # Verify launcher
    assert called_cmd[0] == launcher, (
        f"Expected launcher {launcher}, got {called_cmd[0]}"
    )

    # Verify launcher args are present
    for arg in expected_launcher_args:
        assert arg in called_cmd, (
            f"Expected launcher arg '{arg}' not found in command: {called_cmd}"
        )

    # Verify module is present
    assert "-m" in called_cmd, "Expected -m flag for module execution"
    assert command_module in called_cmd, (
        f"Expected module {command_module} not found in command: {called_cmd}"
    )


def assert_no_launcher_args_contamination(mock_subprocess_call, launcher: str):
    """
    Helper function to verify no unwanted launcher arguments are present.

    Args:
        mock_subprocess_call: The mock subprocess.run call
        launcher: Expected launcher ("accelerate", "torchrun", etc.)
    """
    assert mock_subprocess_call.called, "subprocess.run should have been called"
    called_cmd = mock_subprocess_call.call_args.args[0]

    if launcher == "accelerate":
        # For accelerate, launcher args should be between 'launch' and '-m'
        launch_idx = called_cmd.index("launch")
        m_idx = called_cmd.index("-m")
        launcher_section = called_cmd[launch_idx + 1 : m_idx]
        assert len(launcher_section) == 0, (
            f"Unexpected launcher args found: {launcher_section}"
        )
    elif launcher == "torchrun":
        # For torchrun, launcher args should be between 'torchrun' and '-m'
        torchrun_idx = called_cmd.index("torchrun")
        m_idx = called_cmd.index("-m")
        launcher_section = called_cmd[torchrun_idx + 1 : m_idx]
        assert len(launcher_section) == 0, (
            f"Unexpected launcher args found: {launcher_section}"
        )


@pytest.fixture
def common_launcher_args():
    """Fixture providing common launcher argument combinations for testing."""
    return {
        "torchrun": ["--nproc_per_node=2", "--nnodes=1"],
        "accelerate": ["--config_file=accelerate_config.yml", "--num_processes=4"],
    }


def test_add_default_rdzv_args_with_endpoint():
    """Test that default RDZV args are added when rdzv_endpoint is present."""
    from axolotl.cli.utils.train import _add_default_rdzv_args

    launcher_args = ["--nnodes=2", "--rdzv_endpoint=127.0.0.1:29400"]
    result = _add_default_rdzv_args(launcher_args)

    # Should have added rdzv_backend
    assert "--rdzv_backend" in result
    assert "c10d" in result

    # Original args should still be present
    assert "--nnodes=2" in result
    assert "--rdzv_endpoint=127.0.0.1:29400" in result


def test_add_default_rdzv_args_with_existing_backend():
    """Test that existing rdzv_backend is not overridden."""
    from axolotl.cli.utils.train import _add_default_rdzv_args

    launcher_args = [
        "--nnodes=2",
        "--rdzv_endpoint=127.0.0.1:29400",
        "--rdzv_backend=static",
    ]
    result = _add_default_rdzv_args(launcher_args)

    # Should not add another rdzv_backend
    backend_count = sum(1 for arg in result if "--rdzv_backend" in arg)
    assert backend_count == 1
    assert "--rdzv_backend=static" in result


def test_add_default_rdzv_args_with_existing_id():
    """Test that existing rdzv_id is not overridden."""
    from axolotl.cli.utils.train import _add_default_rdzv_args

    launcher_args = [
        "--nnodes=2",
        "--rdzv_endpoint=127.0.0.1:29400",
        "--rdzv_id=my_job_123",
    ]
    result = _add_default_rdzv_args(launcher_args)

    # Should not add another rdzv_id
    id_count = sum(1 for arg in result if "--rdzv_id" in arg)
    assert id_count == 1
    assert "--rdzv_id=my_job_123" in result

    # Should still add rdzv_backend
    assert "--rdzv_backend" in result
    assert "c10d" in result


def test_add_default_rdzv_args_without_endpoint():
    """Test that no RDZV args are added when rdzv_endpoint is not present."""
    from axolotl.cli.utils.train import _add_default_rdzv_args

    launcher_args = ["--nnodes=2", "--nproc_per_node=4"]
    result = _add_default_rdzv_args(launcher_args)

    # Should not add any rdzv args
    assert "--rdzv_backend" not in result
    assert result == launcher_args


def test_add_default_rdzv_args_with_all_existing():
    """Test that no defaults are added when all RDZV args are present."""
    from axolotl.cli.utils.train import _add_default_rdzv_args

    launcher_args = [
        "--nnodes=2",
        "--rdzv_endpoint=127.0.0.1:29400",
        "--rdzv_backend=static",
        "--rdzv_id=existing_job",
    ]
    result = _add_default_rdzv_args(launcher_args)

    # Should not add any additional args
    assert len(result) == len(launcher_args)
    assert result == launcher_args


================================================
FILE: tests/conftest.py
================================================
"""Shared pytest fixtures"""

import functools
import importlib
import logging
import os
import shutil
import sys
import tempfile
import time
from pathlib import Path
from typing import Generator

import datasets
import pytest
import requests
import torch
from huggingface_hub import snapshot_download
from huggingface_hub.errors import LocalEntryNotFoundError
from tokenizers import AddedToken
from transformers import AutoTokenizer

from axolotl.utils.dict import DictDefault

from tests.hf_offline_utils import (
    enable_hf_offline,
    hf_offline_context,
)

logging.getLogger("filelock").setLevel(logging.CRITICAL)


def retry_on_request_exceptions(max_retries=3, delay=1):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            for attempt in range(max_retries):
                try:
                    return func(*args, **kwargs)
                except (
                    requests.exceptions.ReadTimeout,
                    requests.exceptions.ConnectionError,
                    requests.exceptions.HTTPError,
                ) as exc:
                    if attempt < max_retries - 1:
                        wait = 2**attempt * delay  # in seconds
                        time.sleep(wait)
                    else:
                        raise exc

        return wrapper

    return decorator


@retry_on_request_exceptions(max_retries=3, delay=5)
def snapshot_download_w_retry(*args, **kwargs):
    """
    download a model or dataset from HF Hub, retrying in requests failures. We also try to fetch it from the local
    cache first using hf_hub_offline to avoid hitting HF Hub API rate limits. If it doesn't exist in the cache,
    disable hf_hub_offline and actually fetch from the hub
    """
    with hf_offline_context(True):
        try:
            return snapshot_download(*args, local_files_only=True, **kwargs)
        except LocalEntryNotFoundError:
            pass
    with hf_offline_context(False):
        return snapshot_download(*args, **kwargs)


@pytest.fixture(scope="session", autouse=True)
def download_ds_fixture_bundle():
    ds_dir = snapshot_download_w_retry(
        "axolotl-ai-internal/axolotl-oss-dataset-fixtures", repo_type="dataset"
    )
    return Path(ds_dir)


@pytest.fixture(scope="session", autouse=True)
def download_smollm2_135m_model():
    # download the model
    snapshot_download_w_retry("HuggingFaceTB/SmolLM2-135M", repo_type="model")


@pytest.fixture(scope="session", autouse=True)
def download_smollm2_135m_instruct_model():
    # download the model
    snapshot_download_w_retry("HuggingFaceTB/SmolLM2-135M-Instruct", repo_type="model")


@pytest.fixture(scope="session", autouse=True)
def download_smollm2_135m_gptq_model():
    # download the model
    snapshot_download_w_retry("lilmeaty/SmolLM2-135M-Instruct-GPTQ", repo_type="model")


@pytest.fixture(scope="session", autouse=True)
def download_qwen_2_5_half_billion_model():
    # download the model
    snapshot_download_w_retry("Qwen/Qwen2.5-0.5B", repo_type="model")


@pytest.fixture(scope="session", autouse=True)
def download_qwen3_half_billion_model():
    # download the model
    snapshot_download_w_retry("Qwen/Qwen3-0.6B", repo_type="model")


@pytest.fixture(scope="session", autouse=True)
def download_tatsu_lab_alpaca_dataset():
    # download the dataset
    snapshot_download_w_retry("tatsu-lab/alpaca", repo_type="dataset")


@pytest.fixture(scope="session", autouse=True)
def download_mhenrichsen_alpaca_2k_dataset():
    # download the dataset
    snapshot_download_w_retry("mhenrichsen/alpaca_2k_test", repo_type="dataset")


@pytest.fixture(scope="session", autouse=True)
def download_mhenrichsen_alpaca_2k_w_revision_dataset():
    # download the dataset
    snapshot_download_w_retry(
        "mhenrichsen/alpaca_2k_test", repo_type="dataset", revision="d05c1cb"
    )


@pytest.fixture(scope="session", autouse=True)
def download_mlabonne_finetome_100k_dataset():
    # download the dataset
    snapshot_download_w_retry("mlabonne/FineTome-100k", repo_type="dataset")


@pytest.fixture(scope="session", autouse=True)
def download_argilla_distilabel_capybara_dpo_7k_binarized_dataset():
    # download the dataset
    snapshot_download_w_retry(
        "argilla/distilabel-capybara-dpo-7k-binarized", repo_type="dataset"
    )


@pytest.fixture(scope="session", autouse=True)
def download_argilla_distilabel_intel_orca_dpo_dataset():
    # download the dataset
    snapshot_download_w_retry(
        "argilla/distilabel-intel-orca-dpo-pairs", repo_type="dataset"
    )


@pytest.fixture(scope="session", autouse=True)
def download_argilla_ultrafeedback_binarized_preferences_cleaned_dataset():
    # download the dataset
    snapshot_download_w_retry(
        "argilla/ultrafeedback-binarized-preferences-cleaned", repo_type="dataset"
    )


@pytest.fixture(scope="session", autouse=True)
def download_argilla_ultrafeedback_binarized_preferences_cleaned_kto_dataset():
    # download the dataset
    snapshot_download_w_retry(
        "argilla/ultrafeedback-binarized-preferences-cleaned-kto", repo_type="dataset"
    )


# @pytest.fixture(scope="session", autouse=True)
# def download_fozzie_alpaca_dpo_dataset():
#     # download the dataset
#     snapshot_download_w_retry(
#         "fozziethebeat/alpaca_messages_2k_dpo_test", repo_type="dataset"
#     )
#     snapshot_download_w_retry(
#         "fozziethebeat/alpaca_messages_2k_dpo_test",
#         repo_type="dataset",
#         revision="ea82cff",
#     )


# @pytest.fixture(scope="session")
# @disable_hf_offline
# def dataset_fozzie_alpaca_dpo_dataset(
#     download_fozzie_alpaca_dpo_dataset,
# ):
#     return load_dataset("fozziethebeat/alpaca_messages_2k_dpo_test", split="train")
#
#
# @pytest.fixture(scope="session")
# @disable_hf_offline
# def dataset_fozzie_alpaca_dpo_dataset_rev_ea82cff(
#     download_fozzie_alpaca_dpo_dataset,
# ):
#     return load_dataset(
#         "fozziethebeat/alpaca_messages_2k_dpo_test", split="train", revision="ea82cff"
#     )


@pytest.fixture(scope="session", autouse=True)
def download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset():
    # download the dataset
    snapshot_download_w_retry(
        "arcee-ai/distilabel-intel-orca-dpo-pairs-binarized", repo_type="dataset"
    )


@pytest.fixture(scope="session", autouse=True)
def download_argilla_dpo_pairs_dataset():
    # download the dataset
    snapshot_download_w_retry(
        "argilla/distilabel-intel-orca-dpo-pairs", repo_type="dataset"
    )


@pytest.fixture(scope="session", autouse=True)
def download_tiny_shakespeare_dataset():
    # download the dataset
    snapshot_download_w_retry("winglian/tiny-shakespeare", repo_type="dataset")


@pytest.fixture(scope="session", autouse=True)
def download_evolkit_kd_sample_dataset():
    # download the dataset
    snapshot_download_w_retry(
        "axolotl-ai-co/evolkit-logprobs-pipeline-75k-v2-sample", repo_type="dataset"
    )


@pytest.fixture(scope="session", autouse=True)
def download_deepseek_model_fixture():
    snapshot_download_w_retry("axolotl-ai-co/DeepSeek-V3-11M", repo_type="model")


@pytest.fixture(scope="session", autouse=True)
def download_huggyllama_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
        "huggyllama/llama-7b",
        repo_type="model",
        allow_patterns=["*token*", "config.json"],
    )


@pytest.fixture(scope="session", autouse=True)
def download_llama33_70b_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
        "axolotl-ai-co/Llama-3.3-70B-Instruct-tokenizer",
        repo_type="model",
        allow_patterns=["*token*", "config.json"],
    )


@pytest.fixture(scope="session", autouse=True)
def download_llama_1b_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
        "NousResearch/Llama-3.2-1B",
        repo_type="model",
        allow_patterns=["*token*", "config.json"],
    )


@pytest.fixture(scope="session", autouse=True)
def download_llama3_8b_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
        "NousResearch/Meta-Llama-3-8B",
        repo_type="model",
        allow_patterns=["*token*", "config.json"],
    )


@pytest.fixture(scope="session", autouse=True)
def download_llama3_8b_instruct_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
        "NousResearch/Meta-Llama-3-8B-Instruct",
        repo_type="model",
        allow_patterns=["*token*", "config.json"],
    )


@pytest.fixture(scope="session", autouse=True)
def download_phi_35_mini_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
        "microsoft/Phi-3.5-mini-instruct",
        repo_type="model",
        allow_patterns=["*token*", "config.json"],
    )


@pytest.fixture(scope="session", autouse=True)
def download_phi_4_reasoning_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
        "microsoft/Phi-4-reasoning",
        repo_type="model",
        allow_patterns=["*token*", "config.json"],
    )


@pytest.fixture(scope="session", autouse=True)
def download_phi_3_medium_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
        "microsoft/Phi-3-medium-128k-instruct",
        repo_type="model",
        allow_patterns=["*token*", "config.json"],
    )


@pytest.fixture(scope="session", autouse=True)
def download_mistral_7b_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
        "casperhansen/mistral-7b-instruct-v0.1-awq",
        repo_type="model",
        allow_patterns=["*token*", "config.json"],
    )


@pytest.fixture(scope="session", autouse=True)
def download_gemma3_4b_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
        "mlx-community/gemma-3-4b-it-8bit",
        repo_type="model",
        allow_patterns=["*token*", "config.json"],
    )


@pytest.fixture(scope="session", autouse=True)
def download_gemma_2b_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
        "unsloth/gemma-2b-it",
        revision="703fb4a",
        repo_type="model",
        allow_patterns=["*token*", "config.json"],
    )


@pytest.fixture(scope="session", autouse=True)
def download_gemma2_9b_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
        "mlx-community/gemma-2-9b-it-4bit",
        repo_type="model",
        allow_patterns=["*token*", "config.json"],
    )


@pytest.fixture(scope="session", autouse=True)
def download_mlx_mistral_7b_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
        "mlx-community/Mistral-7B-Instruct-v0.3-4bit",
        repo_type="model",
        allow_patterns=["*token*", "config.json"],
    )


@pytest.fixture
def download_llama2_model_fixture():
    # download the tokenizer only
    snapshot_download_w_retry(
        "NousResearch/Llama-2-7b-hf",
        repo_type="model",
        allow_patterns=["*token*", "config.json"],
    )


@pytest.fixture(scope="session", autouse=True)
def download_llama32_1b_model_fixture():
    snapshot_download_w_retry(
        "osllmai-community/Llama-3.2-1B",
        repo_type="model",
    )


@pytest.fixture
@enable_hf_offline
def tokenizer_huggyllama(
    download_huggyllama_model_fixture,
):
    tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
    tokenizer.pad_token = "</s>"

    return tokenizer


@pytest.fixture
@enable_hf_offline
def tokenizer_huggyllama_w_special_tokens(
    tokenizer_huggyllama,
):
    tokenizer_huggyllama.add_special_tokens(
        {
            "bos_token": "<s>",
            "eos_token": "</s>",
            "unk_token": "<unk>",
        }
    )

    return tokenizer_huggyllama


@pytest.fixture
@enable_hf_offline
def tokenizer_llama2_7b(
    download_llama2_model_fixture,
):
    tokenizer = AutoTokenizer.from_pretrained("NousResearch/Llama-2-7b-hf")

    return tokenizer


@pytest.fixture
@enable_hf_offline
def tokenizer_mistral_7b_instruct(
    download_mlx_mistral_7b_model_fixture,
):
    return AutoTokenizer.from_pretrained("casperhansen/mistral-7b-instruct-v0.1-awq")


@pytest.fixture
def tokenizer_mistral_7b_instruct_chatml(tokenizer_mistral_7b_instruct):
    tokenizer_mistral_7b_instruct.add_special_tokens(
        {
            "eos_token": AddedToken(
                "<|im_end|>", rstrip=False, lstrip=False, normalized=False
            )
        }
    )
    tokenizer_mistral_7b_instruct.add_tokens(
        [
            AddedToken("<|im_start|>", rstrip=False, lstrip=False, normalized=False),
        ]
    )
    return tokenizer_mistral_7b_instruct


@pytest.fixture
def temp_dir() -> Generator[str, None, None]:
    # Create a temporary directory
    _temp_dir = tempfile.mkdtemp()
    yield _temp_dir
    # Clean up the directory after the test
    shutil.rmtree(_temp_dir)


@pytest.fixture(scope="function", autouse=True)
def torch_manual_seed():
    torch.manual_seed(42)


@pytest.fixture(scope="function", autouse=True)
def cleanup_monkeypatches():
    from transformers import Trainer
    from transformers.models.llama.modeling_llama import (  # LlamaFlashAttention2,
        LlamaAttention,
        LlamaForCausalLM,
    )

    # original_fa2_forward = LlamaFlashAttention2.forward
    original_llama_attn_forward = LlamaAttention.forward
    original_llama_forward = LlamaForCausalLM.forward
    original_trainer_inner_training_loop = Trainer._inner_training_loop
    original_trainer_training_step = Trainer.training_step
    # monkey patches can happen inside the tests
    yield
    # Reset LlamaFlashAttention2 forward
    # LlamaFlashAttention2.forward = original_fa2_forward
    LlamaAttention.forward = original_llama_attn_forward
    LlamaForCausalLM.forward = original_llama_forward
    Trainer._inner_training_loop = original_trainer_inner_training_loop
    Trainer.training_step = original_trainer_training_step

    # Reset other known monkeypatches
    modules_to_reset: list[tuple[str, list[str]]] = [
        ("transformers.models.llama",),
        (
            "transformers.models.llama.modeling_llama",
            [
                # "LlamaFlashAttention2",
                "LlamaAttention",
            ],
        ),
        ("transformers.trainer",),
        ("transformers", ["Trainer"]),
        ("transformers.loss.loss_utils",),
    ]
    for module_name_tuple in modules_to_reset:
        module_name = module_name_tuple[0]

        spec = importlib.util.spec_from_file_location(
            module_name, sys.modules[module_name].__file__
        )
        sys.modules[module_name] = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(sys.modules[module_name])

        sys.modules[module_name] = importlib.reload(sys.modules[module_name])
        if len(module_name_tuple) > 1:
            module_globals = module_name_tuple[1]
            for module_global in module_globals:
                globals().pop(module_global, None)


@pytest.fixture
def dataset_winglian_tiny_shakespeare(
    download_ds_fixture_bundle: Path,
):
    ds_path = download_ds_fixture_bundle / "winglian__tiny-shakespeare"
    return datasets.load_from_disk(ds_path)


@pytest.fixture
def dataset_tatsu_lab_alpaca(
    download_ds_fixture_bundle: Path,
):
    ds_path = download_ds_fixture_bundle / "tatsu-lab__alpaca"
    return datasets.load_from_disk(ds_path)["train"]


@pytest.fixture
def dataset_mhenrichsen_alpaca_2k_test(
    download_ds_fixture_bundle: Path,
):
    ds_path = download_ds_fixture_bundle / "mhenrichsen__alpaca_2k_test"
    return datasets.load_from_disk(ds_path)["train"]


@pytest.fixture
def dataset_argilla_ultrafeedback_binarized_preferences_cleaned(
    download_ds_fixture_bundle: Path,
):
    ds_path = (
        download_ds_fixture_bundle
        / "argilla__ultrafeedback-binarized-preferences-cleaned"
    )
    return datasets.load_from_disk(ds_path)["train"]


@pytest.fixture
def dataset_fozziethebeat_alpaca_messages_2k_dpo_test(
    download_ds_fixture_bundle: Path,
):
    ds_path = download_ds_fixture_bundle / "fozziethebeat__alpaca_messages_2k_dpo_test"
    return datasets.load_from_disk(ds_path)["train"]


@pytest.fixture
def dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff(
    download_ds_fixture_bundle: Path,
):
    ds_path = (
        download_ds_fixture_bundle
        / "fozziethebeat__alpaca_messages_2k_dpo_test__rev_ea82cff"
    )
    return datasets.load_from_disk(ds_path)["train"]


@pytest.fixture(name="min_base_cfg")
def fixture_min_base_cfg():
    return DictDefault(
        base_model="HuggingFaceTB/SmolLM2-135M",
        learning_rate=1e-3,
        datasets=[
            {
                "path": "mhenrichsen/alpaca_2k_test",
                "type": "alpaca",
            },
        ],
        micro_batch_size=1,
        gradient_accumulation_steps=1,
    )


#
@pytest.mark.skipif(
    os.environ.get("AXOLOTL_IS_CI_CACHE_PRELOAD", "-1") != "1",
    reason="Not running in CI cache preload",
)
def test_load_fixtures(
    download_smollm2_135m_model,
    download_qwen_2_5_half_billion_model,
    download_tatsu_lab_alpaca_dataset,
    download_mhenrichsen_alpaca_2k_dataset,
    download_mhenrichsen_alpaca_2k_w_revision_dataset,
    download_mlabonne_finetome_100k_dataset,
    download_argilla_ultrafeedback_binarized_preferences_cleaned_dataset,
    download_argilla_ultrafeedback_binarized_preferences_cleaned_kto_dataset,
    download_argilla_distilabel_capybara_dpo_7k_binarized_dataset,
    download_arcee_ai_distilabel_intel_orca_dpo_pairs_dataset,
    download_argilla_dpo_pairs_dataset,
    download_tiny_shakespeare_dataset,
    download_deepseek_model_fixture,
    download_huggyllama_model_fixture,
    download_llama_1b_model_fixture,
    download_llama3_8b_model_fixture,
    download_llama3_8b_instruct_model_fixture,
    download_phi_35_mini_model_fixture,
    download_phi_3_medium_model_fixture,
    download_phi_4_reasoning_model_fixture,
    download_mistral_7b_model_fixture,
    download_gemma_2b_model_fixture,
    download_gemma2_9b_model_fixture,
    download_mlx_mistral_7b_model_fixture,
    download_llama2_model_fixture,
):
    pass


@pytest.fixture(autouse=True)
def disable_telemetry(monkeypatch):
    monkeypatch.setenv("AXOLOTL_DO_NOT_TRACK", "1")
    yield


================================================
FILE: tests/constants.py
================================================
# constants.py
"""
This module contains constants and configuration dictionaries used for
datasets and other utilities in the Axolotl project, specifically for testing.
"""

# Configuration for Alpaca Messages Dataset
ALPACA_MESSAGES_CONFIG_OG = {
    "path": "fozziethebeat/alpaca_messages_2k_dpo_test",
    "type": "chat_template.default",
    "chat_template": "llama3",
    "field_messages": "conversation",
    "field_chosen": "chosen",
    "field_rejected": "rejected",
    "message_field_role": "role",
    "message_field_content": "content",
    "roles": {
        "system": ["system"],
        "user": ["user"],
        "assistant": ["assistant"],
    },
}

# Revision configuration extending the original
ALPACA_MESSAGES_CONFIG_REVISION = ALPACA_MESSAGES_CONFIG_OG.copy()
ALPACA_MESSAGES_CONFIG_REVISION["revision"] = "ea82cff"


SPECIAL_TOKENS = {
    "bos_token": "<s>",
    "eos_token": "</s>",
    "unk_token": "<unk>",
}


================================================
FILE: tests/core/chat/__init__.py
================================================


================================================
FILE: tests/core/chat/format/__init__.py
================================================


================================================
FILE: tests/core/chat/test_messages.py
================================================
"""
Tests for the chat messages module
"""

import unittest

import pytest
from transformers import AddedToken, AutoTokenizer

from axolotl.core.chat.format.chatml import format_message
from axolotl.core.chat.messages import ChatFormattedChats, Chats

from tests.hf_offline_utils import enable_hf_offline  # noqa


@pytest.fixture(scope="session", name="llama_tokenizer")
@enable_hf_offline
def llama_tokenizer_fixture():
    return AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B")


@pytest.fixture(scope="session", name="chatml_tokenizer")
def llama_tokenizer_w_chatml(llama_tokenizer):
    llama_tokenizer.add_special_tokens(
        {
            "eos_token": AddedToken(
                "<|im_end|>", rstrip=False, lstrip=False, normalized=False
            )
        }
    )
    llama_tokenizer.add_tokens(
        [
            AddedToken("<|im_start|>", rstrip=False, lstrip=False, normalized=False),
        ]
    )

    return llama_tokenizer


@pytest.fixture(scope="session", name="chat_msgs")
def chat_msgs_fixture():
    return {
        "conversation": [
            {
                "role": "system",
                "content": [
                    {"type": "text", "value": "You are a helpful assistant."},
                ],
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "value": "What is today's stock price of Apple?"},
                ],
            },
            {
                "role": "assistant",
                "content": [
                    {
                        "type": "tool_call",
                        "value": {
                            "name": "get_date",
                            "arguments": {},
                        },
                    },
                    {
                        "type": "tool_call",
                        "value": {
                            "name": "get_stock_price",
                            "arguments": {"symbol": "AAPL"},
                        },
                    },
                ],
                "weight": 1,
            },
            {
                "role": "tool",
                "content": [
                    {
                        "type": "tool_response",
                        "value": {
                            "name": "get_date",
                            "content": {"date": "2024-09-09"},
                        },
                    },
                    {
                        "type": "tool_response",
                        "value": {
                            "name": "get_stock_price",
                            "content": {"symbol": "AAPL", "price": 123.45},
                        },
                    },
                ],
            },
            {
                "role": "assistant",
                "content": [
                    {
                        "type": "text",
                        "value": "The stock price of Apple is $123.45.\n",
                        "weight": 0,
                    },
                    {
                        "type": "text",
                        "value": "<reflection>The original query asked for today's stock price of Apple. This implies they also wanted the date included in the response.</reflection>",
                    },
                    {
                        "type": "text",
                        "value": "The stock price of Apple on September 9, 2024 is $123.45.",
                    },
                ],
                "weight": 1,
            },
        ]
    }


class TestMessagesCase:
    """
    Test cases for the chat messages module
    """

    def test_tool_call_stringify(self, chat_msgs):
        chat_msgs_as_obj = Chats(**chat_msgs)
        assert '{"name": "get_stock_price", "arguments": {"symbol": "AAPL"}}' == str(
            chat_msgs_as_obj.conversation[2].content[1].value
        )

    def test_chatml_formatted_wrapper(self, chat_msgs):
        chat_msg_formatted = ChatFormattedChats(**chat_msgs, formatter=format_message)
        target_chatml = """<|im_start|>system
You are a helpful assistant.<|im_end|>
<|im_start|>user
What is today's stock price of Apple?<|im_end|>
<|im_start|>assistant
<tool_call>
{"name": "get_date", "arguments": {}}
</tool_call>
<tool_call>
{"name": "get_stock_price", "arguments": {"symbol": "AAPL"}}
</tool_call>
<|im_end|>
<|im_start|>tool
<tool_response>
{"name": "get_date", "content": {"date": "2024-09-09"}}
</tool_response>
<tool_response>
{"name": "get_stock_price", "content": {"symbol": "AAPL", "price": 123.45}}
</tool_response>
<|im_end|>
<|im_start|>assistant
The stock price of Apple is $123.45.
<reflection>The original query asked for today's stock price of Apple. This implies they also wanted the date included in the response.</reflection>The stock price of Apple on September 9, 2024 is $123.45.<|im_end|>\n"""
        assert target_chatml == str(chat_msg_formatted)

    def test_chatml_formatting_tool_call(self, chat_msgs):
        chat_msgs_as_obj = Chats(**chat_msgs)
        target_chatml_turn2 = """<|im_start|>assistant\n<tool_call>\n{"name": "get_date", "arguments": {}}\n</tool_call>\n<tool_call>\n{"name": "get_stock_price", "arguments": {"symbol": "AAPL"}}\n</tool_call>\n<|im_end|>\n"""
        assert target_chatml_turn2 == str(
            format_message(chat_msgs_as_obj.conversation[2])
        )

    def test_train_labels(self, chatml_tokenizer, chat_msgs):
        chat_msg_formatted = ChatFormattedChats(**chat_msgs, formatter=format_message)
        tokenized = chat_msg_formatted.conversation[2].tokenized(chatml_tokenizer)
        # fmt: off
        target_labels = [
            -100, -100, -100,  # role
            27, 14506, 13735, 397, 5018, 609, 794,
            330, 456, 4257, 498, 330, 16774, 794, 4792, 534, 524,
            14506, 13735, 397, 27, 14506, 13735, 397, 5018, 609, 794,
            330, 456, 31641, 9217, 498, 330, 16774, 794, 5324, 19314,
            794, 330, 84016, 43, 96742, 524, 14506, 13735, 397,
            128256,  # <|im_end|>
            -100  # trailing newline
        ]
        # fmt: on
        assert tokenized["labels"] == target_labels

    def test_train_labels_2(self, chatml_tokenizer, chat_msgs):
        # also test if indivudal contents are set not to train
        chat_msg_formatted = ChatFormattedChats(**chat_msgs, formatter=format_message)
        tokenized = chat_msg_formatted.conversation[4].tokenized(chatml_tokenizer)
        # fmt: off
        target_labels = [
            -100, -100, -100,  # role
            -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,  # initial response
            27, 78098, 16761, 4113, 3319, 4691, 369, 3432, 596, 5708, 3430,
            315, 8325, 13, 1115, 24897, 814, 1101, 4934, 279, 2457,
            5343, 304, 279, 2077, 4005, 78098, 16761, 5708, 3430, 315,
            8325, 389, 6250, 220, 24, 11, 220, 2366, 19, 374, 400,
            4513, 13, 1774, 13,
            128256,  # <|im_end|>
            -100,  # trailing newline
        ]
        # fmt: on
        assert tokenized["labels"] == target_labels


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/core/test_async_grpo.py
================================================
"""Unit tests for async GRPO"""

import unittest
from unittest.mock import MagicMock

import torch


class TestReplayBuffer(unittest.TestCase):
    """Tests for ReplayBuffer edge cases."""

    def test_add_noop_when_max_size_zero(self):
        from axolotl.core.trainers.grpo.replay_buffer import ReplayBuffer

        buf = ReplayBuffer(max_size=0)
        buf.add(1.0, {"data": "test"})
        self.assertEqual(len(buf), 0)

    def test_add_noop_when_max_size_negative(self):
        from axolotl.core.trainers.grpo.replay_buffer import ReplayBuffer

        buf = ReplayBuffer(max_size=-1)
        buf.add(1.0, {"data": "test"})
        self.assertEqual(len(buf), 0)

    def test_sample_returns_none_when_max_size_zero(self):
        from axolotl.core.trainers.grpo.replay_buffer import ReplayBuffer

        buf = ReplayBuffer(max_size=0)
        self.assertIsNone(buf.sample(1))

    def test_sample_returns_none_when_empty(self):
        from axolotl.core.trainers.grpo.replay_buffer import ReplayBuffer

        buf = ReplayBuffer(max_size=5)
        self.assertIsNone(buf.sample(1))

    def test_normal_add_and_sample(self):
        from axolotl.core.trainers.grpo.replay_buffer import ReplayBuffer

        buf = ReplayBuffer(max_size=3)
        buf.add(1.0, {"a": 1})
        buf.add(2.0, {"a": 2})
        buf.add(3.0, {"a": 3})
        self.assertEqual(len(buf), 3)
        result = buf.sample(1)
        self.assertIsNotNone(result)
        self.assertEqual(len(result), 1)

    def test_replaces_lowest_when_full(self):
        from axolotl.core.trainers.grpo.replay_buffer import ReplayBuffer

        buf = ReplayBuffer(max_size=2)
        buf.add(1.0, {"a": 1})
        buf.add(2.0, {"a": 2})
        buf.add(3.0, {"a": 3})  # should replace score=1.0
        self.assertEqual(len(buf), 2)
        scores = sorted(item[0] for item in buf._heap)
        self.assertEqual(scores, [2.0, 3.0])


class TestGRPOStrategyConflict(unittest.TestCase):
    """Tests for sequence_parallel + async_grpo conflict detection."""

    def test_raises_on_both_enabled(self):
        from axolotl.core.trainers.grpo import GRPOStrategy

        with self.assertRaises(ValueError) as ctx:
            GRPOStrategy.get_trainer_class(sequence_parallel=True, async_grpo=True)
        self.assertIn("sequence_parallel", str(ctx.exception))
        self.assertIn("async_grpo", str(ctx.exception))

    def test_sequence_parallel_only(self):
        from axolotl.core.trainers.grpo import GRPOStrategy
        from axolotl.core.trainers.grpo.trainer import (
            AxolotlGRPOSequenceParallelTrainer,
        )

        cls = GRPOStrategy.get_trainer_class(sequence_parallel=True, async_grpo=False)
        self.assertIs(cls, AxolotlGRPOSequenceParallelTrainer)

    def test_async_only(self):
        from axolotl.core.trainers.grpo import GRPOStrategy
        from axolotl.core.trainers.grpo.trainer import AxolotlAsyncGRPOTrainer

        cls = GRPOStrategy.get_trainer_class(sequence_parallel=False, async_grpo=True)
        self.assertIs(cls, AxolotlAsyncGRPOTrainer)

    def test_neither(self):
        from axolotl.core.trainers.grpo import GRPOStrategy
        from axolotl.core.trainers.grpo.trainer import AxolotlGRPOTrainer

        cls = GRPOStrategy.get_trainer_class(sequence_parallel=False, async_grpo=False)
        self.assertIs(cls, AxolotlGRPOTrainer)


class TestDequantizeFP8TailBlocks(unittest.TestCase):
    """Tests for FP8 dequantization with non-divisible dimensions."""

    def test_exact_divisible_shape(self):
        from axolotl.kernels.quantize import dequantize_fp8

        W = torch.randn(256, 128, dtype=torch.bfloat16).to(torch.float8_e4m3fn)
        scale_inv = torch.ones(2, 1, dtype=torch.bfloat16)
        result = dequantize_fp8(W, scale_inv)
        self.assertEqual(result.shape, (256, 128))
        self.assertEqual(result.dtype, torch.bfloat16)

    def test_non_divisible_rows(self):
        from axolotl.kernels.quantize import dequantize_fp8

        # 130 rows, scale has 2 blocks (block_size ~65 for exact div, but with
        # tail blocks: first block=65 rows, second=65 rows, 130%2=0 actually).
        # Use 131 rows with 2 scale blocks to trigger tail handling.
        W = torch.ones(131, 128, dtype=torch.bfloat16).to(torch.float8_e4m3fn)
        scale_inv = torch.tensor([[2.0], [3.0]], dtype=torch.bfloat16)
        result = dequantize_fp8(W, scale_inv)
        self.assertEqual(result.shape, (131, 128))
        self.assertEqual(result.dtype, torch.bfloat16)

    def test_non_divisible_cols(self):
        from axolotl.kernels.quantize import dequantize_fp8

        W = torch.ones(128, 200, dtype=torch.bfloat16).to(torch.float8_e4m3fn)
        scale_inv = torch.ones(1, 2, dtype=torch.bfloat16)
        result = dequantize_fp8(W, scale_inv)
        self.assertEqual(result.shape, (128, 200))

    def test_scalar_scale(self):
        from axolotl.kernels.quantize import dequantize_fp8

        W = torch.ones(64, 64, dtype=torch.bfloat16).to(torch.float8_e4m3fn)
        scale_inv = torch.tensor(2.0, dtype=torch.bfloat16)
        result = dequantize_fp8(W, scale_inv)
        self.assertEqual(result.shape, (64, 64))


class TestLoraFP8Guard(unittest.TestCase):
    """Tests that get_lora_parameters only uses weight_scale_inv for FP8 weights."""

    def test_non_fp8_weight_skips_scale_inv(self):
        """Non-FP8 weight should NOT pick up weight_scale_inv as quant_state."""
        from axolotl.kernels.lora import get_lora_parameters

        proj = MagicMock()
        proj.disable_adapters = True
        base_layer = MagicMock(spec=[])  # empty spec to control attrs precisely

        # Use a real tensor for weight (bf16, no quant_state attr)
        base_layer.weight = torch.randn(64, 64, dtype=torch.bfloat16)
        base_layer.bias = None
        base_layer.weight_scale_inv = torch.ones(1)  # should NOT be used for bf16

        proj.base_layer = base_layer

        W, b, quant_state, A, B, s = get_lora_parameters(proj)
        # quant_state should be None since weight is bf16, not FP8
        self.assertIsNone(quant_state)

    def test_fp8_weight_uses_scale_inv(self):
        """FP8 weight should pick up weight_scale_inv as quant_state."""
        from axolotl.kernels.lora import get_lora_parameters

        proj = MagicMock()
        proj.disable_adapters = True
        base_layer = MagicMock()
        proj.base_layer = base_layer

        # FP8 weight
        base_layer.weight = torch.randn(64, 64, dtype=torch.bfloat16).to(
            torch.float8_e4m3fn
        )
        base_layer.bias = None
        scale_inv = torch.ones(1)
        base_layer.weight_scale_inv = scale_inv

        W, b, quant_state, A, B, s = get_lora_parameters(proj)
        self.assertIs(quant_state, scale_inv)


class TestValidateQuantPatchRestore(unittest.TestCase):
    """Test that validate_quantization_for_training is restored after trainer creation."""

    def test_patch_restored_on_success(self):
        """Monkeypatch should be restored even after successful trainer creation."""
        import transformers.trainer as _trainer_module

        original = _trainer_module.validate_quantization_for_training

        # After the build() method runs, original should be restored.
        # We can't easily test the full build(), but we can test the pattern.
        _orig = _trainer_module.validate_quantization_for_training
        _trainer_module.validate_quantization_for_training = lambda model: None
        try:
            pass  # simulate trainer_cls() succeeding
        finally:
            _trainer_module.validate_quantization_for_training = _orig

        self.assertIs(_trainer_module.validate_quantization_for_training, original)

    def test_patch_restored_on_error(self):
        """Monkeypatch should be restored even if trainer creation raises."""
        import transformers.trainer as _trainer_module

        original = _trainer_module.validate_quantization_for_training

        _orig = _trainer_module.validate_quantization_for_training
        _trainer_module.validate_quantization_for_training = lambda model: None
        try:
            raise ValueError("test error")
        except ValueError:
            pass
        finally:
            _trainer_module.validate_quantization_for_training = _orig

        self.assertIs(_trainer_module.validate_quantization_for_training, original)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/core/test_builders.py
================================================
"""Unit tests for axolotl.core.builders"""

import sys
from pathlib import Path
from unittest.mock import MagicMock, patch

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.core.builders import HFCausalTrainerBuilder, HFRLTrainerBuilder
from axolotl.loaders import ModelLoader, load_tokenizer
from axolotl.utils.config import normalize_config
from axolotl.utils.data import prepare_preference_datasets
from axolotl.utils.dict import DictDefault
from axolotl.utils.schemas.enums import RLType

from tests.constants import ALPACA_MESSAGES_CONFIG_REVISION


@pytest.fixture(name="base_cfg")
def fixture_base_cfg():
    """
    Base config with all common arguments between SFT and RLHF
    """
    cfg = DictDefault(
        {
            # Model and tokenizer settings
            "base_model": "HuggingFaceTB/SmolLM2-135M-Instruct",
            "sequence_len": 2048,
            "model_config_type": "llama",  # example type
            # Basic training settings
            "micro_batch_size": 2,
            "eval_batch_size": 2,
            "num_epochs": 1,
            "gradient_accumulation_steps": 1,
            "max_steps": 100,
            "val_set_size": 0,
            # Optimizer settings
            "optimizer": "adamw_torch_fused",
            "learning_rate": 0.00005,
            "weight_decay": 0.01,
            "adam_beta1": 0.998,
            "adam_beta2": 0.9,
            "adam_epsilon": 0.00001,
            "max_grad_norm": 1.0,
            # LR scheduler settings
            "lr_scheduler": "cosine",
            "lr_scheduler_kwargs": {"foo": "bar"},
            "warmup_steps": 10,
            "warmup_ratio": None,
            "cosine_min_lr_ratio": 0.1,
            "cosine_constant_lr_ratio": 0.2,
            # Checkpointing and saving
            "save_steps": 100,
            "output_dir": "./model-out",
            "save_total_limit": 4,
            "save_only_model": False,
            # Hardware/performance settings
            "gradient_checkpointing": False,
            "gradient_checkpointing_kwargs": {"use_reentrant": False},
            "dataloader_num_workers": 1,
            "dataloader_pin_memory": True,
            "dataloader_prefetch_factor": 2,
            "context_parallel_size": 1,
            "tensor_parallel_size": 1,
            # Dtype
            "fp16": False,
            "bf16": False,
            "tf32": False,
            # Logging and evaluation
            "logging_steps": 10,
            "eval_steps": 50,
            "eval_strategy": "steps",
            "save_strategy": "steps",
            "include_tokens_per_second": True,
            # Other common settings
            "seed": 42,
            "remove_unused_columns": True,
            "ddp_timeout": 1800,
            "ddp_bucket_cap_mb": 25,
            "ddp_broadcast_buffers": False,
            "dataset_num_proc": 4,
        }
    )

    normalize_config(cfg)
    return cfg


@pytest.fixture(name="dpo_cfg")
def fixture_dpo_cfg(base_cfg):
    cfg = base_cfg.copy()
    cfg.update(
        {
            "rl": RLType.DPO,
            "dpo_use_weighting": True,
            "dpo_label_smoothing": 0.1,
            "beta": 0.1,  # DPO beta
        }
    )
    return cfg


@pytest.fixture(name="orpo_cfg")
def fixture_orpo_cfg(base_cfg):
    cfg = base_cfg.copy()
    cfg.update(
        {
            "rl": RLType.ORPO,
            "orpo_alpha": 0.1,
            "max_prompt_len": 512,
        }
    )
    return cfg


@pytest.fixture(name="kto_cfg")
def fixture_kto_cfg(base_cfg):
    cfg = base_cfg.copy()
    cfg.update(
        {
            "rl": RLType.KTO,
            "kto_desirable_weight": 1.0,
            "kto_undesirable_weight": 1.0,
            "max_prompt_len": 512,
        }
    )
    return cfg


@pytest.fixture(name="grpo_cfg")
def fixture_grpo_cfg(base_cfg):
    cfg = base_cfg.copy()
    cfg.update(
        {
            "rl": RLType.GRPO,
            "trl": DictDefault(
                {
                    "beta": 0.001,
                    "max_completion_length": 256,
                    "use_vllm": False,  # run on CPU
                    # "vllm_device": "auto",
                    # "vllm_gpu_memory_utilization": 0.15,
                    "num_generations": 4,
                    "reward_funcs": ["rewards.rand_reward_func"],
                }
            ),
            # Must be evenly divisible by num_generations
            "micro_batch_size": 4,
            "datasets": [
                {
                    "path": "openai/gsm8k",
                    "name": "main",
                    "split": "train[:1%]",
                }
            ],
        }
    )
    return DictDefault(cfg)


@pytest.fixture(name="ipo_cfg")
def fixture_ipo_cfg(base_cfg):
    cfg = base_cfg.copy()
    cfg.update(
        {
            "rl": RLType.IPO,
            "dpo_label_smoothing": 0,
            "beta": 0.1,
        }
    )
    return cfg


@pytest.fixture(name="simpo_cfg")
def fixture_simpo_cfg(base_cfg):
    cfg = base_cfg.copy()
    cfg.update(
        {
            "rl": RLType.SIMPO,
            "rl_beta": 0.2,
            "cpo_alpha": 0.9,
            "simpo_gamma": 0.4,
        }
    )
    return cfg


@pytest.fixture(name="sft_cfg")
def fixture_sft_cfg(base_cfg):
    cfg = base_cfg.copy()
    cfg.update(
        {
            "rl": None,
            "sample_packing": False,
            "eval_sample_packing": False,
            "flash_attention": False,
        }
    )
    return cfg


@pytest.fixture(name="rm_cfg")
def fixture_rm_cfg(sft_cfg):
    cfg = sft_cfg.copy()
    cfg.update(
        DictDefault(
            {
                "reward_model": True,
                "datasets": [
                    {
                        "path": "argilla/distilabel-intel-orca-dpo-pairs",
                        "type": "bradley_terry.chat_template",
                        "split": "train[:1%]",
                    }
                ],
            }
        )
    )
    return cfg


@pytest.fixture(name="prm_cfg")
def fixture_prm_cfg(sft_cfg):
    cfg = sft_cfg.copy()
    cfg.update(
        DictDefault(
            {
                "process_reward_model": True,
                "datasets": [
                    {
                        "path": "trl-lib/math_shepherd",
                        "type": "stepwise_supervised",
                        "split": "train[:1%]",
                    }
                ],
            }
        )
    )
    return cfg


@pytest.fixture(name="tokenizer")
def fixture_tokenizer(base_cfg):
    return load_tokenizer(base_cfg)


@pytest.fixture(name="model")
def fixture_model(base_cfg, tokenizer):
    model, _ = ModelLoader(base_cfg, tokenizer).load()
    return model


class TestHFRLTrainerBuilder:
    """
    TestCase class for RLHF trainer builders
    """

    def _test_common_training_arguments(self, training_arguments, rl: str):
        """Helper to test common arguments across all variants"""
        # Basic training settings
        if rl == "grpo":
            # grpo_cfg's micro_batch_size is diff from others
            assert training_arguments.per_device_train_batch_size == 4
        else:
            assert training_arguments.per_device_train_batch_size == 2
        assert training_arguments.gradient_accumulation_steps == 1
        assert training_arguments.max_steps == 100

        # Optimizer settings
        assert training_arguments.learning_rate == 0.00005
        assert training_arguments.weight_decay == 0.01
        assert training_arguments.adam_beta1 == 0.998
        assert training_arguments.adam_beta2 == 0.9
        assert training_arguments.adam_epsilon == 0.00001
        assert training_arguments.max_grad_norm == 1.0

        # LR scheduler settings
        assert training_arguments.lr_scheduler_type == "cosine"
        assert training_arguments.warmup_steps == 10
        assert training_arguments.cosine_min_lr_ratio == 0.1
        assert training_arguments.cosine_constant_lr_ratio == 0.2

        # Other settings
        assert training_arguments.dataloader_num_workers == 1
        assert training_arguments.dataloader_pin_memory is True

        # TODO(wing): restore once trl releases 0.22.0
        # assert training_arguments.gradient_checkpointing is True

    def test_dpo_training_arguments(self, dpo_cfg, model, tokenizer):
        builder = HFRLTrainerBuilder(dpo_cfg, model, tokenizer)
        training_arguments, _ = builder._build_training_arguments(100)

        self._test_common_training_arguments(training_arguments, rl=dpo_cfg.rl)
        # DPO specific
        assert training_arguments.beta == 0.1
        assert hasattr(training_arguments, "use_weighting")
        assert training_arguments.use_weighting is True
        assert training_arguments.label_smoothing == 0.1

    def test_orpo_training_arguments(self, orpo_cfg, model, tokenizer):
        builder = HFRLTrainerBuilder(orpo_cfg, model, tokenizer)
        training_arguments, _ = builder._build_training_arguments(100)

        self._test_common_training_arguments(training_arguments, rl=orpo_cfg.rl)
        # ORPO specific
        assert training_arguments.beta == 0.1  # maps from orpo_alpha

    def test_kto_training_arguments(self, kto_cfg, model, tokenizer):
        builder = HFRLTrainerBuilder(kto_cfg, model, tokenizer)
        training_arguments, _ = builder._build_training_arguments(100)

        self._test_common_training_arguments(training_arguments, rl=kto_cfg.rl)
        # KTO specific
        assert training_arguments.desirable_weight == 1.0
        assert training_arguments.undesirable_weight == 1.0

    def _write_rewards_file(self, rewards_dir: Path):
        """
        Writes reward function to local tmp path to be loaded on trainer building
        """
        # Create rewards.py in a directory we can import from
        rewards_dir.mkdir()
        rewards_file = rewards_dir / "rewards.py"
        rewards_file.write_text(
            """import random
def rand_reward_func(prompts, completions) -> list[float]:
    return [random.uniform(0, 1) for _ in completions]
"""
        )

    def test_grpo_training_arguments(self, grpo_cfg, model, tokenizer, tmp_path):
        rewards_dir = tmp_path / "rewards_test"
        self._write_rewards_file(rewards_dir)

        # Add the directory to Python path so we can import the module
        sys.path.insert(0, str(rewards_dir))

        try:
            builder = HFRLTrainerBuilder(grpo_cfg, model, tokenizer)
            training_arguments, _ = builder._build_training_arguments(100)
            builder.train_dataset = MagicMock()

            self._test_common_training_arguments(training_arguments, rl=grpo_cfg.rl)
            # GRPO specific
            assert training_arguments.beta == 0.001
            assert training_arguments.max_completion_length == 256
            assert training_arguments.use_vllm is False
            # assert training_arguments.vllm_device == "auto"
            # assert training_arguments.vllm_gpu_memory_utilization == 0.15
            assert training_arguments.num_generations == 4

            # Test trainer creation to verify reward_funcs
            trainer = builder.build(100)

            # Verify reward functions are properly loaded
            assert len(trainer.reward_funcs) == 1
            assert trainer.reward_funcs[0].__module__ == "rewards"
            assert trainer.reward_funcs[0].__name__ == "rand_reward_func"
        finally:
            # remove imported module from path
            if str(rewards_dir) in sys.path:
                sys.path.remove(str(rewards_dir))

    def test_ipo_training_arguments(self, ipo_cfg, model, tokenizer):
        builder = HFRLTrainerBuilder(ipo_cfg, model, tokenizer)
        training_arguments, _ = builder._build_training_arguments(100)

        self._test_common_training_arguments(training_arguments, rl=ipo_cfg.rl)
        # IPO specific
        assert training_arguments.beta == 0.1
        assert training_arguments.loss_type == ["ipo"]
        assert training_arguments.label_smoothing == 0

    def test_simpo_training_arguments(self, simpo_cfg, model, tokenizer):
        builder = HFRLTrainerBuilder(simpo_cfg, model, tokenizer)
        training_arguments, _ = builder._build_training_arguments(100)

        self._test_common_training_arguments(training_arguments, rl=simpo_cfg.rl)
        # SIMPO specific
        assert training_arguments.beta == 0.2
        assert training_arguments.cpo_alpha == 0.9
        assert training_arguments.simpo_gamma == 0.4

    @pytest.mark.parametrize(
        ("cfg_string", "dataset_name"),
        [
            (
                "dpo_cfg",
                "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
            ),
            (
                "ipo_cfg",
                "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
            ),
            (
                "grpo_cfg",
                "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
            ),
            ("orpo_cfg", None),  # don't use fixture for orpo to use smaller split
            ("kto_cfg", None),  # no fixture for kto
            # (
            #     "simpo_cfg",
            #     "dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff",
            # ),
        ],
    )
    def test_custom_optimizer_cls_and_kwargs(
        self,
        request,
        cfg_string,
        dataset_name,
        tmp_path,
        model,
        tokenizer,
    ):
        cfg = request.getfixturevalue(cfg_string)

        builder = HFRLTrainerBuilder(cfg, model, tokenizer)
        cfg["optimizer"] = "muon"

        if cfg_string in ["dpo_cfg", "ipo_cfg", "grpo_cfg", "simpo_cfg"]:
            cfg["datasets"] = [DictDefault(ALPACA_MESSAGES_CONFIG_REVISION)]
        elif cfg_string == "kto_cfg":
            cfg["datasets"] = [
                DictDefault(
                    {
                        "path": "argilla/ultrafeedback-binarized-preferences-cleaned-kto",
                        "type": "llama3.ultra",
                        "split": "train[:1%]",
                    }
                )
            ]
        elif cfg_string == "orpo_cfg":
            cfg["datasets"] = [
                DictDefault(
                    {
                        "path": "argilla/ultrafeedback-binarized-preferences-cleaned",
                        "type": "chat_template.argilla",
                        "split": "train[:1%]",
                    }
                )
            ]
        else:
            raise ValueError(f"Unhandled cfg_string: {cfg_string}")
        cfg["dataset_num_proc"] = 4

        if cfg_string == "grpo_cfg":
            rewards_dir = tmp_path / "rewards_test"
            self._write_rewards_file(rewards_dir)

            # Add the directory to Python path so we can import the module
            sys.path.insert(0, str(rewards_dir))

        try:
            # Only use mock for the commented out configs
            if dataset_name is not None:
                with patch(
                    "axolotl.utils.data.rl.load_dataset_with_config"
                ) as mock_load_dataset:
                    mock_load_dataset.return_value = request.getfixturevalue(
                        dataset_name
                    )
                    train_dataset, eval_dataset = prepare_preference_datasets(
                        cfg, tokenizer
                    )
            else:
                # Load actual datasets for orpo_cfg and kto_cfg
                train_dataset, eval_dataset = prepare_preference_datasets(
                    cfg, tokenizer
                )

            builder.train_dataset = train_dataset
            builder.eval_dataset = eval_dataset

            trainer = builder.build(100)

            assert trainer.optimizer_cls_and_kwargs is not None

            from axolotl.contribs.mit.muon import MuonOptimizerFactory
            from axolotl.contribs.mit.muon.muon import Muon

            optimizer_cls, optimizer_kwargs = trainer.optimizer_cls_and_kwargs
            assert optimizer_cls is MuonOptimizerFactory
            assert optimizer_kwargs["lr"] == 0.00005
            assert optimizer_kwargs["weight_decay"] == 0.01
            assert optimizer_kwargs["betas"] == (0.998, 0.9)
            assert optimizer_kwargs["eps"] == 0.00001

            # Ensure optimizer is created with correct class
            optim = trainer.create_optimizer()
            assert isinstance(optim, Muon)

        finally:
            # remove imported module from path
            if cfg_string == "grpo_cfg" and str(rewards_dir) in sys.path:
                sys.path.remove(str(rewards_dir))


class TestHFCausalTrainerBuilder:
    """
    TestCase class for SFT trainer builder
    """

    def test_training_arguments(self, sft_cfg, model, tokenizer):
        builder = HFCausalTrainerBuilder(sft_cfg, model, tokenizer)
        trainer = builder.build(100)
        training_arguments = trainer.args

        # Test common arguments
        assert training_arguments.per_device_train_batch_size == 2
        assert training_arguments.gradient_accumulation_steps == 1
        assert training_arguments.max_steps == 100

        assert training_arguments.learning_rate == 0.00005
        assert training_arguments.weight_decay == 0.01
        assert training_arguments.adam_beta1 == 0.998
        assert training_arguments.adam_beta2 == 0.9
        assert training_arguments.adam_epsilon == 0.00001
        assert training_arguments.max_grad_norm == 1.0

        assert training_arguments.lr_scheduler_type == "cosine"
        assert training_arguments.warmup_steps == 10
        assert training_arguments.cosine_min_lr_ratio == 0.1

        assert training_arguments.dataloader_num_workers == 1
        assert training_arguments.dataloader_pin_memory is True
        assert training_arguments.gradient_checkpointing is False

        # SFT specific
        assert training_arguments.sample_packing is False
        assert training_arguments.eval_sample_packing is False

    @pytest.mark.parametrize(
        "cfg_string",
        [
            "sft_cfg",
            "rm_cfg",
            "prm_cfg",
        ],
    )
    def test_builder_w_rm_trainers(self, request, cfg_string, model, tokenizer):
        cfg = request.getfixturevalue(cfg_string)
        builder = HFCausalTrainerBuilder(cfg, model, tokenizer)
        cfg["optimizer"] = "muon"

        # need to load datasets for reward model and process reward model trainer
        if cfg_string in ["rm_cfg", "prm_cfg"]:
            dataset_meta = load_datasets(cfg=cfg)

            builder.train_dataset = dataset_meta.train_dataset
            builder.eval_dataset = dataset_meta.eval_dataset

        trainer = builder.build(100)

        assert trainer.optimizer_cls_and_kwargs is not None

        from axolotl.contribs.mit.muon import MuonOptimizerFactory
        from axolotl.contribs.mit.muon.muon import Muon

        optimizer_cls, optimizer_kwargs = trainer.optimizer_cls_and_kwargs
        assert optimizer_cls is MuonOptimizerFactory
        assert optimizer_kwargs["lr"] == 0.00005
        assert optimizer_kwargs["weight_decay"] == 0.01
        assert optimizer_kwargs["betas"] == (0.998, 0.9)
        assert optimizer_kwargs["eps"] == 0.00001

        # Ensure optimizer is created with correct class
        optim = trainer.create_optimizer()
        assert isinstance(optim, Muon)


class TestTrainerClsPlugin:
    """
    TestCase class for trainer builder with plugin
    """

    def test_trainer_cls_is_not_none_with_plugin(self, kto_cfg, model, tokenizer):
        """
        Test that the trainer cls is not none with plugin

        Fixes #2693
        """
        cfg = kto_cfg.copy()
        cfg.plugins = ["axolotl.integrations.liger.LigerPlugin"]

        # Expected AttributeError as we don't pass regular model configs to RL trainer builder
        # If it throws `TypeError: None is not a callable object`, trainer_cls could be None
        try:
            builder = HFRLTrainerBuilder(cfg, model, tokenizer)

            builder.build(100)
        except TypeError as e:
            # Error raised if trainer_cls is None
            assert "'tuple' object has no attribute 'config'" not in str(e)
        except Exception:
            # Another error happens, so we passed trainer_cls to builder
            pass


================================================
FILE: tests/e2e/.gitignore
================================================
last_run_prepared


================================================
FILE: tests/e2e/__init__.py
================================================


================================================
FILE: tests/e2e/integrations/test_cut_cross_entropy.py
================================================
"""
Simple end-to-end test for Cut Cross Entropy integration
"""

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils import get_pytorch_version
from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
from axolotl.utils.dict import DictDefault

from tests.e2e.utils import check_model_output_exists


@pytest.fixture()
def min_cfg(temp_dir):
    return {
        "base_model": "HuggingFaceTB/SmolLM2-135M",
        "plugins": [
            "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin",
        ],
        "cut_cross_entropy": True,
        "sequence_len": 1024,
        "val_set_size": 0.02,
        "special_tokens": {
            "pad_token": "<|endoftext|>",
        },
        "datasets": [
            {
                "path": "mhenrichsen/alpaca_2k_test",
                "type": "alpaca",
            },
        ],
        "num_epochs": 1,
        "micro_batch_size": 8,
        "gradient_accumulation_steps": 1,
        "learning_rate": 0.00001,
        "optimizer": "adamw_torch_fused",
        "output_dir": temp_dir,
        "lr_scheduler": "cosine",
        "max_steps": 10,
        "bf16": "auto",
        "save_first_step": False,
    }


class TestCutCrossEntropyIntegration:
    """
    e2e tests for cut_cross_entropy integration with Axolotl
    """

    def test_llama_w_cce(self, min_cfg, temp_dir):
        cfg = DictDefault(min_cfg)
        cfg = validate_config(cfg)
        prepare_plugins(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        major, minor, _ = get_pytorch_version()
        if (major, minor) < (2, 4):
            with pytest.raises(ImportError):
                train(cfg=cfg, dataset_meta=dataset_meta)
        else:
            train(cfg=cfg, dataset_meta=dataset_meta)
            check_model_output_exists(temp_dir, cfg)

    def test_qwen2_w_cce(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "Qwen/Qwen2.5-0.5B",
                "plugins": [
                    "axolotl.integrations.cut_cross_entropy.CutCrossEntropyPlugin",
                ],
                "cut_cross_entropy": True,
                "sequence_len": 1024,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 1,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "output_dir": temp_dir,
                "lr_scheduler": "cosine",
                "max_steps": 10,
                "bf16": "auto",
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        prepare_plugins(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        major, minor, _ = get_pytorch_version()
        if (major, minor) < (2, 4):
            with pytest.raises(ImportError):
                train(cfg=cfg, dataset_meta=dataset_meta)
        else:
            train(cfg=cfg, dataset_meta=dataset_meta)
            check_model_output_exists(temp_dir, cfg)

    @pytest.mark.parametrize(
        "attention_type",
        [
            "flash_attention",
            "sdp_attention",
            # "xformers_attention",
        ],
    )
    def test_llama_w_cce_and_attention(self, min_cfg, temp_dir, attention_type):
        cfg = DictDefault(
            min_cfg
            | {
                attention_type: True,
            }
        )
        cfg = validate_config(cfg)
        prepare_plugins(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        major, minor, _ = get_pytorch_version()
        if (major, minor) < (2, 4):
            with pytest.raises(ImportError):
                train(cfg=cfg, dataset_meta=dataset_meta)
        else:
            train(cfg=cfg, dataset_meta=dataset_meta)
            check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/integrations/test_fp8.py
================================================
"""
Simple end-to-end smoke tests for FP8 mixed precision training
"""

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from tests.e2e.utils import check_model_output_exists, require_torch_2_7_0


class FP8IntegrationTestCase:
    """
    e2e smoke tests for FP8 mixed precision training with Axolotl
    """

    @require_torch_2_7_0
    def test_fp8_single_gpu_smoke(self, temp_dir):
        """Smoke test for single GPU FP8 + torch.compile training"""

        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "trust_remote_code": True,
                "sequence_len": 512,
                "val_set_size": 0.05,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 3,  # Very short smoke test
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "sdp_attention": True,
                "pad_to_seq_len": True,
                "sample_packing": True,
                "fp8": True,
                "torch_compile": True,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/integrations/test_hooks.py
================================================
"""
e2e tests to make sure all the hooks are fired on the plugin
"""

import os
from pathlib import Path

from axolotl.common.datasets import load_datasets
from axolotl.integrations.base import BasePlugin
from axolotl.train import train
from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
from axolotl.utils.dict import DictDefault

from tests.e2e.utils import check_model_output_exists


class LogHooksPlugin(BasePlugin):
    """
    fixture to capture in a log file each hook that was fired
    """

    base_dir = Path("/tmp/axolotl-log-hooks")

    def __init__(self):
        self.base_dir.mkdir(parents=True, exist_ok=True)
        try:
            os.remove(self.base_dir.joinpath("plugin_hooks.log"))
        except FileNotFoundError:
            pass

    def post_trainer_create(self, cfg, trainer):
        with open(
            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
        ) as f:
            f.write("post_trainer_create\n")

    def pre_model_load(self, cfg):
        with open(
            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
        ) as f:
            f.write("pre_model_load\n")

    def post_model_build(self, cfg, model):
        with open(
            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
        ) as f:
            f.write("post_model_build\n")

    def pre_lora_load(self, cfg, model):
        with open(
            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
        ) as f:
            f.write("pre_lora_load\n")

    def post_lora_load(self, cfg, model):
        with open(
            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
        ) as f:
            f.write("post_lora_load\n")

    def post_model_load(self, cfg, model):
        with open(
            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
        ) as f:
            f.write("post_model_load\n")

    def create_optimizer(self, cfg, trainer):
        with open(
            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
        ) as f:
            f.write("create_optimizer\n")

    def get_trainer_cls(self, cfg):
        with open(
            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
        ) as f:
            f.write("get_trainer_cls\n")

    def create_lr_scheduler(self, cfg, trainer, optimizer, num_training_steps):
        with open(
            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
        ) as f:
            f.write("create_lr_scheduler\n")

    def add_callbacks_pre_trainer(self, cfg, model):
        with open(
            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
        ) as f:
            f.write("add_callbacks_pre_trainer\n")
        return []

    def add_callbacks_post_trainer(self, cfg, trainer):
        with open(
            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
        ) as f:
            f.write("add_callbacks_post_trainer\n")
        return []

    def post_train(self, cfg, model):
        with open(
            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
        ) as f:
            f.write("post_train\n")

    def post_train_unload(self, cfg):
        with open(
            self.base_dir.joinpath("plugin_hooks.log"), "a", encoding="utf-8"
        ) as f:
            f.write("post_train_unload\n")


class TestPluginHooks:
    """
    e2e tests to make sure all the hooks are fired during the training
    """

    def test_plugin_hooks(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "plugins": [
                    "tests.e2e.integrations.test_hooks.LogHooksPlugin",
                ],
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "flash_attention": True,
                "bf16": "auto",
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        prepare_plugins(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

        with open(
            "/tmp/axolotl-log-hooks" + "/plugin_hooks.log", "r", encoding="utf-8"
        ) as f:
            file_contents = f.readlines()
            file_contents = "\n".join(file_contents)
            assert "post_trainer_create" in file_contents
            assert "pre_model_load" in file_contents
            assert "post_model_build" in file_contents
            assert "pre_lora_load" in file_contents
            assert "post_lora_load" in file_contents
            assert "post_model_load" in file_contents
            # assert "create_optimizer" in file_contents  # not implemented yet
            assert "get_trainer_cls" in file_contents
            assert "create_lr_scheduler" in file_contents
            assert "add_callbacks_pre_trainer" in file_contents
            assert "add_callbacks_post_trainer" in file_contents
            assert "post_train" in file_contents
            # assert "post_train_unload" in file_contents  # not called from test train call

        try:
            os.remove("/tmp/axolotl-log-hooks" + "/plugin_hooks.log")
        except FileNotFoundError:
            pass


================================================
FILE: tests/e2e/integrations/test_kd.py
================================================
"""
e2e tests for kd trainer support in Axolotl
"""

from pathlib import Path

import pytest
import yaml
from accelerate.test_utils import execute_subprocess_async, get_torch_dist_unique_port

from axolotl.utils.dict import DictDefault

from tests.e2e.utils import check_tensorboard, require_torch_2_5_1


@pytest.fixture(name="kd_min_cfg")
def min_cfg(temp_dir):
    return {
        "base_model": "Qwen/Qwen3-0.6B",
        "tokenizer_config": "winglian/qwen3-14b-math",
        "plugins": [
            "axolotl.integrations.kd.KDPlugin",
            "axolotl.integrations.liger.LigerPlugin",
        ],
        "liger_rms_norm": True,
        "liger_glu_activation": True,
        "torch_compile": True,
        "chat_template": "qwen3",
        "kd_trainer": True,
        "kd_ce_alpha": 0.1,
        "kd_alpha": 0.9,
        "kd_temperature": 1.0,
        "kd_beta": 0.0,
        "kd_normalize_topk": True,
        "dataloader_prefetch_factor": 8,
        "dataloader_num_workers": 4,
        "dataloader_pin_memory": True,
        "datasets": [
            {
                "path": "winglian/OpenThoughts-114k-math-correct-qwen3-14b-math-prepared-topk128-normalized",
                "type": "chat_template",
                "split": "train",
                "split_thinking": True,
                "eot_tokens": ["<|im_end|>"],
                "data_files": ["train/batch-000000.parquet"],
            },
        ],
        "skip_prepare_dataset": True,
        "val_set_size": 0.0,
        "sequence_len": 2048,
        "sample_packing": True,
        "pad_to_sequence_len": True,
        "gradient_accumulation_steps": 2,
        "micro_batch_size": 1,
        "num_epochs": 1,
        "optimizer": "adamw_8bit",
        "lr_scheduler": "cosine",
        "learning_rate": 0.00001,
        "bf16": "auto",
        "gradient_checkpointing": True,
        "flash_attention": True,
        "special_tokens": {
            "pad_token": "<|end_of_text|>",
            "eos_token": "<|eot_id|>",
        },
        "max_steps": 5,
        "output_dir": temp_dir,
        "use_tensorboard": True,
        "save_first_step": False,
    }


class TestKnowledgeDistillation:
    """
    Test case for Knowledge Distillation
    """

    # While this will run on torch 2.4.x without torch_compile enabled
    # the VRAM requirement is higher than what is available in CI
    @require_torch_2_5_1
    def test_llama_kd(self, temp_dir, kd_min_cfg):
        cfg = DictDefault(kd_min_cfg)

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "1",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        assert (Path(temp_dir) / "model.safetensors").exists()
        check_tensorboard(
            temp_dir + "/runs", "train/loss", 1.4, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
        "load_in_8bit",
        [True, False],
    )
    def test_llama_lora_kd(self, temp_dir, kd_min_cfg, load_in_8bit):
        cfg = DictDefault(
            {
                "load_in_8bit": load_in_8bit,
                "torch_compile": False,
                "adapter": "lora",
                "peft_use_dora": True,
                "lora_target_linear": True,
                "lora_r": 16,
                "lora_alpha": 32,
                "lora_dropout": 0.0,
                "lora_modules_to_save": ["embed_tokens", "lm_head"],
                "lora_mlp_kernel": False,
                "lora_qkv_kernel": False,
                "lora_o_kernel": False,
            }
            | kd_min_cfg
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "1",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )
        assert (Path(temp_dir) / "adapter_model.safetensors").exists()
        check_tensorboard(
            temp_dir + "/runs", "train/loss", 1.2, "Train Loss (%s) is too high"
        )


================================================
FILE: tests/e2e/integrations/test_liger.py
================================================
"""
Simple end-to-end test for Liger integration
"""

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
from axolotl.utils.dict import DictDefault

from tests.e2e.utils import check_model_output_exists, require_torch_2_4_1


class LigerIntegrationTestCase:
    """
    e2e tests for liger integration with Axolotl
    """

    @require_torch_2_4_1
    def test_llama_wo_flce(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "plugins": [
                    "axolotl.integrations.liger.LigerPlugin",
                ],
                "liger_rope": True,
                "liger_rms_norm": True,
                "liger_glu_activation": True,
                "liger_cross_entropy": True,
                "liger_fused_linear_cross_entropy": False,
                "sequence_len": 1024,
                "val_set_size": 0.05,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "bf16": "auto",
                "max_steps": 5,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        prepare_plugins(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    @require_torch_2_4_1
    @pytest.mark.parametrize(
        "liger_use_token_scaling",
        [True, False],
    )
    def test_llama_w_flce(self, temp_dir, liger_use_token_scaling):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "plugins": [
                    "axolotl.integrations.liger.LigerPlugin",
                ],
                "liger_rope": True,
                "liger_rms_norm": True,
                "liger_glu_activation": True,
                "liger_cross_entropy": False,
                "liger_fused_linear_cross_entropy": True,
                "liger_use_token_scaling": liger_use_token_scaling,
                "sequence_len": 1024,
                "val_set_size": 0.05,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "bf16": "auto",
                "max_steps": 5,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        prepare_plugins(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/integrations/test_llm_compressor.py
================================================
"""
E2E smoke tests for LLMCompressorPlugin integration
"""

from pathlib import Path

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, prepare_plugins, validate_config
from axolotl.utils.dict import DictDefault

from tests.e2e.utils import (
    check_model_output_exists,
    require_llmcompressor,
    require_torch_2_4_1,
)

MODELS = [
    "nm-testing/llama2.c-stories42M-pruned2.4-compressed",
    "nm-testing/llama2.c-stories42M-gsm8k-sparse-only-compressed",
]


@pytest.mark.parametrize(
    "base_model", MODELS, ids=["no-checkpoint-recipe", "with-checkpoint-recipe"]
)
@pytest.mark.parametrize(
    "save_compressed", [True, False], ids=["save_compressed", "save_uncompressed"]
)
class TestLLMCompressorIntegration:
    """
    e2e tests for axolotl.integrations.llm_compressor.LLMCompressorPlugin
    """

    @require_llmcompressor
    @require_torch_2_4_1
    def test_llmcompressor_plugin(
        self, temp_dir, base_model: str, save_compressed: bool
    ):
        from llmcompressor import active_session

        # core cfg
        cfg = DictDefault(
            {
                "base_model": base_model,
                "plugins": ["axolotl.integrations.llm_compressor.LLMCompressorPlugin"],
                "sequence_len": 1024,
                "val_set_size": 0.05,
                "special_tokens": {"pad_token": "<|endoftext|>"},
                "datasets": [{"path": "mhenrichsen/alpaca_2k_test", "type": "alpaca"}],
                "num_epochs": 1,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 1e-5,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "bf16": "auto",
                "max_steps": 5,
                "llmcompressor": {
                    "recipe": {
                        "finetuning_stage": {
                            "finetuning_modifiers": {
                                "ConstantPruningModifier": {
                                    "targets": [
                                        "re:.*q_proj.weight",
                                        "re:.*k_proj.weight",
                                        "re:.*v_proj.weight",
                                        "re:.*o_proj.weight",
                                        "re:.*gate_proj.weight",
                                        "re:.*up_proj.weight",
                                        "re:.*down_proj.weight",
                                    ],
                                    "start": 0,
                                },
                            },
                        },
                    },
                    "save_compressed": save_compressed,
                },
                "save_first_step": False,
            }
        )

        prepare_plugins(cfg)
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        try:
            train(cfg=cfg, dataset_meta=dataset_meta)
            check_model_output_exists(temp_dir, cfg)
            _check_llmcompressor_model_outputs(temp_dir, save_compressed)
        finally:
            active_session().reset()


def _check_llmcompressor_model_outputs(temp_dir, save_compressed):
    if save_compressed:
        assert (Path(temp_dir) / "recipe.yaml").exists()

        from compressed_tensors import ModelCompressor
        from compressed_tensors.config import Sparse24BitMaskConfig

        compressor = ModelCompressor.from_pretrained(temp_dir)
        assert compressor is not None
        assert isinstance(compressor.sparsity_config, Sparse24BitMaskConfig)


================================================
FILE: tests/e2e/integrations/test_scattermoe_lora_kernels.py
================================================
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Axolotl AI
# Licensed under the Apache License, Version 2.0

"""
Tests for ScatterMoE + LoRA Fused Kernels
==========================================

Tests verify correctness of:
1. Forward pass: fused kernel matches naive PyTorch reference
2. Backward pass: gradients for LoRA A, B, and input match reference
3. Frozen weights: expert weight gradients are correctly skipped
4. Various configurations: top-k, grouped_in/out, with/without bias
5. Numerical stability: bf16/fp16 outputs within tolerance of fp32 reference
6. HFScatterMoEGatedMLP with sigmoid routing (GLM/DeepSeek/MiniMax M2)

Test strategy:
- Reference implementation uses pure PyTorch ops (no Triton)
- ScatterMoE routing (flatten_sort_count) is shared between reference and kernel
- Tolerances account for tf32 accumulation in Triton kernels
"""

from types import SimpleNamespace

import pytest
import torch

# Skip all tests if CUDA is not available
pytestmark = pytest.mark.skipif(
    not torch.cuda.is_available(),
    reason="CUDA required for Triton kernels",
)

_SMOE = "axolotl.integrations.kernels.libs.scattermoe_lora"


# =============================================================================
# Helpers
# =============================================================================


def flatten_sort_count_ref(expert_idxs: torch.Tensor, num_experts: int):
    """Reference implementation of routing."""
    with torch.no_grad():
        flat = expert_idxs.flatten()
        sorted_expert_idxs, sorted_scattered_idxs = torch.sort(flat)
        counts = flat.bincount(minlength=num_experts)
        offsets = counts.cumsum(-1)
    return sorted_expert_idxs, sorted_scattered_idxs, offsets


def reference_parallel_linear_lora(
    X,
    W,
    k,
    sorted_expert_idxs,
    sorted_scattered_idxs,
    lora_A,
    lora_B,
    scaling,
    x_grouped=False,
    y_grouped=False,
    bias=None,
):
    """
    Pure PyTorch reference for: Y[i] = X[i] @ W[e] + scaling * (X[i] @ A[e]^T) @ B[e]^T + b[e]

    Args:
        X: [M, K] input (token order)
        W: [E, K, N] expert weights
        sorted_expert_idxs: [M*k] expert assignments (sorted)
        sorted_scattered_idxs: [M*k] original token indices (sorted)
        lora_A: [r*E, K] LoRA A weights
        lora_B: [N, r*E] LoRA B weights
        scaling: LoRA scaling factor
    """
    E, K, N = W.shape
    R = lora_A.size(0) // E
    L = sorted_expert_idxs.size(0)  # M * k

    output = torch.zeros(L, N, device=X.device, dtype=X.dtype)

    for i in range(L):
        e = sorted_expert_idxs[i].item()
        if x_grouped:
            x_i = X[i]
        else:
            token_idx = sorted_scattered_idxs[i].item() // k
            x_i = X[token_idx]

        w_e = W[e]  # [K, N]
        a_e = lora_A[e * R : (e + 1) * R, :]  # [r, K]
        b_e = lora_B[:, e * R : (e + 1) * R]  # [N, r]

        # Y = X @ W + scaling * (X @ A^T) @ B^T
        base = x_i @ w_e  # [N]
        lora = scaling * ((x_i @ a_e.T) @ b_e.T)  # [N]
        out_i = base + lora

        if bias is not None:
            out_i = out_i + bias[e]

        if y_grouped:
            output[i] = out_i
        else:
            output[sorted_scattered_idxs[i]] = out_i

    return output


def reference_lora_backward(
    grad_out,
    X,
    W,
    lora_A,
    lora_B,
    scaling,
    sorted_expert_idxs,
    sorted_scattered_idxs,
    expert_offsets,
    k,
    E,
):
    """
    Pure PyTorch reference for LoRA backward pass on grouped data.

    Returns:
        dX: [M*k, K] input gradient (in grouped order)
        dA: [r*E, K] LoRA A gradient
        dB: [N, r*E] LoRA B gradient
    """
    R = lora_A.size(0) // E

    dA = torch.zeros_like(lora_A)
    dB = torch.zeros_like(lora_B)
    dX = torch.zeros_like(X)

    prev_offset = 0
    for e in range(E):
        curr_offset = expert_offsets[e].item()
        if curr_offset > prev_offset:
            dy_e = grad_out[prev_offset:curr_offset]  # [M_e, N]
            x_e = X[prev_offset:curr_offset]  # [M_e, K]
            a_e = lora_A[e * R : (e + 1) * R, :]  # [r, K]
            b_e = lora_B[:, e * R : (e + 1) * R]  # [N, r]
            w_e = W[e]  # [K, N]

            # Input gradient: dX = dY @ W^T + scaling * (dY @ B) @ A
            dx_base = dy_e @ w_e.T  # [M_e, K]
            dy_b = dy_e @ b_e  # [M_e, r]
            dx_lora = scaling * (dy_b @ a_e)  # [M_e, K]
            dX[prev_offset:curr_offset] = dx_base + dx_lora

            # LoRA A gradient: dA = scaling * (dY @ B)^T @ X
            xa = x_e @ a_e.T  # [M_e, r]
            dA[e * R : (e + 1) * R, :] = scaling * (dy_b.T @ x_e)

            # LoRA B gradient: dB = scaling * dY^T @ (X @ A^T)
            dB[:, e * R : (e + 1) * R] = scaling * (dy_e.T @ xa)

        prev_offset = curr_offset

    return dX, dA, dB


def make_test_data(
    M=32,
    K=64,
    N=128,
    E=4,
    R=8,
    k=2,
    dtype=torch.float32,
    device="cuda",
    seed=42,
):
    """Create test data for ScatterMoE + LoRA tests."""
    torch.manual_seed(seed)

    X = torch.randn(M, K, device=device, dtype=dtype)
    W = torch.randn(E, K, N, device=device, dtype=dtype) * 0.02
    lora_A = torch.randn(R * E, K, device=device, dtype=dtype) * 0.01
    lora_B = torch.randn(N, R * E, device=device, dtype=dtype) * 0.01
    scaling = 0.5

    # Generate routing
    selected_experts = torch.randint(0, E, (M, k), device=device)
    sorted_expert_idxs, sorted_scattered_idxs, expert_offsets = flatten_sort_count_ref(
        selected_experts, E
    )

    return {
        "X": X,
        "W": W,
        "lora_A": lora_A,
        "lora_B": lora_B,
        "scaling": scaling,
        "k": k,
        "E": E,
        "R": R,
        "sorted_expert_idxs": sorted_expert_idxs,
        "sorted_scattered_idxs": sorted_scattered_idxs,
        "expert_offsets": expert_offsets,
    }


# =============================================================================
# Test: Forward Pass Correctness
# =============================================================================


class TestForwardPass:
    """Test forward pass of fused scatter2scatter_lora kernel."""

    def _run_forward_test(
        self, M, K, N, E, R, k, dtype=torch.float32, atol=1e-2, rtol=1e-2
    ):
        from importlib import import_module

        lora_ops = import_module(f"{_SMOE}.kernels.lora_ops")

        data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k, dtype=dtype)

        # Reference
        ref_output = reference_parallel_linear_lora(
            data["X"],
            data["W"],
            data["k"],
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["lora_A"],
            data["lora_B"],
            data["scaling"],
        )

        # Kernel
        kernel_output = lora_ops.scatter2scatter_lora(
            X=data["X"],
            W=data["W"],
            sorted_expert_idxs=data["sorted_expert_idxs"],
            sorted_scattered_idxs=data["sorted_scattered_idxs"],
            k=data["k"],
            lora_A=data["lora_A"],
            lora_B=data["lora_B"],
            scaling=data["scaling"],
        )

        torch.testing.assert_close(kernel_output, ref_output, atol=atol, rtol=rtol)

    def test_basic(self):
        """Basic forward pass with small dimensions."""
        self._run_forward_test(M=16, K=64, N=64, E=4, R=8, k=1)

    def test_topk2(self):
        """Forward pass with top-2 routing."""
        self._run_forward_test(M=32, K=64, N=128, E=4, R=8, k=2)

    def test_larger_rank(self):
        """Forward pass with larger LoRA rank."""
        self._run_forward_test(M=16, K=128, N=128, E=8, R=32, k=2)

    def test_small_rank(self):
        """Forward pass with very small LoRA rank."""
        self._run_forward_test(M=32, K=64, N=64, E=4, R=4, k=1)

    def test_many_experts(self):
        """Forward with many experts, fewer tokens per expert."""
        self._run_forward_test(M=64, K=64, N=64, E=16, R=8, k=2)

    def test_non_power_of_2_dims(self):
        """Test with dimensions that are not powers of 2."""
        self._run_forward_test(M=17, K=96, N=80, E=6, R=16, k=2, atol=2e-2, rtol=2e-2)

    def test_single_token(self):
        """Test with a single token."""
        self._run_forward_test(M=1, K=64, N=64, E=4, R=8, k=1)

    def test_bf16(self):
        """Test with bfloat16 precision."""
        self._run_forward_test(
            M=32, K=64, N=128, E=4, R=8, k=2, dtype=torch.bfloat16, atol=5e-2, rtol=5e-2
        )

    def test_fp16(self):
        """Test with float16 precision."""
        self._run_forward_test(
            M=32, K=64, N=128, E=4, R=8, k=2, dtype=torch.float16, atol=5e-2, rtol=5e-2
        )


class TestForwardGrouped:
    """Test forward pass with grouped_in/grouped_out configurations."""

    def _make_grouped_data(self, M=32, K=64, N=128, E=4, R=8, k=2, dtype=torch.float32):
        from importlib import import_module

        base_ops = import_module(f"{_SMOE}.kernels.ops")

        data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k, dtype=dtype)

        # Create grouped X
        grouped_X = base_ops.group(data["X"], data["sorted_scattered_idxs"], fan_out=k)
        data["grouped_X"] = grouped_X
        return data

    def test_x_grouped(self):
        """Forward with pre-grouped input."""
        from importlib import import_module

        lora_ops = import_module(f"{_SMOE}.kernels.lora_ops")

        data = self._make_grouped_data()

        ref_output = reference_parallel_linear_lora(
            data["grouped_X"],
            data["W"],
            data["k"],
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["lora_A"],
            data["lora_B"],
            data["scaling"],
            x_grouped=True,
        )

        kernel_output = lora_ops.scatter2scatter_lora(
            X=data["grouped_X"],
            W=data["W"],
            sorted_expert_idxs=data["sorted_expert_idxs"],
            sorted_scattered_idxs=data["sorted_scattered_idxs"],
            k=1,  # When x_grouped, fan_out=1 (already expanded)
            lora_A=data["lora_A"],
            lora_B=data["lora_B"],
            scaling=data["scaling"],
            x_grouped=True,
        )

        torch.testing.assert_close(kernel_output, ref_output, atol=1e-2, rtol=1e-2)

    def test_y_grouped(self):
        """Forward with grouped output."""
        from importlib import import_module

        lora_ops = import_module(f"{_SMOE}.kernels.lora_ops")

        data = make_test_data()

        ref_output = reference_parallel_linear_lora(
            data["X"],
            data["W"],
            data["k"],
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["lora_A"],
            data["lora_B"],
            data["scaling"],
            y_grouped=True,
        )

        kernel_output = lora_ops.scatter2scatter_lora(
            X=data["X"],
            W=data["W"],
            sorted_expert_idxs=data["sorted_expert_idxs"],
            sorted_scattered_idxs=data["sorted_scattered_idxs"],
            k=data["k"],
            lora_A=data["lora_A"],
            lora_B=data["lora_B"],
            scaling=data["scaling"],
            y_grouped=True,
        )

        torch.testing.assert_close(kernel_output, ref_output, atol=1e-2, rtol=1e-2)


# =============================================================================
# Test: Backward Pass Correctness (LoRA Gradients)
# =============================================================================


class TestLoRAGradients:
    """Test backward LoRA gradient computation (dA, dB)."""

    def _run_lora_grad_test(self, M, K, N, E, R, k, atol=1e-2, rtol=1e-2):
        from importlib import import_module

        lora_ops = import_module(f"{_SMOE}.kernels.lora_ops")
        base_ops = import_module(f"{_SMOE}.kernels.ops")

        data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k)

        # Group X for backward
        grouped_X = base_ops.group(data["X"], data["sorted_scattered_idxs"], fan_out=k)

        # Create fake grad_out in grouped order
        grad_out = torch.randn(
            data["sorted_expert_idxs"].size(0),
            N,
            device="cuda",
            dtype=torch.float32,
        )

        # Reference
        _, ref_dA, ref_dB = reference_lora_backward(
            grad_out,
            grouped_X,
            data["W"],
            data["lora_A"],
            data["lora_B"],
            data["scaling"],
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["expert_offsets"],
            k,
            E,
        )

        # Kernel
        kernel_dA, kernel_dB = lora_ops.group_bwd_lora(
            DY=grad_out,
            X=grouped_X,
            lora_A=data["lora_A"],
            lora_B=data["lora_B"],
            expert_offsets=data["expert_offsets"],
            E=E,
            scaling=data["scaling"],
        )

        torch.testing.assert_close(kernel_dA, ref_dA, atol=atol, rtol=rtol)
        torch.testing.assert_close(kernel_dB, ref_dB, atol=atol, rtol=rtol)

    def test_basic_lora_grads(self):
        self._run_lora_grad_test(M=32, K=64, N=128, E=4, R=8, k=2)

    def test_small_rank(self):
        self._run_lora_grad_test(M=16, K=64, N=64, E=4, R=4, k=1)

    def test_larger_rank(self):
        self._run_lora_grad_test(
            M=16, K=128, N=128, E=8, R=32, k=2, atol=5e-2, rtol=5e-2
        )

    def test_many_experts(self):
        self._run_lora_grad_test(M=64, K=64, N=64, E=16, R=8, k=2)

    def test_single_token_per_expert(self):
        """Edge case: roughly 1 token per expert."""
        self._run_lora_grad_test(M=8, K=64, N=64, E=8, R=4, k=1)


# =============================================================================
# Test: Full Autograd (Forward + Backward) via torch.autograd
# =============================================================================


class TestAutograd:
    """Test full autograd integration through ScatterMoELoRA."""

    def test_lora_receives_gradients(self):
        """LoRA A and B receive non-zero gradients; frozen W does not."""
        from importlib import import_module

        pll = import_module(f"{_SMOE}.parallel_linear_lora")

        M, K, N, E, R, k = 16, 64, 64, 4, 8, 2
        data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k)

        X = data["X"].clone().requires_grad_(True)
        W = data["W"].clone().requires_grad_(False)  # Frozen
        lora_A = data["lora_A"].clone().requires_grad_(True)
        lora_B = data["lora_B"].clone().requires_grad_(True)

        output = pll.ScatterMoELoRA.apply(
            X,
            W,
            k,
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["expert_offsets"],
            lora_A,
            lora_B,
            data["scaling"],
            None,
            None,
            False,
            False,
        )

        loss = output.sum()
        loss.backward()

        # LoRA params should have gradients
        assert lora_A.grad is not None, "lora_A should have gradient"
        assert lora_B.grad is not None, "lora_B should have gradient"
        assert lora_A.grad.abs().sum() > 0, "lora_A gradient should be non-zero"
        assert lora_B.grad.abs().sum() > 0, "lora_B gradient should be non-zero"

        # Input should have gradient (needed for upstream backprop)
        assert X.grad is not None, "X should have gradient"
        assert X.grad.abs().sum() > 0, "X gradient should be non-zero"

    def test_input_gradient_matches_reference(self):
        """Input gradient from autograd matches pure PyTorch reference."""
        from importlib import import_module

        pll = import_module(f"{_SMOE}.parallel_linear_lora")
        base_ops = import_module(f"{_SMOE}.kernels.ops")

        M, K, N, E, R, k = 16, 64, 64, 4, 8, 1
        data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k)

        # Autograd path
        X_kern = data["X"].clone().requires_grad_(True)
        lora_A_kern = data["lora_A"].clone().requires_grad_(True)
        lora_B_kern = data["lora_B"].clone().requires_grad_(True)

        out_kern = pll.ScatterMoELoRA.apply(
            X_kern,
            data["W"],
            k,
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["expert_offsets"],
            lora_A_kern,
            lora_B_kern,
            data["scaling"],
            None,
            None,
            False,
            False,
        )
        grad_out = torch.randn_like(out_kern)
        out_kern.backward(grad_out)

        # Reference path
        grouped_X = base_ops.group(data["X"], data["sorted_scattered_idxs"], fan_out=k)
        grouped_grad = base_ops.group(
            grad_out, data["sorted_scattered_idxs"], fan_out=1
        )

        ref_dX, ref_dA, ref_dB = reference_lora_backward(
            grouped_grad,
            grouped_X,
            data["W"],
            data["lora_A"],
            data["lora_B"],
            data["scaling"],
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["expert_offsets"],
            k,
            E,
        )

        # Compare input gradient (for k=1, no reduction needed)
        # ref_dX is in grouped (expert-sorted) order; X_kern.grad is in original order.
        # Ungroup ref_dX by scattering back to original positions.
        ref_dX_ungrouped = torch.zeros_like(ref_dX)
        ref_dX_ungrouped[data["sorted_scattered_idxs"]] = ref_dX
        torch.testing.assert_close(X_kern.grad, ref_dX_ungrouped, atol=5e-2, rtol=5e-2)

    def test_lora_gradient_matches_reference(self):
        """LoRA A/B gradients from autograd match reference."""
        from importlib import import_module

        pll = import_module(f"{_SMOE}.parallel_linear_lora")
        base_ops = import_module(f"{_SMOE}.kernels.ops")

        M, K, N, E, R, k = 16, 64, 64, 4, 8, 1
        data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k)

        # Autograd path
        X_kern = data["X"].clone().requires_grad_(True)
        lora_A_kern = data["lora_A"].clone().requires_grad_(True)
        lora_B_kern = data["lora_B"].clone().requires_grad_(True)

        out_kern = pll.ScatterMoELoRA.apply(
            X_kern,
            data["W"],
            k,
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["expert_offsets"],
            lora_A_kern,
            lora_B_kern,
            data["scaling"],
            None,
            None,
            False,
            False,
        )
        grad_out = torch.randn_like(out_kern)
        out_kern.backward(grad_out)

        # Reference path
        grouped_X = base_ops.group(data["X"], data["sorted_scattered_idxs"], fan_out=k)
        grouped_grad = base_ops.group(
            grad_out, data["sorted_scattered_idxs"], fan_out=1
        )

        _, ref_dA, ref_dB = reference_lora_backward(
            grouped_grad,
            grouped_X,
            data["W"],
            data["lora_A"],
            data["lora_B"],
            data["scaling"],
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["expert_offsets"],
            k,
            E,
        )

        torch.testing.assert_close(lora_A_kern.grad, ref_dA, atol=5e-2, rtol=5e-2)
        torch.testing.assert_close(lora_B_kern.grad, ref_dB, atol=5e-2, rtol=5e-2)


# =============================================================================
# Test: Equivalence with Base ScatterMoE (scaling=0 should match base)
# =============================================================================


class TestBaseEquivalence:
    """When scaling=0, fused kernel should match base scatter2scatter."""

    def test_zero_scaling_matches_base(self):
        """With scaling=0, LoRA contribution vanishes; should match base."""
        from importlib import import_module

        lora_ops = import_module(f"{_SMOE}.kernels.lora_ops")
        base_ops = import_module(f"{_SMOE}.kernels.ops")

        data = make_test_data(M=32, K=64, N=128, E=4, R=8, k=2)

        base_output = base_ops.scatter2scatter(
            X=data["X"],
            W=data["W"],
            sorted_expert_idxs=data["sorted_expert_idxs"],
            sorted_scattered_idxs=data["sorted_scattered_idxs"],
            k=data["k"],
        )

        lora_output = lora_ops.scatter2scatter_lora(
            X=data["X"],
            W=data["W"],
            sorted_expert_idxs=data["sorted_expert_idxs"],
            sorted_scattered_idxs=data["sorted_scattered_idxs"],
            k=data["k"],
            lora_A=data["lora_A"],
            lora_B=data["lora_B"],
            scaling=0.0,
        )

        torch.testing.assert_close(lora_output, base_output, atol=1e-3, rtol=1e-3)

    def test_zero_lora_weights_matches_base(self):
        """With A=0, B=0, should match base scatter2scatter."""
        from importlib import import_module

        lora_ops = import_module(f"{_SMOE}.kernels.lora_ops")
        base_ops = import_module(f"{_SMOE}.kernels.ops")

        data = make_test_data(M=32, K=64, N=128, E=4, R=8, k=2)

        zero_A = torch.zeros_like(data["lora_A"])
        zero_B = torch.zeros_like(data["lora_B"])

        base_output = base_ops.scatter2scatter(
            X=data["X"],
            W=data["W"],
            sorted_expert_idxs=data["sorted_expert_idxs"],
            sorted_scattered_idxs=data["sorted_scattered_idxs"],
            k=data["k"],
        )

        lora_output = lora_ops.scatter2scatter_lora(
            X=data["X"],
            W=data["W"],
            sorted_expert_idxs=data["sorted_expert_idxs"],
            sorted_scattered_idxs=data["sorted_scattered_idxs"],
            k=data["k"],
            lora_A=zero_A,
            lora_B=zero_B,
            scaling=1.0,
        )

        torch.testing.assert_close(lora_output, base_output, atol=1e-3, rtol=1e-3)


# =============================================================================
# Test: LoRA Additivity
# =============================================================================


class TestLoRAAdditivity:
    """Test that the LoRA component is correctly additive."""

    def test_lora_additivity(self):
        """
        Verify: fused(X, W, A, B, s) == base(X, W) + s * per_expert_lora(X, A, B)
        """
        from importlib import import_module

        lora_ops = import_module(f"{_SMOE}.kernels.lora_ops")
        base_ops = import_module(f"{_SMOE}.kernels.ops")

        data = make_test_data(M=32, K=64, N=128, E=4, R=8, k=2)

        # Base output (no LoRA)
        base_output = base_ops.scatter2scatter(
            X=data["X"],
            W=data["W"],
            sorted_expert_idxs=data["sorted_expert_idxs"],
            sorted_scattered_idxs=data["sorted_scattered_idxs"],
            k=data["k"],
        )

        # Fused output
        fused_output = lora_ops.scatter2scatter_lora(
            X=data["X"],
            W=data["W"],
            sorted_expert_idxs=data["sorted_expert_idxs"],
            sorted_scattered_idxs=data["sorted_scattered_idxs"],
            k=data["k"],
            lora_A=data["lora_A"],
            lora_B=data["lora_B"],
            scaling=data["scaling"],
        )

        # Compute LoRA contribution manually (reference)
        lora_only = reference_parallel_linear_lora(
            data["X"],
            torch.zeros_like(data["W"]),
            data["k"],
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["lora_A"],
            data["lora_B"],
            data["scaling"],
        )

        # fused = base + lora
        expected = base_output + lora_only
        torch.testing.assert_close(fused_output, expected, atol=2e-2, rtol=2e-2)


# =============================================================================
# Test: ParallelExperts module integration
# =============================================================================


class TestParallelExpertsModule:
    """Test the ParallelExperts module with LoRA."""

    def test_set_and_clear_lora(self):
        """Test set_lora/clear_lora lifecycle."""
        from importlib import import_module

        lora_module = import_module(f"{_SMOE}.lora_ops")

        pe = lora_module.ParallelExperts(4, 64, 128).cuda()

        A = torch.randn(32, 64, device="cuda")  # r=8, E=4
        B = torch.randn(128, 32, device="cuda")
        pe.set_lora(A, B, 0.5)

        assert pe._lora_A is A
        assert pe._lora_B is B
        assert pe._lora_scaling == 0.5

        pe.clear_lora()
        assert pe._lora_A is None
        assert pe._lora_B is None

    def test_forward_with_lora(self):
        """ParallelExperts forward with LoRA matches reference."""
        from importlib import import_module

        lora_module = import_module(f"{_SMOE}.lora_ops")

        E, K, N, R = 4, 64, 128, 8
        M, k = 16, 2
        data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k)

        pe = lora_module.ParallelExperts(E, K, N).cuda()
        # Set weights to match test data
        with torch.no_grad():
            pe.weight.copy_(data["W"].permute(0, 2, 1))  # [E, N, K]

        pe.set_lora(data["lora_A"], data["lora_B"], data["scaling"])

        output = pe(
            data["X"],
            k,
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["expert_offsets"],
        )

        ref = reference_parallel_linear_lora(
            data["X"],
            data["W"],
            k,
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["lora_A"],
            data["lora_B"],
            data["scaling"],
        )

        torch.testing.assert_close(output, ref, atol=2e-2, rtol=2e-2)


# =============================================================================
# Test: Edge Cases
# =============================================================================


class TestEdgeCases:
    """Edge cases and boundary conditions."""

    def test_all_tokens_one_expert(self):
        """All tokens routed to a single expert."""
        from importlib import import_module

        lora_ops = import_module(f"{_SMOE}.kernels.lora_ops")

        M, K, N, E, R, k = 16, 64, 64, 4, 8, 1
        torch.manual_seed(42)

        X = torch.randn(M, K, device="cuda")
        W = torch.randn(E, K, N, device="cuda") * 0.02
        lora_A = torch.randn(R * E, K, device="cuda") * 0.01
        lora_B = torch.randn(N, R * E, device="cuda") * 0.01

        # All tokens go to expert 0
        selected_experts = torch.zeros(M, k, device="cuda", dtype=torch.long)
        sorted_expert_idxs, sorted_scattered_idxs, expert_offsets = (
            flatten_sort_count_ref(selected_experts, E)
        )

        ref = reference_parallel_linear_lora(
            X,
            W,
            k,
            sorted_expert_idxs,
            sorted_scattered_idxs,
            lora_A,
            lora_B,
            0.5,
        )

        kernel = lora_ops.scatter2scatter_lora(
            X=X,
            W=W,
            sorted_expert_idxs=sorted_expert_idxs,
            sorted_scattered_idxs=sorted_scattered_idxs,
            k=k,
            lora_A=lora_A,
            lora_B=lora_B,
            scaling=0.5,
        )

        torch.testing.assert_close(kernel, ref, atol=1e-2, rtol=1e-2)

    def test_empty_experts(self):
        """Some experts have no tokens assigned."""
        from importlib import import_module

        lora_ops = import_module(f"{_SMOE}.kernels.lora_ops")

        M, K, N, E, R, k = 8, 64, 64, 8, 4, 1
        torch.manual_seed(42)

        X = torch.randn(M, K, device="cuda")
        W = torch.randn(E, K, N, device="cuda") * 0.02
        lora_A = torch.randn(R * E, K, device="cuda") * 0.01
        lora_B = torch.randn(N, R * E, device="cuda") * 0.01

        # Only use experts 0 and 1
        selected_experts = torch.randint(0, 2, (M, k), device="cuda")
        sorted_expert_idxs, sorted_scattered_idxs, expert_offsets = (
            flatten_sort_count_ref(selected_experts, E)
        )

        ref = reference_parallel_linear_lora(
            X,
            W,
            k,
            sorted_expert_idxs,
            sorted_scattered_idxs,
            lora_A,
            lora_B,
            0.5,
        )

        kernel = lora_ops.scatter2scatter_lora(
            X=X,
            W=W,
            sorted_expert_idxs=sorted_expert_idxs,
            sorted_scattered_idxs=sorted_scattered_idxs,
            k=k,
            lora_A=lora_A,
            lora_B=lora_B,
            scaling=0.5,
        )

        torch.testing.assert_close(kernel, ref, atol=1e-2, rtol=1e-2)


# =============================================================================
# Test: Optimization 1 - Fused dX Kernel
# =============================================================================


class TestFusedDX:
    """Test fused backward dX kernel: dX = dY @ W^T + scaling * (dY @ B) @ A."""

    def _run_fused_dX_test(
        self, M, K, N, E, R, k, dtype=torch.float32, atol=5e-2, rtol=5e-2
    ):
        from importlib import import_module

        lora_ops = import_module(f"{_SMOE}.kernels.lora_ops")
        base_ops = import_module(f"{_SMOE}.kernels.ops")
        pll = import_module(f"{_SMOE}.parallel_linear_lora")

        data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k, dtype=dtype)

        # Create dummy grad_out in grouped order
        grad_out = torch.randn(
            data["sorted_expert_idxs"].size(0), N, device="cuda", dtype=dtype
        )
        grouped_grad = base_ops.group(
            grad_out,
            data["sorted_scattered_idxs"],
            fan_out=1,
        )

        # Reference: separate scatter2scatter(DY, W^T) + _compute_lora_input_grad
        ref_base = base_ops.scatter2scatter(
            X=grouped_grad,
            x_grouped=True,
            W=data["W"].permute(0, 2, 1),
            sorted_expert_idxs=data["sorted_expert_idxs"],
            sorted_scattered_idxs=data["sorted_scattered_idxs"],
            k=1,
            y_grouped=False,
        )

        ref_lora = pll._compute_lora_input_grad(
            grouped_grad,
            data["lora_A"],
            data["lora_B"],
            data["expert_offsets"],
            E,
            data["scaling"],
        )
        # Scatter lora from grouped to ungrouped order
        ref_lora_ungrouped = torch.zeros_like(ref_base)
        ref_lora_ungrouped[data["sorted_scattered_idxs"]] = ref_lora
        ref_total = ref_base + ref_lora_ungrouped

        # Fused kernel
        fused_result = lora_ops.scatter2scatter_lora_dX(
            DY=grouped_grad,
            W=data["W"],
            sorted_expert_idxs=data["sorted_expert_idxs"],
            sorted_scattered_idxs=data["sorted_scattered_idxs"],
            k=1,
            lora_A=data["lora_A"],
            lora_B=data["lora_B"],
            scaling=data["scaling"],
            dy_grouped=True,
            dx_grouped=False,
        )

        torch.testing.assert_close(fused_result, ref_total, atol=atol, rtol=rtol)

    def test_basic(self):
        self._run_fused_dX_test(M=32, K=64, N=128, E=4, R=8, k=2)

    def test_large(self):
        self._run_fused_dX_test(M=256, K=256, N=512, E=8, R=16, k=2)

    def test_single_expert(self):
        self._run_fused_dX_test(M=64, K=128, N=256, E=1, R=8, k=1)

    def test_k1(self):
        self._run_fused_dX_test(M=64, K=64, N=128, E=4, R=8, k=1)

    def test_bf16(self):
        self._run_fused_dX_test(
            M=64,
            K=128,
            N=256,
            E=4,
            R=16,
            k=2,
            dtype=torch.bfloat16,
            atol=1e-1,
            rtol=1e-1,
        )

    def test_grouped_output(self):
        """Test fused dX with dx_grouped=True."""
        from importlib import import_module

        lora_ops = import_module(f"{_SMOE}.kernels.lora_ops")
        base_ops = import_module(f"{_SMOE}.kernels.ops")
        pll = import_module(f"{_SMOE}.parallel_linear_lora")

        M, K, N, E, R, k = 32, 64, 128, 4, 8, 2
        data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k)

        grad_out = torch.randn(data["sorted_expert_idxs"].size(0), N, device="cuda")
        grouped_grad = base_ops.group(
            grad_out, data["sorted_scattered_idxs"], fan_out=1
        )

        # Reference: grouped output
        ref_base = base_ops.scatter2scatter(
            X=grouped_grad,
            x_grouped=True,
            W=data["W"].permute(0, 2, 1),
            sorted_expert_idxs=data["sorted_expert_idxs"],
            sorted_scattered_idxs=data["sorted_scattered_idxs"],
            k=1,
            y_grouped=True,  # grouped output
        )

        ref_lora = pll._compute_lora_input_grad(
            grouped_grad,
            data["lora_A"],
            data["lora_B"],
            data["expert_offsets"],
            E,
            data["scaling"],
        )
        ref_total = ref_base + ref_lora

        # Fused kernel with grouped output
        fused_result = lora_ops.scatter2scatter_lora_dX(
            DY=grouped_grad,
            W=data["W"],
            sorted_expert_idxs=data["sorted_expert_idxs"],
            sorted_scattered_idxs=data["sorted_scattered_idxs"],
            k=1,
            lora_A=data["lora_A"],
            lora_B=data["lora_B"],
            scaling=data["scaling"],
            dy_grouped=True,
            dx_grouped=True,
        )

        torch.testing.assert_close(fused_result, ref_total, atol=5e-2, rtol=5e-2)

    def test_autograd_with_fused_dX(self):
        """Full autograd round-trip with use_fused_dX=True."""
        from importlib import import_module

        pll = import_module(f"{_SMOE}.parallel_linear_lora")

        M, K, N, E, R, k = 32, 64, 128, 4, 8, 2
        data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k)

        # Run without fused dX
        X1 = data["X"].clone().requires_grad_(True)
        A1 = data["lora_A"].clone().requires_grad_(True)
        B1 = data["lora_B"].clone().requires_grad_(True)
        out1 = pll.ScatterMoELoRA.apply(
            X1,
            data["W"],
            k,
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["expert_offsets"],
            A1,
            B1,
            data["scaling"],
            None,
            None,
            False,
            False,
            False,  # use_fused_dX=False
        )
        out1.sum().backward()

        # Run with fused dX
        X2 = data["X"].clone().requires_grad_(True)
        A2 = data["lora_A"].clone().requires_grad_(True)
        B2 = data["lora_B"].clone().requires_grad_(True)
        out2 = pll.ScatterMoELoRA.apply(
            X2,
            data["W"],
            k,
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["expert_offsets"],
            A2,
            B2,
            data["scaling"],
            None,
            None,
            False,
            False,
            True,  # use_fused_dX=True
        )
        out2.sum().backward()

        # Forward should be identical
        torch.testing.assert_close(out1, out2, atol=1e-5, rtol=1e-5)

        # Gradients should match
        torch.testing.assert_close(X1.grad, X2.grad, atol=5e-2, rtol=5e-2)
        torch.testing.assert_close(A1.grad, A2.grad, atol=5e-2, rtol=5e-2)
        torch.testing.assert_close(B1.grad, B2.grad, atol=5e-2, rtol=5e-2)


# =============================================================================
# Test: Optimization 2 - Fused Gather Backward
# =============================================================================


class TestFusedGatherBackward:
    """Test fused gather + backward dA/dB kernel."""

    def _run_fused_gather_test(
        self, M, K, N, E, R, k, dtype=torch.float32, atol=5e-2, rtol=5e-2
    ):
        from importlib import import_module

        lora_ops = import_module(f"{_SMOE}.kernels.lora_ops")
        base_ops = import_module(f"{_SMOE}.kernels.ops")

        data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k, dtype=dtype)

        # Create grad_out in ungrouped order (M*k, N)
        M_total = data["sorted_expert_idxs"].size(0)
        grad_out = torch.randn(M_total, N, device="cuda", dtype=dtype)

        # Reference: group() + group_bwd_lora()
        grouped_grad = base_ops.group(
            grad_out, data["sorted_scattered_idxs"], fan_out=1
        )
        grouped_x = base_ops.group(data["X"], data["sorted_scattered_idxs"], fan_out=k)

        ref_dA, ref_dB = lora_ops.group_bwd_lora(
            DY=grouped_grad,
            X=grouped_x,
            lora_A=data["lora_A"],
            lora_B=data["lora_B"],
            expert_offsets=data["expert_offsets"],
            E=E,
            scaling=data["scaling"],
        )

        # Fused kernel: no group() calls
        fused_dA, fused_dB = lora_ops.group_bwd_lora_fused(
            DY=grad_out,
            X=data["X"],
            lora_A=data["lora_A"],
            lora_B=data["lora_B"],
            expert_offsets=data["expert_offsets"],
            sorted_scattered_idxs=data["sorted_scattered_idxs"],
            E=E,
            k=k,
            scaling=data["scaling"],
        )

        torch.testing.assert_close(fused_dA, ref_dA, atol=atol, rtol=rtol)
        torch.testing.assert_close(fused_dB, ref_dB, atol=atol, rtol=rtol)

    def test_basic(self):
        self._run_fused_gather_test(M=32, K=64, N=128, E=4, R=8, k=2)

    def test_large(self):
        self._run_fused_gather_test(M=256, K=256, N=512, E=8, R=16, k=2)

    def test_single_expert(self):
        self._run_fused_gather_test(M=64, K=128, N=256, E=1, R=8, k=1)

    def test_k1(self):
        self._run_fused_gather_test(M=64, K=64, N=128, E=4, R=8, k=1)

    def test_many_experts(self):
        self._run_fused_gather_test(M=128, K=64, N=128, E=16, R=8, k=4)

    def test_bf16(self):
        self._run_fused_gather_test(
            M=64,
            K=128,
            N=256,
            E=4,
            R=16,
            k=2,
            dtype=torch.bfloat16,
            atol=1e-1,
            rtol=1e-1,
        )

    def test_autograd_with_fused_gather(self):
        """Full autograd round-trip with use_fused_gather=True."""
        from importlib import import_module

        pll = import_module(f"{_SMOE}.parallel_linear_lora")

        M, K, N, E, R, k = 32, 64, 128, 4, 8, 2
        data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k)

        # Run without fused gather
        X1 = data["X"].clone().requires_grad_(True)
        A1 = data["lora_A"].clone().requires_grad_(True)
        B1 = data["lora_B"].clone().requires_grad_(True)
        out1 = pll.ScatterMoELoRA.apply(
            X1,
            data["W"],
            k,
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["expert_offsets"],
            A1,
            B1,
            data["scaling"],
            None,
            None,
            False,
            False,
            False,
            False,  # use_fused_dX=False, use_fused_gather=False
        )
        out1.sum().backward()

        # Run with fused gather
        X2 = data["X"].clone().requires_grad_(True)
        A2 = data["lora_A"].clone().requires_grad_(True)
        B2 = data["lora_B"].clone().requires_grad_(True)
        out2 = pll.ScatterMoELoRA.apply(
            X2,
            data["W"],
            k,
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["expert_offsets"],
            A2,
            B2,
            data["scaling"],
            None,
            None,
            False,
            False,
            False,
            True,  # use_fused_dX=False, use_fused_gather=True
        )
        out2.sum().backward()

        # Forward identical
        torch.testing.assert_close(out1, out2, atol=1e-5, rtol=1e-5)

        # dA/dB should match
        torch.testing.assert_close(A1.grad, A2.grad, atol=5e-2, rtol=5e-2)
        torch.testing.assert_close(B1.grad, B2.grad, atol=5e-2, rtol=5e-2)
        # dX should also match (same path for dX)
        torch.testing.assert_close(X1.grad, X2.grad, atol=5e-2, rtol=5e-2)


# =============================================================================
# Test: Optimization 3 - Token Rounding
# =============================================================================


class TestTokenRounding:
    """Test token rounding utility and its integration with backward kernels."""

    def test_round_expert_counts_basic(self):
        """Verify round_expert_counts produces correct shapes and values."""
        from importlib import import_module

        lora_ops = import_module(f"{_SMOE}.kernels.lora_ops")

        M, K, N, E, R, k = 32, 64, 128, 4, 8, 2
        data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k)

        padded_ei, padded_si, padded_offsets, real_offsets = (
            lora_ops.round_expert_counts(
                data["sorted_expert_idxs"],
                data["sorted_scattered_idxs"],
                data["expert_offsets"],
                E=E,
                block_m=lora_ops.BLOCK_M,
            )
        )

        # Real offsets should match original
        torch.testing.assert_close(real_offsets, data["expert_offsets"])

        # Padded offsets should be >= real offsets
        assert (padded_offsets >= real_offsets).all(), (
            "Padded offsets should be >= real offsets"
        )

        # Each expert's padded count should be multiple of BLOCK_M (if non-zero)
        prev = 0
        for e in range(E):
            count = padded_offsets[e].item() - prev
            real_count = real_offsets[e].item() - (
                real_offsets[e - 1].item() if e > 0 else 0
            )
            if real_count > 0:
                assert count % lora_ops.BLOCK_M == 0, (
                    f"Expert {e}: padded count {count} not multiple of {lora_ops.BLOCK_M}"
                )
                assert count >= real_count, (
                    f"Expert {e}: padded count {count} < real count {real_count}"
                )
            prev = padded_offsets[e].item()

    def test_round_with_fused_gather(self):
        """Token rounding + fused gather gives same result as plain fused gather."""
        from importlib import import_module

        lora_ops = import_module(f"{_SMOE}.kernels.lora_ops")
        base_ops = import_module(f"{_SMOE}.kernels.ops")

        M, K, N, E, R, k = 64, 64, 128, 4, 8, 2
        data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k)

        M_total = data["sorted_expert_idxs"].size(0)
        grad_out = torch.randn(M_total, N, device="cuda")

        # Reference: group() + group_bwd_lora() (the gold standard)
        grouped_grad = base_ops.group(
            grad_out, data["sorted_scattered_idxs"], fan_out=1
        )
        grouped_x = base_ops.group(data["X"], data["sorted_scattered_idxs"], fan_out=k)
        ref_dA, ref_dB = lora_ops.group_bwd_lora(
            DY=grouped_grad,
            X=grouped_x,
            lora_A=data["lora_A"],
            lora_B=data["lora_B"],
            expert_offsets=data["expert_offsets"],
            E=E,
            scaling=data["scaling"],
        )

        # Apply token rounding
        padded_ei, padded_si, padded_offsets, real_offsets = (
            lora_ops.round_expert_counts(
                data["sorted_expert_idxs"],
                data["sorted_scattered_idxs"],
                data["expert_offsets"],
                E=E,
            )
        )

        # Fused gather with token rounding
        rounded_dA, rounded_dB = lora_ops.group_bwd_lora_fused(
            DY=grad_out,
            X=data["X"],
            lora_A=data["lora_A"],
            lora_B=data["lora_B"],
            expert_offsets=padded_offsets,
            sorted_scattered_idxs=padded_si,
            E=E,
            k=k,
            scaling=data["scaling"],
            real_expert_offsets=real_offsets,
        )

        torch.testing.assert_close(rounded_dA, ref_dA, atol=5e-2, rtol=5e-2)
        torch.testing.assert_close(rounded_dB, ref_dB, atol=5e-2, rtol=5e-2)

    def test_empty_experts_with_rounding(self):
        """Token rounding handles experts with 0 tokens correctly."""
        from importlib import import_module

        lora_ops = import_module(f"{_SMOE}.kernels.lora_ops")

        E, k = 8, 1
        M = 8
        torch.manual_seed(42)

        # Only use experts 0 and 1 (rest have 0 tokens)
        selected_experts = torch.randint(0, 2, (M, k), device="cuda")
        sorted_expert_idxs, sorted_scattered_idxs, expert_offsets = (
            flatten_sort_count_ref(selected_experts, E)
        )

        padded_ei, padded_si, padded_offsets, real_offsets = (
            lora_ops.round_expert_counts(
                sorted_expert_idxs,
                sorted_scattered_idxs,
                expert_offsets,
                E=E,
            )
        )

        # Verify empty experts have same count (0)
        for e in range(E):
            real_count = real_offsets[e].item() - (
                real_offsets[e - 1].item() if e > 0 else 0
            )
            padded_count = padded_offsets[e].item() - (
                padded_offsets[e - 1].item() if e > 0 else 0
            )
            if real_count == 0:
                assert padded_count == 0, (
                    f"Expert {e}: empty expert should have padded_count=0, got {padded_count}"
                )


# =============================================================================
# Test: Combined Optimizations
# =============================================================================


class TestCombinedOptimizations:
    """Test all optimizations together."""

    def test_fused_dX_and_fused_gather(self):
        """Both fused dX and fused gather together."""
        from importlib import import_module

        pll = import_module(f"{_SMOE}.parallel_linear_lora")

        M, K, N, E, R, k = 64, 128, 256, 4, 8, 2
        data = make_test_data(M=M, K=K, N=N, E=E, R=R, k=k)

        # Baseline: no optimizations
        X1 = data["X"].clone().requires_grad_(True)
        A1 = data["lora_A"].clone().requires_grad_(True)
        B1 = data["lora_B"].clone().requires_grad_(True)
        out1 = pll.ScatterMoELoRA.apply(
            X1,
            data["W"],
            k,
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["expert_offsets"],
            A1,
            B1,
            data["scaling"],
            None,
            None,
            False,
            False,
            False,
            False,  # no optimizations
        )
        out1.sum().backward()

        # Both optimizations
        X2 = data["X"].clone().requires_grad_(True)
        A2 = data["lora_A"].clone().requires_grad_(True)
        B2 = data["lora_B"].clone().requires_grad_(True)
        out2 = pll.ScatterMoELoRA.apply(
            X2,
            data["W"],
            k,
            data["sorted_expert_idxs"],
            data["sorted_scattered_idxs"],
            data["expert_offsets"],
            A2,
            B2,
            data["scaling"],
            None,
            None,
            False,
            False,
            True,
            True,  # use_fused_dX=True, use_fused_gather=True
        )
        out2.sum().backward()

        # Forward identical
        torch.testing.assert_close(out1, out2, atol=1e-5, rtol=1e-5)

        # All gradients match
        torch.testing.assert_close(X1.grad, X2.grad, atol=5e-2, rtol=5e-2)
        torch.testing.assert_close(A1.grad, A2.grad, atol=5e-2, rtol=5e-2)
        torch.testing.assert_close(B1.grad, B2.grad, atol=5e-2, rtol=5e-2)


# =============================================================================
# Test: HFScatterMoEGatedMLP with Sigmoid Routing
# =============================================================================


def _reference_moe_forward(
    hidden_states,
    gate_weight,
    gate_up_proj,
    down_proj,
    act_fn,
    routing_weights,
    selected_experts,
    num_experts,
):
    """Pure PyTorch reference for a full MoE forward pass.

    Args:
        hidden_states: [T, H]
        gate_weight: [E, H]
        gate_up_proj: [E, 2*FF, H]
        down_proj: [E, H, FF]
        act_fn: activation function (e.g. torch.nn.SiLU())
        routing_weights: [T, K] routing weights
        selected_experts: [T, K] expert indices
        num_experts: int

    Returns:
        output: [T, H]
    """
    T, H = hidden_states.shape
    K = selected_experts.shape[1]
    output = torch.zeros(T, H, device=hidden_states.device, dtype=hidden_states.dtype)

    for t in range(T):
        for j in range(K):
            e = selected_experts[t, j].item()
            w = routing_weights[t, j].item()

            # gate_up projection
            gup = hidden_states[t] @ gate_up_proj[e].T  # [2*I]
            I_dim = gup.shape[0] // 2
            gates = gup[:I_dim]
            up = gup[I_dim:]

            # activation
            h = act_fn(gates) * up

            # down projection
            out = h @ down_proj[e].T  # [H]

            output[t] += w * out

    return output


def _make_mock_sigmoid_moe_block(
    T=16, H=64, FF=32, E=8, K=2, n_group=2, topk_group=1, bias_on_gate=True
):
    """Create a mock MoE block with sigmoid routing for GPU testing."""
    gate_up_proj = torch.randn(E, 2 * FF, H, device="cuda") * 0.02
    down_proj = torch.randn(E, H, FF, device="cuda") * 0.02
    act_fn = torch.nn.SiLU()

    experts = SimpleNamespace(
        gate_up_proj=gate_up_proj,
        down_proj=down_proj,
        act_fn=act_fn,
        num_experts=E,
    )

    if bias_on_gate:
        gate = SimpleNamespace(
            weight=torch.randn(E, H, device="cuda") * 0.1,
            e_score_correction_bias=torch.zeros(E, device="cuda"),
        )
        moe_block = SimpleNamespace(
            gate=gate,
            experts=experts,
            top_k=K,
            n_routed_experts=E,
            n_group=n_group,
            topk_group=topk_group,
            norm_topk_prob=True,
            routed_scaling_factor=1.0,
        )
    else:
        # minimax_m2 style
        gate = SimpleNamespace(
            weight=torch.randn(E, H, device="cuda") * 0.1,
            top_k=K,
        )
        moe_block = SimpleNamespace(
            gate=gate,
            experts=experts,
            top_k=K,
            e_score_correction_bias=torch.zeros(E, device="cuda"),
        )

    return moe_block, T, H, FF, E, K


class TestHFScatterMoESigmoidRouting:
    """Test HFScatterMoEGatedMLP forward with sigmoid routing on GPU."""

    def test_forward_matches_reference_bias_on_gate(self):
        """Forward pass with sigmoid routing (bias on gate) matches reference."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            HFScatterMoEGatedMLP,
            _sigmoid_topk_route,
        )

        moe_block, T, H, FF, E, K = _make_mock_sigmoid_moe_block(
            T=16, H=64, FF=32, E=8, K=2, n_group=2, topk_group=1, bias_on_gate=True
        )

        hidden = torch.randn(1, T, H, device="cuda")

        # Get routing for reference
        gate = moe_block.gate
        hidden_flat = hidden.view(-1, H)
        routing_weights, selected_experts, _, _ = _sigmoid_topk_route(
            moe_block, gate, hidden_flat, gate.weight, None
        )

        # Reference output
        ref_output = _reference_moe_forward(
            hidden_flat,
            gate.weight,
            moe_block.experts.gate_up_proj,
            moe_block.experts.down_proj,
            moe_block.experts.act_fn,
            routing_weights,
            selected_experts,
            E,
        )

        # Kernel output
        kernel_output = HFScatterMoEGatedMLP.forward(moe_block, hidden)
        kernel_output_flat = kernel_output.view(-1, H)

        torch.testing.assert_close(
            kernel_output_flat.float(),
            ref_output.float(),
            atol=5e-2,
            rtol=5e-2,
        )

    def test_forward_matches_reference_bias_on_block(self):
        """Forward pass with sigmoid routing (minimax_m2 style, bias on block)."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            HFScatterMoEGatedMLP,
            _sigmoid_topk_route,
        )

        moe_block, T, H, FF, E, K = _make_mock_sigmoid_moe_block(
            T=16, H=64, FF=32, E=8, K=2, n_group=1, bias_on_gate=False
        )

        hidden = torch.randn(1, T, H, device="cuda")
        hidden_flat = hidden.view(-1, H)

        gate = moe_block.gate
        routing_weights, selected_experts, _, _ = _sigmoid_topk_route(
            moe_block, gate, hidden_flat, gate.weight, None
        )

        ref_output = _reference_moe_forward(
            hidden_flat,
            gate.weight,
            moe_block.experts.gate_up_proj,
            moe_block.experts.down_proj,
            moe_block.experts.act_fn,
            routing_weights,
            selected_experts,
            E,
        )

        kernel_output = HFScatterMoEGatedMLP.forward(moe_block, hidden)
        kernel_output_flat = kernel_output.view(-1, H)

        torch.testing.assert_close(
            kernel_output_flat.float(),
            ref_output.float(),
            atol=5e-2,
            rtol=5e-2,
        )

    def test_softmax_routing_still_works(self):
        """Verify softmax routing (Qwen/OLMoE) is not broken."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            HFScatterMoEGatedMLP,
            _softmax_topk_route,
        )

        T, H, FF, E, K = 16, 64, 32, 4, 2
        gate_up_proj = torch.randn(E, 2 * FF, H, device="cuda") * 0.02
        down_proj = torch.randn(E, H, FF, device="cuda") * 0.02
        act_fn = torch.nn.SiLU()

        experts = SimpleNamespace(
            gate_up_proj=gate_up_proj,
            down_proj=down_proj,
            act_fn=act_fn,
            num_experts=E,
        )
        gate = SimpleNamespace(
            weight=torch.randn(E, H, device="cuda") * 0.1,
            top_k=K,
            num_experts=E,
            norm_topk_prob=True,
        )
        moe_block = SimpleNamespace(gate=gate, experts=experts)

        hidden = torch.randn(1, T, H, device="cuda")
        hidden_flat = hidden.view(-1, H)

        routing_weights, selected_experts, _, _ = _softmax_topk_route(
            moe_block, gate, hidden_flat, gate.weight, None
        )

        ref_output = _reference_moe_forward(
            hidden_flat,
            gate.weight,
            gate_up_proj,
            down_proj,
            act_fn,
            routing_weights,
            selected_experts,
            E,
        )

        kernel_output = HFScatterMoEGatedMLP.forward(moe_block, hidden)
        kernel_output_flat = kernel_output.view(-1, H)

        torch.testing.assert_close(
            kernel_output_flat.float(),
            ref_output.float(),
            atol=5e-2,
            rtol=5e-2,
        )


class TestHFScatterMoESigmoidWithSharedExperts:
    """Test HFScatterMoEGatedMLP with sigmoid routing + shared experts."""

    def test_shared_experts_plural(self):
        """DeepSeek V3 style: shared_experts attribute (plural)."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            HFScatterMoEGatedMLP,
        )

        T, H, FF, E, K = 8, 64, 32, 8, 2
        gate_up_proj = torch.randn(E, 2 * FF, H, device="cuda") * 0.02
        down_proj = torch.randn(E, H, FF, device="cuda") * 0.02
        act_fn = torch.nn.SiLU()

        experts = SimpleNamespace(
            gate_up_proj=gate_up_proj,
            down_proj=down_proj,
            act_fn=act_fn,
            num_experts=E,
        )

        # Shared expert as a simple linear for testing
        shared_W = torch.randn(H, H, device="cuda") * 0.01
        shared_experts_fn = lambda x: x @ shared_W.T  # noqa: E731

        gate = SimpleNamespace(
            weight=torch.randn(E, H, device="cuda") * 0.1,
            e_score_correction_bias=torch.zeros(E, device="cuda"),
        )
        moe_block = SimpleNamespace(
            gate=gate,
            experts=experts,
            shared_experts=shared_experts_fn,
            top_k=K,
            n_routed_experts=E,
            n_group=1,
            norm_topk_prob=True,
            routed_scaling_factor=1.0,
        )

        hidden = torch.randn(1, T, H, device="cuda")

        # Should not raise; output should include shared expert contribution
        output = HFScatterMoEGatedMLP.forward(moe_block, hidden)
        assert output.shape == (1, T, H)

        # Run without shared expert to verify it changes the output
        moe_block_no_shared = SimpleNamespace(
            gate=gate,
            experts=experts,
            top_k=K,
            n_routed_experts=E,
            n_group=1,
            norm_topk_prob=True,
            routed_scaling_factor=1.0,
        )
        output_no_shared = HFScatterMoEGatedMLP.forward(moe_block_no_shared, hidden)
        assert not torch.equal(output, output_no_shared)

    def test_shared_expert_with_gate(self):
        """Qwen2MoE style: shared_expert + shared_expert_gate."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            HFScatterMoEGatedMLP,
        )

        T, H, FF, E, K = 8, 64, 32, 4, 2
        gate_up_proj = torch.randn(E, 2 * FF, H, device="cuda") * 0.02
        down_proj = torch.randn(E, H, FF, device="cuda") * 0.02
        act_fn = torch.nn.SiLU()

        experts = SimpleNamespace(
            gate_up_proj=gate_up_proj,
            down_proj=down_proj,
            act_fn=act_fn,
            num_experts=E,
        )

        shared_W = torch.randn(H, H, device="cuda") * 0.01
        shared_expert_fn = lambda x: x @ shared_W.T  # noqa: E731
        # Gate that returns 0 -> sigmoid(0) = 0.5
        gate_W = torch.zeros(H, H, device="cuda")
        shared_expert_gate_fn = lambda x: x @ gate_W.T  # noqa: E731

        gate = SimpleNamespace(
            weight=torch.randn(E, H, device="cuda") * 0.1,
            top_k=K,
            num_experts=E,
            norm_topk_prob=True,
        )
        moe_block = SimpleNamespace(
            gate=gate,
            experts=experts,
            shared_expert=shared_expert_fn,
            shared_expert_gate=shared_expert_gate_fn,
        )

        hidden = torch.randn(1, T, H, device="cuda")
        output = HFScatterMoEGatedMLP.forward(moe_block, hidden)
        assert output.shape == (1, T, H)


================================================
FILE: tests/e2e/integrations/test_scattermoe_lora_olmoe.py
================================================
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Axolotl AI
# Licensed under the Apache License, Version 2.0

"""
Integration tests: OLMoE + peft LoRA + ScatterMoE fused kernels.

Validates that scattermoe_lora fused kernels produce correct results when used
with HuggingFace OLMoE models and peft LoRA adapters applied via
``target_parameters``.

Key things tested
-----------------
- LoRA weight layout conversion between peft (rank-major) and scattermoe (expert-major)
- Base forward equivalence: per-expert reference vs ScatterMoE kernels (no LoRA)
- LoRA forward equivalence: peft merged-weight approach vs scattermoe fused kernels
- Backward gradient correctness through the fused LoRA path
- ``kernelize()`` integration via ``LocalLayerRepository``
"""

from pathlib import Path

import pytest
import torch
import torch.nn as nn
import torch.nn.functional as F
from peft import LoraConfig, get_peft_model
from transformers import OlmoeConfig
from transformers.models.olmoe.modeling_olmoe import OlmoeSparseMoeBlock

_SMOE = "axolotl.integrations.kernels.libs.scattermoe_lora"

# Try to import from axolotl's scattermoe_lora.layers; may fail on CPU without triton.
try:
    from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
        _unwrap_experts_lora,
        _unwrap_gate_lora,
        peft_lora_B_to_scattermoe,
        peft_lora_to_scattermoe,
    )

    HAS_SCATTERMOE = True
except (ImportError, ModuleNotFoundError):
    HAS_SCATTERMOE = False

    # Provide pure-torch fallbacks for CPU-only layout conversion tests.
    def peft_lora_B_to_scattermoe(peft_B, num_experts, rank):
        N = peft_B.shape[0]
        return (
            peft_B.reshape(N, rank, num_experts)
            .permute(0, 2, 1)
            .contiguous()
            .reshape(N, num_experts * rank)
        )

    def peft_lora_to_scattermoe(peft_A, peft_B, num_experts, rank):
        peft_B_em = peft_lora_B_to_scattermoe(peft_B, num_experts, rank)
        K_inter, N_hidden = peft_B.shape[0], peft_A.shape[1]
        smoe_A = torch.zeros(
            rank * num_experts,
            K_inter,
            device=peft_A.device,
            dtype=peft_A.dtype,
        )
        smoe_B = torch.zeros(
            N_hidden,
            rank * num_experts,
            device=peft_A.device,
            dtype=peft_A.dtype,
        )
        for e in range(num_experts):
            s = e * rank
            smoe_A[s : s + rank, :] = peft_B_em[:, s : s + rank].T
            smoe_B[:, s : s + rank] = peft_A[s : s + rank, :].T
        return smoe_A, smoe_B

    def _unwrap_experts_lora(experts_module):
        return experts_module, None, None

    def _unwrap_gate_lora(gate_module):
        if hasattr(gate_module, "base_layer") and hasattr(gate_module, "lora_A"):
            base_gate = gate_module.base_layer
            active = getattr(gate_module, "active_adapters", ["default"])
            name = active[0] if active else "default"
            lora_A_dict = getattr(gate_module, "lora_A", {})
            lora_B_dict = getattr(gate_module, "lora_B", {})
            scaling_dict = getattr(gate_module, "scaling", {})
            if name in lora_A_dict:
                lora_A = lora_A_dict[name].weight
                lora_B = lora_B_dict[name].weight
                s = scaling_dict[name]
                delta = s * (lora_B @ lora_A)
                return base_gate, base_gate.weight, delta
            return base_gate, base_gate.weight, None
        return gate_module, gate_module.weight, None


# =============================================================================
# Configuration
# =============================================================================

FULL_OLMOE_CONFIG = dict(
    hidden_size=2048,
    intermediate_size=1024,
    num_experts=64,
    num_experts_per_tok=8,
    hidden_act="silu",
    norm_topk_prob=False,
)

SMALL_OLMOE_CONFIG = dict(
    hidden_size=128,
    intermediate_size=48,  # non-square: 2*inter=96 != hidden=128
    num_experts=8,
    num_experts_per_tok=2,
    hidden_act="silu",
    norm_topk_prob=False,
)

requires_cuda = pytest.mark.skipif(
    not torch.cuda.is_available(), reason="CUDA not available"
)


def make_olmoe_config(use_full=False):
    cfg = dict(FULL_OLMOE_CONFIG if use_full else SMALL_OLMOE_CONFIG)
    cfg["experts_implementation"] = "grouped_mm"
    return OlmoeConfig(**cfg)


# =============================================================================
# Layout conversion utilities (test-local helpers)
# =============================================================================


def scattermoe_lora_B_to_peft(smoe_B, num_experts, rank):
    """Inverse of ``peft_lora_B_to_scattermoe``."""
    N = smoe_B.shape[0]
    return (
        smoe_B.reshape(N, num_experts, rank)
        .permute(0, 2, 1)
        .contiguous()
        .reshape(N, num_experts * rank)
    )


def peft_gate_up_lora_to_scattermoe(peft_A, peft_B, num_experts, rank):
    """Convert peft LoRA for gate_up_proj to scattermoe layout.

    Both gate_up_proj and down_proj need the A<->B swap because
    scattermoe transposes the parameter (W = param.T).
    """
    return peft_lora_to_scattermoe(peft_A, peft_B, num_experts, rank)


# =============================================================================
# Helpers
# =============================================================================


def _init_expert_weights(moe_block):
    """Initialize OlmoeExperts parameters which use torch.empty (uninitialized).

    Without this, gate_up_proj and down_proj contain garbage/NaN values.
    """
    with torch.no_grad():
        nn.init.kaiming_uniform_(moe_block.experts.gate_up_proj)
        nn.init.kaiming_uniform_(moe_block.experts.down_proj)
    return moe_block


class MinimalOLMoEModel(nn.Module):
    """Thin wrapper so peft's get_peft_model can attach adapters."""

    def __init__(self, config):
        super().__init__()
        self.moe = OlmoeSparseMoeBlock(config)
        _init_expert_weights(self.moe)

    def forward(self, x):
        return self.moe(x)


def _get_routing(moe_block, hidden_states):
    """Run the router and return (routing_weights, selected_experts)."""
    with torch.no_grad():
        _, routing_weights, selected_experts = moe_block.gate(
            hidden_states.view(-1, hidden_states.size(-1))
        )
    return routing_weights, selected_experts


def _reference_moe_forward(
    x_flat,
    gate_up_proj,
    down_proj,
    act_fn,
    top_k_index,
    top_k_weights,
    num_experts,
):
    """Pure-PyTorch per-expert reference MoE forward (no LoRA).

    Uses F.linear per expert for an apples-to-apples comparison with
    the ScatterMoE kernel path.
    """
    final = torch.zeros_like(x_flat)
    expert_mask = F.one_hot(top_k_index, num_classes=num_experts).permute(2, 1, 0)
    for e in range(num_experts):
        top_k_pos, token_idx = torch.where(expert_mask[e])
        if token_idx.numel() == 0:
            continue
        cur = x_flat[token_idx]
        gate_up = F.linear(cur, gate_up_proj[e])
        g, u = gate_up.chunk(2, dim=-1)
        h = act_fn(g) * u
        out = F.linear(h, down_proj[e])
        out = out * top_k_weights[token_idx, top_k_pos, None]
        final.index_add_(0, token_idx, out.to(final.dtype))
    return final


def _reference_moe_forward_with_lora(
    x_flat,
    gate_up_proj,
    down_proj,
    act_fn,
    top_k_index,
    top_k_weights,
    num_experts,
    gup_delta,
    down_delta,
):
    """Pure-PyTorch reference MoE forward with pre-computed weight deltas."""
    merged_gup = gate_up_proj + gup_delta
    merged_down = down_proj + down_delta
    return _reference_moe_forward(
        x_flat,
        merged_gup,
        merged_down,
        act_fn,
        top_k_index,
        top_k_weights,
        num_experts,
    )


def _compute_delta_from_scattermoe_lora(lora_A, lora_B, scaling, E, r, param_shape):
    """Compute additive weight delta from scattermoe-layout LoRA weights.

    delta[e] = scaling * B_e @ A_e  where A_e [r,K], B_e [N,r] -> [N,K].
    """
    delta = torch.zeros(param_shape, device=lora_A.device, dtype=lora_A.dtype)
    for e in range(E):
        A_e = lora_A[e * r : (e + 1) * r, :]
        B_e = lora_B[:, e * r : (e + 1) * r]
        delta[e] = scaling * (B_e @ A_e)
    return delta


# =============================================================================
# Tests: Layout conversion
# =============================================================================


class TestLoRABLayoutConversion:
    """Test the peft <-> scattermoe lora_B layout conversion."""

    def test_roundtrip(self):
        E, r, N = 8, 4, 64
        original = torch.randn(N, E * r)
        converted = peft_lora_B_to_scattermoe(original, E, r)
        back = scattermoe_lora_B_to_peft(converted, E, r)
        torch.testing.assert_close(back, original)

    def test_per_expert_slices(self):
        """After conversion, scattermoe slicing gives the same per-expert
        matrices as peft's reshape slicing."""
        E, r, N = 4, 2, 16
        peft_B = torch.randn(N, E * r)
        smoe_B = peft_lora_B_to_scattermoe(peft_B, E, r)

        peft_reshaped = peft_B.reshape(N, r, E)
        for e in range(E):
            torch.testing.assert_close(
                smoe_B[:, e * r : (e + 1) * r],
                peft_reshaped[:, :, e],
            )

    def test_lora_A_already_compatible(self):
        """lora_A layout is identical between peft and scattermoe."""
        E, r, K = 4, 2, 16
        lora_A = torch.randn(E * r, K)
        peft_reshaped = lora_A.reshape(E, r, K)
        for e in range(E):
            torch.testing.assert_close(
                lora_A[e * r : (e + 1) * r, :],
                peft_reshaped[e],
            )

    def test_delta_weight_equivalence(self):
        """peft's einsum delta matches per-expert B @ A with converted layouts."""
        E, r, K, N = 8, 4, 32, 64
        peft_A = torch.randn(E * r, K)
        peft_B = torch.randn(N, E * r)
        scaling = 2.0

        A_r = peft_A.reshape(E, r, K)
        B_r = peft_B.reshape(N, r, E)
        delta_peft = torch.einsum("o r e, e r i -> e i o", B_r, A_r) * scaling

        smoe_B = peft_lora_B_to_scattermoe(peft_B, E, r)
        for e in range(E):
            A_e = peft_A[e * r : (e + 1) * r, :]
            B_e = smoe_B[:, e * r : (e + 1) * r]
            delta_e = scaling * (B_e @ A_e)
            torch.testing.assert_close(delta_e, delta_peft[e].T, atol=1e-5, rtol=1e-5)

    def test_down_proj_conversion(self):
        """Verify peft_lora_to_scattermoe produces correct delta."""
        E, r = 4, 2
        hidden, inter = 32, 16
        scaling = 2.0

        peft_A = torch.randn(E * r, hidden)
        peft_B = torch.randn(inter, E * r)

        A_r = peft_A.reshape(E, r, hidden)
        B_r = peft_B.reshape(inter, r, E)
        delta_peft = torch.einsum("o r e, e r i -> e i o", B_r, A_r) * scaling

        smoe_A, smoe_B = peft_lora_to_scattermoe(peft_A, peft_B, E, r)
        for e in range(E):
            A_e = smoe_A[e * r : (e + 1) * r, :]
            B_e = smoe_B[:, e * r : (e + 1) * r]
            delta_smoe_e = scaling * (B_e @ A_e)
            torch.testing.assert_close(
                delta_smoe_e, delta_peft[e], atol=1e-5, rtol=1e-5
            )

    def test_gate_up_proj_conversion(self):
        """Verify gate_up_proj LoRA conversion with non-square dims (Qwen3-like).

        gate_up_proj param: [E, 2*inter, hidden].
        peft: in_features=2*inter, out_features=hidden.
        peft lora_A: [r*E, 2*inter], lora_B: [hidden, r*E].

        scattermoe W = param.T = [E, hidden, 2*inter], K=hidden, N=2*inter.
        scattermoe needs: lora_A [r*E, K=hidden], lora_B [N=2*inter, r*E].

        Uses non-square dims (hidden=32 != 2*inter=24) to catch A<->B swap bugs.
        """
        E, r = 4, 2
        hidden, inter = 32, 12  # 2*inter=24 != hidden=32
        scaling = 2.0

        # peft assigns: in_features=2*inter, out_features=hidden
        peft_A = torch.randn(E * r, 2 * inter)  # [r*E, in_features=2*inter]
        peft_B = torch.randn(hidden, E * r)  # [out_features=hidden, r*E]

        # peft delta via einsum: "o r e, e r i -> e i o"
        A_r = peft_A.reshape(E, r, 2 * inter)
        B_r = peft_B.reshape(hidden, r, E)
        delta_peft = torch.einsum("o r e, e r i -> e i o", B_r, A_r) * scaling
        # delta_peft[e] has shape [in_features, out_features] = [2*inter, hidden]
        # = param[e] shape [2*inter, hidden]

        smoe_A, smoe_B = peft_gate_up_lora_to_scattermoe(peft_A, peft_B, E, r)
        # smoe_A should be [r*E, K=hidden], smoe_B should be [N=2*inter, r*E]
        assert smoe_A.shape == (E * r, hidden), (
            f"Expected {(E * r, hidden)}, got {smoe_A.shape}"
        )
        assert smoe_B.shape == (2 * inter, E * r), (
            f"Expected {(2 * inter, E * r)}, got {smoe_B.shape}"
        )

        for e in range(E):
            A_e = smoe_A[e * r : (e + 1) * r, :]  # [r, K=hidden]
            B_e = smoe_B[:, e * r : (e + 1) * r]  # [N=2*inter, r]
            delta_smoe_e = scaling * (B_e @ A_e)  # [2*inter, hidden]
            # Should match peft delta which is [2*inter, hidden] = param[e]
            torch.testing.assert_close(
                delta_smoe_e, delta_peft[e], atol=1e-5, rtol=1e-5
            )


# =============================================================================
# Tests: peft weight extraction
# =============================================================================


class TestPeftLoRAWeightExtraction:
    """Test extracting peft LoRA weights for OLMoE."""

    def test_peft_creates_correct_shapes(self):
        config = make_olmoe_config(use_full=False)
        E, r = config.num_experts, 4

        model = MinimalOLMoEModel(config)
        lora_config = LoraConfig(
            r=r,
            lora_alpha=16,
            target_modules=[],
            target_parameters=[
                "gate.weight",
                "experts.gate_up_proj",
                "experts.down_proj",
            ],
            bias="none",
        )
        peft_model = get_peft_model(model, lora_config)
        trainable = {n: p for n, p in peft_model.named_parameters() if p.requires_grad}

        # Gate router
        assert trainable["base_model.model.moe.gate.lora_A.default.weight"].shape == (
            r,
            config.hidden_size,
        )
        assert trainable["base_model.model.moe.gate.lora_B.default.weight"].shape == (
            E,
            r,
        )

        # gate_up_proj [E, 2*inter, hidden]
        # peft: in_features=2*inter (dim 1), out_features=hidden (dim 2)
        assert trainable[
            "base_model.model.moe.experts.base_layer.lora_A.default.weight"
        ].shape == (E * r, 2 * config.intermediate_size)
        assert trainable[
            "base_model.model.moe.experts.base_layer.lora_B.default.weight"
        ].shape == (config.hidden_size, E * r)

        # down_proj [E, hidden, inter]
        # peft: in_features=hidden (dim 1), out_features=inter (dim 2)
        assert trainable[
            "base_model.model.moe.experts.lora_A.default.weight"
        ].shape == (E * r, config.hidden_size)
        assert trainable[
            "base_model.model.moe.experts.lora_B.default.weight"
        ].shape == (config.intermediate_size, E * r)

    @requires_cuda
    def test_peft_forward_runs(self):
        """Smoke test: peft model forward pass completes (needs CUDA for grouped_mm)."""
        config = make_olmoe_config(use_full=False)
        model = MinimalOLMoEModel(config)
        lora_config = LoraConfig(
            r=4,
            lora_alpha=16,
            target_modules=[],
            target_parameters=[
                "gate.weight",
                "experts.gate_up_proj",
                "experts.down_proj",
            ],
            bias="none",
        )
        peft_model = get_peft_model(model, lora_config)
        x = torch.randn(1, 4, config.hidden_size)
        out = peft_model(x)
        assert out.shape == x.shape

    @pytest.mark.skipif(
        not HAS_SCATTERMOE, reason="scattermoe_lora not importable (no triton)"
    )
    def test_unwrap_experts_lora(self):
        """Test that _unwrap_experts_lora correctly detects LoRA wrappers."""
        config = make_olmoe_config(use_full=False)
        model = MinimalOLMoEModel(config)
        lora_config = LoraConfig(
            r=4,
            lora_alpha=16,
            target_modules=[],
            target_parameters=["experts.gate_up_proj", "experts.down_proj"],
            bias="none",
        )
        peft_model = get_peft_model(model, lora_config)
        base_moe = peft_model.base_model.model.moe

        # Experts should be wrapped by ParamWrapper
        experts, gup_lora, down_lora = _unwrap_experts_lora(base_moe.experts)

        # Base experts should have the raw parameters
        assert hasattr(experts, "gate_up_proj")
        assert hasattr(experts, "down_proj")

        # LoRA should be detected
        assert gup_lora is not None, "gate_up_proj LoRA not detected"
        assert down_lora is not None, "down_proj LoRA not detected"

        # Check shapes (after peft->scattermoe conversion with A<->B swap)
        # gate_up_proj W = param.T = [E, hidden, 2*inter], K=hidden, N=2*inter
        E, r = config.num_experts, 4
        gup_A, gup_B, gup_s = gup_lora
        assert gup_A.shape == (E * r, config.hidden_size), (
            f"gate_up_proj smoe_A: expected [r*E, K=hidden]={(E * r, config.hidden_size)}, "
            f"got {gup_A.shape}"
        )
        assert gup_B.shape == (2 * config.intermediate_size, E * r), (
            f"gate_up_proj smoe_B: expected [N=2*inter, r*E]="
            f"{(2 * config.intermediate_size, E * r)}, got {gup_B.shape}"
        )

        # down_proj W = param.T = [E, inter, hidden], K=inter, N=hidden
        down_A, down_B, down_s = down_lora
        assert down_A.shape == (E * r, config.intermediate_size), (
            f"down_proj smoe_A: expected [r*E, K=inter]={(E * r, config.intermediate_size)}, "
            f"got {down_A.shape}"
        )
        assert down_B.shape == (config.hidden_size, E * r), (
            f"down_proj smoe_B: expected [N=hidden, r*E]={(config.hidden_size, E * r)}, "
            f"got {down_B.shape}"
        )

    def test_unwrap_no_lora(self):
        """Without peft, _unwrap_experts_lora returns no LoRA."""
        config = make_olmoe_config(use_full=False)
        moe = OlmoeSparseMoeBlock(config)
        experts, gup_lora, down_lora = _unwrap_experts_lora(moe.experts)
        assert gup_lora is None
        assert down_lora is None
        assert hasattr(experts, "gate_up_proj")

    def test_unwrap_gate_lora(self):
        """Test that _unwrap_gate_lora detects LoRA on the router gate."""
        config = make_olmoe_config(use_full=False)
        model = MinimalOLMoEModel(config)
        r = 4
        lora_config = LoraConfig(
            r=r,
            lora_alpha=16,
            target_modules=[],
            target_parameters=["gate.weight"],
            bias="none",
        )
        peft_model = get_peft_model(model, lora_config)
        base_moe = peft_model.base_model.model.moe

        # Set non-zero LoRA weights (peft initializes lora_B to zeros)
        with torch.no_grad():
            base_moe.gate.lora_B["default"].weight.normal_(0, 0.01)

        base_gate, gate_weight, gate_delta = _unwrap_gate_lora(base_moe.gate)

        # Base gate should be the original router
        assert hasattr(base_gate, "top_k")
        assert hasattr(base_gate, "num_experts")
        assert base_gate.top_k == config.num_experts_per_tok
        assert base_gate.num_experts == config.num_experts

        # Gate weight should be the base weight (delta returned separately)
        assert gate_weight.shape == (config.num_experts, config.hidden_size)
        torch.testing.assert_close(gate_weight, base_gate.weight)

        # Delta should be non-zero (LoRA was applied)
        assert gate_delta is not None
        assert gate_delta.shape == (config.num_experts, config.hidden_size)
        assert gate_delta.abs().max() > 0, "Gate LoRA delta should be non-zero"

    def test_unwrap_gate_no_lora(self):
        """Without peft, _unwrap_gate_lora returns the original gate."""
        config = make_olmoe_config(use_full=False)
        moe = OlmoeSparseMoeBlock(config)
        base_gate, gate_weight, gate_delta = _unwrap_gate_lora(moe.gate)
        assert base_gate is moe.gate
        torch.testing.assert_close(gate_weight, moe.gate.weight)
        assert gate_delta is None

    def test_gate_lora_delta_matches_peft(self):
        """Verify _unwrap_gate_lora computes the same delta as peft."""
        config = make_olmoe_config(use_full=False)
        model = MinimalOLMoEModel(config)
        r = 4
        lora_alpha = 16
        scaling = lora_alpha / r
        lora_config = LoraConfig(
            r=r,
            lora_alpha=lora_alpha,
            target_modules=[],
            target_parameters=["gate.weight"],
            bias="none",
        )
        peft_model = get_peft_model(model, lora_config)
        base_moe = peft_model.base_model.model.moe

        # Our unwrapped weight + delta
        _, gate_weight, gate_delta = _unwrap_gate_lora(base_moe.gate)

        # Manually compute expected delta
        lora_A = base_moe.gate.lora_A["default"].weight  # [r, hidden]
        lora_B = base_moe.gate.lora_B["default"].weight  # [E, r]
        base_weight = base_moe.gate.base_layer.weight  # [E, hidden]
        expected_delta = scaling * (lora_B @ lora_A)

        torch.testing.assert_close(gate_weight, base_weight)
        torch.testing.assert_close(gate_delta, expected_delta)
        # Combined should match the old behavior
        torch.testing.assert_close(
            gate_weight + gate_delta, base_weight + expected_delta
        )


# =============================================================================
# Tests: Base forward equivalence (no LoRA)
# =============================================================================


@requires_cuda
class TestOLMoEReferenceVsScatterMoE:
    """Base forward equivalence: per-expert reference vs ScatterMoE kernels."""

    def test_small(self):
        self._run(use_full=False, M=16)

    @pytest.mark.slow
    def test_full(self):
        self._run(use_full=True, M=32)

    def _run(self, use_full, M):
        from axolotl.integrations.kernels.libs.scattermoe_lora import (
            flatten_sort_count,
            parallel_linear,
        )

        config = make_olmoe_config(use_full=use_full)
        torch.manual_seed(42)
        moe = _init_expert_weights(OlmoeSparseMoeBlock(config)).cuda().float()
        E, k = config.num_experts, config.num_experts_per_tok

        x = torch.randn(1, M, config.hidden_size, device="cuda")
        x_flat = x.view(-1, config.hidden_size)

        with torch.no_grad():
            # Shared routing for both paths
            _, rw, sel = moe.gate(x_flat)
            sei, ssi, eo = flatten_sort_count(sel, num_experts=E)

            # Per-expert reference
            ref_out = _reference_moe_forward(
                x_flat,
                moe.experts.gate_up_proj,
                moe.experts.down_proj,
                moe.experts.act_fn,
                sel,
                rw,
                E,
            ).view(1, M, config.hidden_size)

            # ScatterMoE kernel path
            gup = parallel_linear(
                x_flat,
                moe.experts.gate_up_proj.transpose(2, 1),
                k,
                sei,
                ssi,
                eo,
                grouped_in=False,
                grouped_out=True,
            )
            g, u = gup.chunk(2, dim=-1)
            h = moe.experts.act_fn(g) * u

            smoe_out = parallel_linear(
                h,
                moe.experts.down_proj.transpose(2, 1),
                1,
                sei,
                ssi,
                eo,
                grouped_in=True,
                grouped_out=False,
                gates=rw,
            ).view(1, M, config.hidden_size)

        torch.testing.assert_close(smoe_out, ref_out, atol=1e-3, rtol=1e-3)


# =============================================================================
# Tests: LoRA forward equivalence (peft vs scattermoe fused)
# =============================================================================


@requires_cuda
class TestOLMoEPeftLoRAForward:
    """Fused LoRA forward: peft merged-weight vs scattermoe_lora kernel."""

    def test_small(self):
        self._run(use_full=False, M=16, r=4)

    @pytest.mark.slow
    def test_full(self):
        self._run(use_full=True, M=32, r=8)

    def _run(self, use_full, M, r):
        from axolotl.integrations.kernels.libs.scattermoe_lora import (
            flatten_sort_count,
            parallel_linear_lora,
        )

        config = make_olmoe_config(use_full=use_full)
        E, k = config.num_experts, config.num_experts_per_tok
        lora_alpha = 16
        scaling = lora_alpha / r

        # Create peft model
        model = MinimalOLMoEModel(config).cuda().float()
        lora_config = LoraConfig(
            r=r,
            lora_alpha=lora_alpha,
            target_modules=[],
            target_parameters=["experts.gate_up_proj", "experts.down_proj"],
            bias="none",
        )
        peft_model = get_peft_model(model, lora_config)

        torch.manual_seed(42)
        x = torch.randn(1, M, config.hidden_size, device="cuda")

        # peft forward
        with torch.no_grad():
            peft_out = peft_model(x)

        # Extract base weights and LoRA weights
        base_moe = peft_model.base_model.model.moe
        base_experts = base_moe.experts.base_layer.base_layer
        gate_up_proj = base_experts.gate_up_proj
        down_proj = base_experts.down_proj
        act_fn = base_experts.act_fn

        # gate_up_proj LoRA
        gup_w = base_moe.experts.base_layer
        peft_gup_A = gup_w.lora_A["default"].weight.detach()
        peft_gup_B = gup_w.lora_B["default"].weight.detach()
        smoe_gup_A, smoe_gup_B = peft_gate_up_lora_to_scattermoe(
            peft_gup_A, peft_gup_B, E, r
        )

        # down_proj LoRA
        down_w = base_moe.experts
        peft_down_A = down_w.lora_A["default"].weight.detach()
        peft_down_B = down_w.lora_B["default"].weight.detach()
        smoe_down_A, smoe_down_B = peft_lora_to_scattermoe(
            peft_down_A, peft_down_B, E, r
        )

        # ScatterMoE fused forward -- gate is NOT peft-wrapped, access directly
        x_flat = x.view(-1, config.hidden_size)

        with torch.no_grad():
            _, rw, sel = base_moe.gate(x_flat)
            sei, ssi, eo = flatten_sort_count(sel, num_experts=E)

            gup = parallel_linear_lora(
                x_flat,
                gate_up_proj.transpose(2, 1),
                k,
                sei,
                ssi,
                eo,
                lora_A=smoe_gup_A,
                lora_B=smoe_gup_B,
                scaling=scaling,
                grouped_in=False,
                grouped_out=True,
            )
            g, u = gup.chunk(2, dim=-1)
            h = act_fn(g) * u

            smoe_out = parallel_linear_lora(
                h,
                down_proj.transpose(2, 1),
                1,
                sei,
                ssi,
                eo,
                lora_A=smoe_down_A,
                lora_B=smoe_down_B,
                scaling=scaling,
                grouped_in=True,
                grouped_out=False,
                gates=rw,
            ).view(1, M, config.hidden_size)

        torch.testing.assert_close(smoe_out, peft_out, atol=5e-3, rtol=5e-3)


# =============================================================================
# Tests: Backward gradient correctness
# =============================================================================


@requires_cuda
class TestOLMoEPeftLoRABackward:
    """Backward gradients through scattermoe_lora vs pure-PyTorch reference."""

    def test_small(self):
        self._run(use_full=False, M=16, r=4)

    def _run(self, use_full, M, r):
        from axolotl.integrations.kernels.libs.scattermoe_lora import (
            flatten_sort_count,
            parallel_linear_lora,
        )

        config = make_olmoe_config(use_full=use_full)
        E, k = config.num_experts, config.num_experts_per_tok
        lora_alpha = 16
        scaling = lora_alpha / r

        torch.manual_seed(42)
        moe = _init_expert_weights(OlmoeSparseMoeBlock(config)).cuda().float()
        x = torch.randn(1, M, config.hidden_size, device="cuda")
        x_flat = x.view(-1, config.hidden_size)
        gate_up_proj = moe.experts.gate_up_proj
        down_proj = moe.experts.down_proj

        # Create LoRA weights in scattermoe layout directly
        gup_A = torch.randn(r * E, config.hidden_size, device="cuda") * 0.01
        gup_B = torch.randn(2 * config.intermediate_size, r * E, device="cuda") * 0.01
        down_A = torch.randn(r * E, config.intermediate_size, device="cuda") * 0.01
        down_B = torch.randn(config.hidden_size, r * E, device="cuda") * 0.01

        rw, sel = _get_routing(moe, x)
        sei, ssi, eo = flatten_sort_count(sel, num_experts=E)

        # --- Reference ---
        gup_delta = _compute_delta_from_scattermoe_lora(
            gup_A, gup_B, scaling, E, r, gate_up_proj.shape
        )
        down_delta = _compute_delta_from_scattermoe_lora(
            down_A, down_B, scaling, E, r, down_proj.shape
        )

        x_ref = x_flat.clone().detach().requires_grad_(True)
        ref_out = _reference_moe_forward_with_lora(
            x_ref,
            gate_up_proj,
            down_proj,
            moe.experts.act_fn,
            sel,
            rw,
            E,
            gup_delta,
            down_delta,
        )
        ref_out.sum().backward()

        # --- ScatterMoE fused path ---
        x_smoe = x_flat.clone().detach().requires_grad_(True)
        gup_A_s = gup_A.clone().requires_grad_(True)
        gup_B_s = gup_B.clone().requires_grad_(True)
        down_A_s = down_A.clone().requires_grad_(True)
        down_B_s = down_B.clone().requires_grad_(True)

        gup_out = parallel_linear_lora(
            x_smoe,
            gate_up_proj.transpose(2, 1),
            k,
            sei,
            ssi,
            eo,
            lora_A=gup_A_s,
            lora_B=gup_B_s,
            scaling=scaling,
            grouped_in=False,
            grouped_out=True,
        )
        g, u = gup_out.chunk(2, dim=-1)
        h = moe.experts.act_fn(g) * u

        smoe_out = parallel_linear_lora(
            h,
            down_proj.transpose(2, 1),
            1,
            sei,
            ssi,
            eo,
            lora_A=down_A_s,
            lora_B=down_B_s,
            scaling=scaling,
            grouped_in=True,
            grouped_out=False,
            gates=rw,
        )
        smoe_out.sum().backward()

        torch.testing.assert_close(
            smoe_out.detach(),
            ref_out.detach(),
            atol=5e-3,
            rtol=5e-3,
        )
        torch.testing.assert_close(
            x_smoe.grad,
            x_ref.grad,
            atol=5e-2,
            rtol=5e-2,
        )


# =============================================================================
# Tests: kernelize() integration via LocalLayerRepository
# =============================================================================


@requires_cuda
class TestKernelizeIntegration:
    """Test the HF kernels library integration with LocalLayerRepository."""

    @staticmethod
    def _get_kernelize_imports():
        """Import kernels library components, skip if not available."""
        try:
            from kernels import (
                LocalLayerRepository,
                Mode,
                kernelize,
                register_kernel_mapping,
                replace_kernel_forward_from_hub,
            )

            return (
                LocalLayerRepository,
                Mode,
                register_kernel_mapping,
                replace_kernel_forward_from_hub,
                kernelize,
            )
        except ImportError:
            pytest.skip("kernels library not installed")

    @staticmethod
    def _get_repo_path():
        """Get the path to scattermoe_lora within axolotl's plugin."""
        return (
            Path(__file__).parent.parent.parent
            / "src"
            / "axolotl"
            / "integrations"
            / "kernels"
            / "libs"
            / "scattermoe_lora"
        )

    def _setup_kernels(
        self,
        LocalLayerRepository,
        Mode,
        register_kernel_mapping,
        replace_kernel_forward_from_hub,
    ):
        """Register kernel mapping for tests."""
        repo_path = self._get_repo_path()
        local_repo = LocalLayerRepository(
            repo_path=repo_path,
            package_name="scattermoe_lora",
            layer_name="HFScatterMoEGatedMLP",
        )

        replace_kernel_forward_from_hub(
            OlmoeSparseMoeBlock, "HFScatterMoEParallelExperts"
        )
        register_kernel_mapping(
            {
                "HFScatterMoEParallelExperts": {
                    "cuda": {
                        Mode.TRAINING: local_repo,
                        Mode.INFERENCE: local_repo,
                    },
                }
            }
        )

    def test_base_forward_via_kernelize(self):
        """Kernelized OlmoeSparseMoeBlock (no LoRA) matches per-expert reference."""
        (
            LocalLayerRepository,
            Mode,
            register_kernel_mapping,
            replace_kernel_forward_from_hub,
            kernelize,
        ) = self._get_kernelize_imports()

        config = make_olmoe_config(use_full=False)
        E = config.num_experts

        # Create model
        torch.manual_seed(42)
        moe = _init_expert_weights(OlmoeSparseMoeBlock(config)).cuda().float()
        x = torch.randn(1, 8, config.hidden_size, device="cuda")
        x_flat = x.view(-1, config.hidden_size)

        # Compute reference BEFORE kernelizing
        with torch.no_grad():
            _, rw, sel = moe.gate(x_flat)
            ref_out = _reference_moe_forward(
                x_flat,
                moe.experts.gate_up_proj,
                moe.experts.down_proj,
                moe.experts.act_fn,
                sel,
                rw,
                E,
            ).view(1, 8, config.hidden_size)

        # Set up kernel mapping
        self._setup_kernels(
            LocalLayerRepository,
            Mode,
            register_kernel_mapping,
            replace_kernel_forward_from_hub,
        )

        # Kernelize the model
        kernelize(moe, mode=Mode.TRAINING, device="cuda")

        # Forward through kernelized model
        with torch.no_grad():
            kern_out = moe(x)

        torch.testing.assert_close(kern_out, ref_out, atol=1e-3, rtol=1e-3)

    def test_lora_forward_via_kernelize(self):
        """Kernelized OlmoeSparseMoeBlock with peft LoRA matches reference."""
        (
            LocalLayerRepository,
            Mode,
            register_kernel_mapping,
            replace_kernel_forward_from_hub,
            kernelize,
        ) = self._get_kernelize_imports()

        config = make_olmoe_config(use_full=False)
        r = 4

        # Create peft model
        torch.manual_seed(42)
        model = MinimalOLMoEModel(config).cuda().float()
        lora_config = LoraConfig(
            r=r,
            lora_alpha=16,
            target_modules=[],
            target_parameters=["experts.gate_up_proj", "experts.down_proj"],
            bias="none",
        )
        peft_model = get_peft_model(model, lora_config)

        x = torch.randn(1, 8, config.hidden_size, device="cuda")

        # Reference: peft's own forward (uses _activate_lora context manager)
        with torch.no_grad():
            ref_out = peft_model(x)

        # Set up kernel mapping
        self._setup_kernels(
            LocalLayerRepository,
            Mode,
            register_kernel_mapping,
            replace_kernel_forward_from_hub,
        )

        # Kernelize the MoE block inside the peft model
        base_moe = peft_model.base_model.model.moe
        kernelize(base_moe, mode=Mode.TRAINING, device="cuda")

        # Forward through kernelized peft model
        with torch.no_grad():
            kern_out = peft_model(x)

        torch.testing.assert_close(kern_out, ref_out, atol=5e-3, rtol=5e-3)

    def test_gate_lora_forward_via_kernelize(self):
        """Kernelized forward with gate LoRA matches peft reference."""
        (
            LocalLayerRepository,
            Mode,
            register_kernel_mapping,
            replace_kernel_forward_from_hub,
            kernelize,
        ) = self._get_kernelize_imports()

        config = make_olmoe_config(use_full=False)
        r = 4

        # Create peft model with gate + experts LoRA
        torch.manual_seed(42)
        model = MinimalOLMoEModel(config).cuda().float()
        lora_config = LoraConfig(
            r=r,
            lora_alpha=16,
            target_modules=[],
            target_parameters=[
                "gate.weight",
                "experts.gate_up_proj",
                "experts.down_proj",
            ],
            bias="none",
        )
        peft_model = get_peft_model(model, lora_config)

        x = torch.randn(1, 8, config.hidden_size, device="cuda")

        # Reference: peft's own forward
        with torch.no_grad():
            ref_out = peft_model(x)

        # Set up kernel mapping
        self._setup_kernels(
            LocalLayerRepository,
            Mode,
            register_kernel_mapping,
            replace_kernel_forward_from_hub,
        )

        # Kernelize the MoE block inside the peft model
        base_moe = peft_model.base_model.model.moe
        kernelize(base_moe, mode=Mode.TRAINING, device="cuda")

        # Forward through kernelized peft model
        with torch.no_grad():
            kern_out = peft_model(x)

        torch.testing.assert_close(kern_out, ref_out, atol=5e-3, rtol=5e-3)


# =============================================================================
# Tests: Shared expert handling
# =============================================================================


class TestSharedExpertHandling:
    """Test that HFScatterMoEGatedMLP.forward handles shared experts."""

    @staticmethod
    def _make_shared_expert_block(config):
        """Create an OlmoeSparseMoeBlock with a mock shared expert attached."""
        moe = OlmoeSparseMoeBlock(config)
        _init_expert_weights(moe)

        hidden = config.hidden_size
        inter = config.intermediate_size

        # Attach a simple shared expert MLP (mimics Qwen2MoE structure)
        class SharedExpertMLP(nn.Module):
            def __init__(self, hidden_size, intermediate_size):
                super().__init__()
                self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
                self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
                self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
                self.act_fn = nn.SiLU()

            def forward(self, x):
                return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))

        moe.shared_expert = SharedExpertMLP(hidden, inter)
        moe.shared_expert_gate = nn.Linear(hidden, 1, bias=False)

        return moe

    def test_shared_expert_is_used(self):
        """Verify shared expert output affects final result."""
        config = make_olmoe_config(use_full=False)
        moe = self._make_shared_expert_block(config)

        # Compute reference without shared expert
        torch.manual_seed(42)
        x = torch.randn(1, 4, config.hidden_size)
        x_flat = x.view(-1, config.hidden_size)

        with torch.no_grad():
            # Shared expert contribution
            shared_out = moe.shared_expert(x_flat)
            gate_val = F.sigmoid(moe.shared_expert_gate(x_flat))
            shared_contribution = shared_out * gate_val

        # Verify shared expert produces non-zero output
        assert shared_contribution.abs().max() > 0

    @requires_cuda
    def test_shared_expert_forward_via_kernelize(self):
        """Kernelized forward with shared expert matches manual reference."""
        try:
            from kernels import (
                LocalLayerRepository,
                Mode,
                kernelize,
                register_kernel_mapping,
                replace_kernel_forward_from_hub,
            )
        except ImportError:
            pytest.skip("kernels library not installed")

        config = make_olmoe_config(use_full=False)
        E = config.num_experts

        torch.manual_seed(42)
        moe = self._make_shared_expert_block(config).cuda().float()
        x = torch.randn(1, 8, config.hidden_size, device="cuda")
        x_flat = x.view(-1, config.hidden_size)

        # Compute reference: per-expert + shared expert
        with torch.no_grad():
            _, rw, sel = moe.gate(x_flat)

            expert_out = _reference_moe_forward(
                x_flat,
                moe.experts.gate_up_proj,
                moe.experts.down_proj,
                moe.experts.act_fn,
                sel,
                rw,
                E,
            )
            shared_out = moe.shared_expert(x_flat)
            gate_val = F.sigmoid(moe.shared_expert_gate(x_flat))
            ref_out = (expert_out + shared_out * gate_val).view(
                1, 8, config.hidden_size
            )

        # Kernelize
        repo_path = (
            Path(__file__).parent.parent.parent
            / "src"
            / "axolotl"
            / "integrations"
            / "kernels"
            / "libs"
            / "scattermoe_lora"
        )
        local_repo = LocalLayerRepository(
            repo_path=repo_path,
            package_name="scattermoe_lora",
            layer_name="HFScatterMoEGatedMLP",
        )

        replace_kernel_forward_from_hub(
            OlmoeSparseMoeBlock, "HFScatterMoEParallelExperts"
        )
        register_kernel_mapping(
            {
                "HFScatterMoEParallelExperts": {
                    "cuda": {
                        Mode.TRAINING: local_repo,
                        Mode.INFERENCE: local_repo,
                    },
                }
            }
        )

        kernelize(moe, mode=Mode.TRAINING, device="cuda")

        with torch.no_grad():
            kern_out = moe(x)

        torch.testing.assert_close(kern_out, ref_out, atol=1e-3, rtol=1e-3)


================================================
FILE: tests/e2e/integrations/test_sonicmoe.py
================================================
"""
End-to-end gradient and convergence tests for SonicMoE integration.

Requires:
    - H100/H200 GPU (SonicMoE CUTLASS kernels target sm_90)
    - sonicmoe package installed
    - transformers with Qwen3MoE support

Usage:
    pytest tests/e2e/integrations/test_sonicmoe.py -v -s
"""

import importlib.util
import math

import pytest
import torch

_sonicmoe_available = importlib.util.find_spec("sonicmoe") is not None
_is_hopper = torch.cuda.is_available() and torch.cuda.get_device_capability() == (9, 0)

pytestmark = [
    pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA GPU"),
    pytest.mark.skipif(
        not _is_hopper, reason="SonicMoE CUTLASS kernels require Hopper (sm_90)"
    ),
    pytest.mark.skipif(not _sonicmoe_available, reason="SonicMoE not installed"),
]


def _create_tiny_qwen3_config():
    """Create a minimal Qwen3MoE config for fast testing."""
    from transformers import AutoConfig

    config = AutoConfig.for_model("qwen3_moe")
    config.hidden_size = 512
    config.intermediate_size = 1024
    config.moe_intermediate_size = 64
    config.num_attention_heads = 16
    config.num_key_value_heads = 2
    config.head_dim = 32
    config.num_hidden_layers = 2
    config.num_experts = 8
    config.num_experts_per_tok = 2
    config.vocab_size = 1000
    config.max_position_embeddings = 128
    config.norm_topk_prob = True
    config.torch_dtype = torch.bfloat16
    return config


def _interleave_gate_up_weights(model):
    """Interleave all gate_up_proj parameters in-place for SonicMoE."""
    from axolotl.integrations.kernels.sonicmoe.weight_converter import (
        interleave_gate_up,
    )

    with torch.no_grad():
        for name, param in model.named_parameters():
            if "gate_up_proj" in name:
                param.copy_(interleave_gate_up(param))


def _unpatch_sonicmoe():
    """Restore original forward on the MoE block class if it was patched."""
    from axolotl.integrations.kernels.constants import resolve_moe_block_classes

    for moe_cls in resolve_moe_block_classes("qwen3_moe"):
        if hasattr(moe_cls, "_original_forward"):
            moe_cls.forward = moe_cls._original_forward
            del moe_cls._original_forward


class TestSonicMoEForwardCorrectness:
    """Verify SonicMoE-patched model produces same output as original."""

    def teardown_method(self):
        _unpatch_sonicmoe()

    def test_forward_output_matches(self):
        from transformers import AutoModelForCausalLM

        from axolotl.integrations.kernels.sonicmoe.patch import patch_sonicmoe

        config = _create_tiny_qwen3_config()
        input_ids = torch.randint(0, config.vocab_size, (1, 16), device="cuda")

        # Original model
        model_orig = AutoModelForCausalLM.from_config(config).cuda().bfloat16()

        with torch.no_grad():
            out_orig = model_orig(input_ids)

        # Patched model (same weights, interleaved for SonicMoE)
        model_patched = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
        model_patched.load_state_dict(model_orig.state_dict())

        patch_sonicmoe("qwen3_moe")
        _interleave_gate_up_weights(model_patched)

        with torch.no_grad():
            out_patched = model_patched(input_ids)

        max_diff = (out_orig.logits - out_patched.logits).abs().max().item()
        assert torch.allclose(
            out_orig.logits, out_patched.logits, atol=1e-1, rtol=1e-1
        ), f"Output mismatch: max diff={max_diff:.6f}"


class TestSonicMoEGradientCorrectness:
    """Compare gradients between original HuggingFace and SonicMoE-patched forward."""

    def teardown_method(self):
        _unpatch_sonicmoe()

    def test_gradients_match(self):
        """Verify all parameter gradients match between original and patched."""
        from transformers import AutoModelForCausalLM

        from axolotl.integrations.kernels.sonicmoe.patch import patch_sonicmoe
        from axolotl.integrations.kernels.sonicmoe.weight_converter import (
            deinterleave_gate_up,
        )

        config = _create_tiny_qwen3_config()
        input_ids = torch.randint(0, config.vocab_size, (1, 16), device="cuda")

        # ---------- Original model ----------
        model_orig = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
        out_orig = model_orig(input_ids, labels=input_ids)
        out_orig.loss.backward()
        grads_orig = {
            n: p.grad.float().clone()
            for n, p in model_orig.named_parameters()
            if p.grad is not None
        }
        loss_orig = out_orig.loss.item()

        # ---------- SonicMoE-patched model (same weights, interleaved) ----------
        model_patched = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
        model_patched.load_state_dict(model_orig.state_dict())

        patch_sonicmoe("qwen3_moe")
        _interleave_gate_up_weights(model_patched)

        out_patched = model_patched(input_ids, labels=input_ids)
        out_patched.loss.backward()
        grads_patched = {}
        for n, p in model_patched.named_parameters():
            if p.grad is None:
                continue
            g = p.grad.float().clone()
            # gate_up_proj grads are in interleaved layout, de-interleave to match orig
            if "gate_up_proj" in n:
                g = deinterleave_gate_up(g)
            grads_patched[n] = g
        loss_patched = out_patched.loss.item()

        # ---------- Compare ----------
        assert abs(loss_orig - loss_patched) < 0.5, (
            f"Loss mismatch: orig={loss_orig:.4f}, patched={loss_patched:.4f}"
        )

        # All parameters with gradients in original should have them in patched
        missing = set(grads_orig.keys()) - set(grads_patched.keys())
        assert not missing, f"Missing gradients in patched model: {missing}"

        # Compare gradient values
        # bf16 with different GEMM impls (cuBLAS vs CUTLASS) can diverge,
        # so use generous tolerance: flag only if both rel >10% AND abs >1e-2
        mismatches = []
        for name in grads_orig:
            if name not in grads_patched:
                continue
            g_orig = grads_orig[name]
            g_patched = grads_patched[name]
            max_diff = (g_orig - g_patched).abs().max().item()
            rel_diff = max_diff / (g_orig.abs().max().item() + 1e-8)

            if rel_diff > 0.1 and max_diff > 1e-2:
                mismatches.append(
                    f"  {name}: max_abs_diff={max_diff:.6f}, rel_diff={rel_diff:.4f}"
                )

        assert not mismatches, (
            "Gradient mismatches (rel_diff > 10% and abs_diff > 1e-2):\n"
            + "\n".join(mismatches)
        )

    def test_router_weights_receive_gradients(self):
        """Verify that router (gate) weights get non-zero gradients."""
        from transformers import AutoModelForCausalLM

        from axolotl.integrations.kernels.sonicmoe.patch import patch_sonicmoe

        config = _create_tiny_qwen3_config()
        input_ids = torch.randint(0, config.vocab_size, (1, 16), device="cuda")

        model = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
        patch_sonicmoe("qwen3_moe")
        _interleave_gate_up_weights(model)

        out = model(input_ids, labels=input_ids)
        out.loss.backward()

        gate_grads_found = False
        for name, param in model.named_parameters():
            if "gate" in name and "weight" in name:
                gate_grads_found = True
                assert param.grad is not None, f"No gradient for router: {name}"
                assert param.grad.abs().max() > 0, f"Zero gradient for router: {name}"

        assert gate_grads_found, "No gate.weight parameters found in model"


class TestSonicMoETrainingConvergence:
    """Verify loss decreases during training with SonicMoE."""

    def teardown_method(self):
        _unpatch_sonicmoe()

    def test_loss_decreases(self):
        """Run 30 training steps, verify loss decreases and no NaN/Inf."""
        from transformers import AutoModelForCausalLM

        from axolotl.integrations.kernels.sonicmoe.patch import patch_sonicmoe

        config = _create_tiny_qwen3_config()
        input_ids = torch.randint(0, config.vocab_size, (2, 32), device="cuda")

        model = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
        patch_sonicmoe("qwen3_moe")
        _interleave_gate_up_weights(model)

        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
        losses = []

        for step in range(30):
            out = model(input_ids, labels=input_ids)
            loss = out.loss
            assert not math.isnan(loss.item()), f"NaN loss at step {step}"
            assert not math.isinf(loss.item()), f"Inf loss at step {step}"
            losses.append(loss.item())

            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        assert losses[-1] < losses[0], (
            f"Loss did not decrease: first={losses[0]:.4f}, last={losses[-1]:.4f}"
        )

    def test_expert_weights_update(self):
        """Verify expert weights change during training (not frozen)."""
        from transformers import AutoModelForCausalLM

        from axolotl.integrations.kernels.sonicmoe.patch import patch_sonicmoe

        config = _create_tiny_qwen3_config()
        input_ids = torch.randint(0, config.vocab_size, (2, 32), device="cuda")

        model = AutoModelForCausalLM.from_config(config).cuda().bfloat16()
        patch_sonicmoe("qwen3_moe")
        _interleave_gate_up_weights(model)

        # Snapshot expert weights before training
        expert_weights_before = {}
        for name, param in model.named_parameters():
            if "experts" in name:
                expert_weights_before[name] = param.data.clone()

        assert expert_weights_before, "No expert parameters found"

        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
        for _ in range(5):
            out = model(input_ids, labels=input_ids)
            out.loss.backward()
            optimizer.step()
            optimizer.zero_grad()

        # Check that expert weights changed
        changed = 0
        for name, param in model.named_parameters():
            if name in expert_weights_before:
                if not torch.equal(param.data, expert_weights_before[name]):
                    changed += 1

        assert changed > 0, "No expert weights changed after 5 training steps"


================================================
FILE: tests/e2e/kernels/test_geglu.py
================================================
"""Tests for GEGLU activation function Triton kernels."""

import pytest
import torch
import torch.nn.functional as F

from axolotl.kernels.geglu import geglu_backward, geglu_forward


def test_geglu_forward_shape():
    """Test that GEGLU forward pass preserves expected shapes."""
    batch, seq_len, hidden_dim = 2, 3, 64
    gate = torch.randn(batch, seq_len, hidden_dim, device="cuda")
    up = torch.randn(batch, seq_len, hidden_dim, device="cuda")

    out = geglu_forward(gate, up)
    assert out.shape == (batch, seq_len, hidden_dim)
    assert out.dtype == gate.dtype
    assert out.device == gate.device


@pytest.mark.flaky(retries=1, delay=5)
@pytest.mark.parametrize(
    "torch_seed",
    [0, 42],
)
def test_geglu_forward_values(torch_seed):
    """Test GEGLU forward pass matches PyTorch reference implementation."""
    torch.manual_seed(torch_seed)

    gate = torch.randn(2, 3, 64, device="cuda")
    up = torch.randn(2, 3, 64, device="cuda")

    # Custom implementation
    triton_out = geglu_forward(gate.clone(), up.clone())

    # PyTorch reference
    torch_out = F.gelu(gate) * up

    assert torch.allclose(triton_out, torch_out, rtol=1e-3)


@pytest.mark.flaky(retries=1, delay=5)
@pytest.mark.parametrize(
    "torch_seed",
    [0, 42],
)
def test_geglu_backward(torch_seed):
    """Test GEGLU backward pass matches PyTorch autograd."""
    torch.manual_seed(torch_seed)

    gate = torch.randn(2, 3, 64, device="cuda", requires_grad=True)
    up = torch.randn(2, 3, 64, device="cuda", requires_grad=True)
    grad_output = torch.randn(2, 3, 64, device="cuda")

    # PyTorch reference - compute intermediates
    gelu_gate = F.gelu(gate)
    torch_out = gelu_gate * up
    torch_out.backward(grad_output)

    # Custom backward pass
    gate_clone = gate.clone().detach()
    up_clone = up.clone().detach()
    grad_output_clone = grad_output.clone()

    h, grad_gate, grad_up = geglu_backward(grad_output_clone, gate_clone, up_clone)

    # Compare outputs and gradients
    assert torch.allclose(h, torch_out, rtol=1e-3)
    assert torch.allclose(grad_gate, gate.grad, rtol=1e-3)
    assert torch.allclose(grad_up, up.grad, rtol=1e-3)


def test_geglu_inplace_preservation():
    """Test that GEGLU backward doesn't modify original tensors unexpectedly."""
    gate = torch.randn(2, 3, 64, device="cuda")
    up = torch.randn(2, 3, 64, device="cuda")
    grad_output = torch.randn(2, 3, 64, device="cuda")

    gate_copy = gate.clone()
    up_copy = up.clone()
    grad_copy = grad_output.clone()

    geglu_backward(grad_output, gate, up)

    assert not torch.equal(gate, gate_copy), "Gate should be modified in-place"
    assert not torch.equal(up, up_copy), "Up should be modified in-place"
    assert not torch.equal(grad_output, grad_copy), (
        "Grad output should be modified in-place"
    )


================================================
FILE: tests/e2e/kernels/test_lora.py
================================================
"""Tests for LoRA custom autograd."""

import pytest
import torch
from bitsandbytes.functional import QuantState
from torch import nn

from axolotl.kernels.geglu import geglu_backward, geglu_forward
from axolotl.kernels.lora import (
    LoRA_MLP,
    LoRA_O,
    LoRA_QKV,
    apply_lora_mlp_geglu,
    apply_lora_mlp_swiglu,
    get_lora_parameters,
    matmul_lora,
)
from axolotl.kernels.swiglu import swiglu_backward, swiglu_forward


@pytest.fixture
def mock_quantstate():
    """Creates a mock QuantState for testing"""
    shape = (64, 64)
    n_blocks = shape[0]  # Assuming blockwise quantization along first dimension

    # Create nested state first
    nested_state = QuantState(
        absmax=torch.ones(n_blocks, device="cuda"),  # One value per block
        shape=shape,
        code=torch.randint(0, 15, shape, device="cuda"),  # NF4 range is 0-15
        dtype=torch.float16,
        blocksize=64,
        quant_type="nf4",
        offset=None,
        state2=None,
    )

    # Create main state with nested state
    return QuantState(
        absmax=torch.ones(n_blocks, device="cuda"),
        shape=shape,
        code=torch.randint(0, 15, shape, device="cuda"),
        dtype=torch.float16,
        blocksize=64,
        quant_type="nf4",
        offset=torch.zeros(n_blocks, dtype=torch.int32, device="cuda"),
        state2=nested_state,
    )


@pytest.fixture
def sample_tensors():
    """Creates sample tensors for testing"""
    torch.manual_seed(42)
    batch_size, seq_len, hidden_dim = 2, 3, 64
    rank = 8
    out_dim = hidden_dim

    return {
        "X": torch.randn(
            batch_size, seq_len, hidden_dim, device="cuda", dtype=torch.float16
        ),
        "W": torch.randn(out_dim, hidden_dim, device="cuda", dtype=torch.float16),
        "b": torch.randn(out_dim, device="cuda", dtype=torch.float16),
        "scale": 0.5,
        "shapes": {
            "batch": batch_size,
            "seq": seq_len,
            "hidden": hidden_dim,
            "out": out_dim,
            "rank": rank,
        },
    }


@pytest.fixture
def mock_proj():
    """Creates a mock projection module for testing."""

    class MockProj(nn.Module):
        """Mock projection class."""

        def __init__(self, in_features=64, out_features=128, rank=8):
            super().__init__()
            self.base_layer = nn.Linear(in_features, out_features)
            self.base_layer.to("cuda")
            self.lora_A = nn.ModuleDict(
                {"default": nn.Linear(in_features, rank, bias=False).to("cuda")}
            )
            self.lora_B = nn.ModuleDict(
                {"default": nn.Linear(rank, out_features, bias=False).to("cuda")}
            )
            self.scaling = {"default": 0.5}
            self.active_adapter = "default"
            self.disable_adapters = False
            self.merged = False

    return MockProj()


def test_get_lora_parameters(mock_proj):
    """Tests get_lora_parameters function"""
    # Test with LoRA enabled
    W, b, _, A, B, s = get_lora_parameters(mock_proj)

    assert isinstance(W, torch.Tensor)
    assert W.shape == (128, 64)
    assert b.shape == (128,)
    assert A.shape == (8, 64)
    assert B.shape == (128, 8)
    assert s == 0.5

    # Test with LoRA disabled
    mock_proj.disable_adapters = True
    W, b, _, A, B, s = get_lora_parameters(mock_proj)
    assert A is None and B is None and s is None

    # Test with merged state
    mock_proj.disable_adapters = False
    mock_proj.merged = True
    W, b, _, A, B, s = get_lora_parameters(mock_proj)
    assert A is None and B is None and s is None


def test_matmul_lora(sample_tensors):
    """Tests matmul_lora function"""
    X = sample_tensors["X"]
    W = sample_tensors["W"]
    b = sample_tensors["b"]
    scale = sample_tensors["scale"]

    shapes = sample_tensors["shapes"]
    hidden_dim = shapes["hidden"]
    out_dim = shapes["out"]
    rank = shapes["rank"]

    A = torch.randn(rank, hidden_dim, device="cuda", dtype=torch.float16)
    B = torch.randn(out_dim, rank, device="cuda", dtype=torch.float16)

    # Test base matmul
    out1 = matmul_lora(X, W, b, None, None, None, None)
    matmul = torch.matmul(X, W.t())
    expected1 = matmul + b
    assert torch.allclose(out1, expected1, rtol=1e-3)

    # Test with LoRA
    out2 = matmul_lora(X, W, b, None, A, B, scale)
    lora_term = scale * torch.matmul(torch.matmul(X, A.t()), B.t())
    expected2 = matmul + lora_term + b
    assert torch.allclose(out2, expected2, rtol=1e-3)

    # Test 3D input reshaping
    X_3d = X.clone()
    out3 = matmul_lora(X_3d, W, b, None, A, B, scale)
    assert out3.shape == (X.shape[0], X.shape[1], W.shape[0])


@pytest.mark.parametrize(
    "activation_forward,activation_backward",
    [(swiglu_forward, swiglu_backward), (geglu_forward, geglu_backward)],
)
def test_lora_mlp_direct(sample_tensors, activation_forward, activation_backward):
    """Tests LoRA_MLP directly with different activation functions"""
    X = sample_tensors["X"]
    shapes = sample_tensors["shapes"]
    hidden_dim = shapes["hidden"]
    out_dim = shapes["out"]

    # Create linear layers
    gate_proj = nn.Linear(hidden_dim, out_dim).to(device="cuda", dtype=torch.float16)
    up_proj = nn.Linear(hidden_dim, out_dim).to(device="cuda", dtype=torch.float16)
    down_proj = nn.Linear(out_dim, hidden_dim).to(device="cuda", dtype=torch.float16)

    # Test SwiGLU path
    X.requires_grad = True
    output = LoRA_MLP.apply(
        X,
        gate_proj.weight,
        gate_proj.bias,
        None,  # gate_quant
        None,  # gate_A
        None,  # gate_B
        None,  # gate_scale
        up_proj.weight,
        up_proj.bias,
        None,  # up_quant
        None,  # up_A
        None,  # up_B
        None,  # up_scale
        down_proj.weight,
        down_proj.bias,
        None,  # down_quant
        None,  # down_A
        None,  # down_B
        None,  # down_scale
        activation_forward,
        activation_backward,
        True,  # inplace
    )

    assert output.shape == X.shape
    assert not torch.isnan(output).any()

    # Test backward pass
    loss = output.sum()
    loss.backward()
    assert X.grad is not None
    assert not torch.isnan(X.grad).any()


@pytest.mark.parametrize(
    "activation_forward,activation_backward",
    [(swiglu_forward, swiglu_backward), (geglu_forward, geglu_backward)],
)
def test_lora_mlp_with_adapters(
    sample_tensors, activation_forward, activation_backward
):
    """Tests LoRA_MLP with LoRA adapters"""
    X = sample_tensors["X"]
    shapes = sample_tensors["shapes"]
    hidden_dim = shapes["hidden"]
    out_dim = shapes["out"]
    rank = shapes["rank"]

    # Create LoRA components
    gate_A = torch.randn(rank, hidden_dim, device="cuda", dtype=torch.float16)
    gate_B = torch.randn(out_dim, rank, device="cuda", dtype=torch.float16)
    up_A = torch.randn(rank, hidden_dim, device="cuda", dtype=torch.float16)
    up_B = torch.randn(out_dim, rank, device="cuda", dtype=torch.float16)
    down_A = torch.randn(rank, out_dim, device="cuda", dtype=torch.float16)
    down_B = torch.randn(hidden_dim, rank, device="cuda", dtype=torch.float16)
    scale = 0.5

    gate_proj = nn.Linear(hidden_dim, out_dim).to(device="cuda", dtype=torch.float16)
    up_proj = nn.Linear(hidden_dim, out_dim).to(device="cuda", dtype=torch.float16)
    down_proj = nn.Linear(out_dim, hidden_dim).to(device="cuda", dtype=torch.float16)

    X.requires_grad = True
    gate_A.requires_grad = True
    gate_B.requires_grad = True
    up_A.requires_grad = True
    up_B.requires_grad = True
    down_A.requires_grad = True
    down_B.requires_grad = True

    # Forward pass with adapters
    output = LoRA_MLP.apply(
        X,
        gate_proj.weight,
        gate_proj.bias,
        None,
        gate_A,
        gate_B,
        scale,
        up_proj.weight,
        up_proj.bias,
        None,
        up_A,
        up_B,
        scale,
        down_proj.weight,
        down_proj.bias,
        None,
        down_A,
        down_B,
        scale,
        activation_forward,
        activation_backward,
        True,
    )

    assert output.shape == X.shape
    assert not torch.isnan(output).any()

    # Test backward pass
    loss = output.sum()
    loss.backward()

    # Check all gradients
    assert X.grad is not None
    assert gate_A.grad is not None
    assert gate_B.grad is not None
    assert up_A.grad is not None
    assert up_B.grad is not None
    assert down_A.grad is not None
    assert down_B.grad is not None

    assert not torch.isnan(X.grad).any()
    assert not torch.isnan(gate_A.grad).any()
    assert not torch.isnan(gate_B.grad).any()
    assert not torch.isnan(up_A.grad).any()
    assert not torch.isnan(up_B.grad).any()
    assert not torch.isnan(down_A.grad).any()
    assert not torch.isnan(down_B.grad).any()


def test_lora_qkv(sample_tensors):
    """Tests LoRA QKV implementation with and without adapters"""
    X = sample_tensors["X"]
    shapes = sample_tensors["shapes"]
    hidden_dim = shapes["hidden"]
    rank = shapes["rank"]

    # Create base weights
    q_weight = torch.randn(hidden_dim, hidden_dim, device="cuda", dtype=torch.float16)
    k_weight = torch.randn(hidden_dim, hidden_dim, device="cuda", dtype=torch.float16)
    v_weight = torch.randn(hidden_dim, hidden_dim, device="cuda", dtype=torch.float16)

    # Create LoRA matrices
    q_A = torch.randn(
        rank, hidden_dim, device="cuda", dtype=torch.float16, requires_grad=True
    )
    q_B = torch.randn(
        hidden_dim, rank, device="cuda", dtype=torch.float16, requires_grad=True
    )
    k_A = torch.randn(
        rank, hidden_dim, device="cuda", dtype=torch.float16, requires_grad=True
    )
    k_B = torch.randn(
        hidden_dim, rank, device="cuda", dtype=torch.float16, requires_grad=True
    )
    v_A = torch.randn(
        rank, hidden_dim, device="cuda", dtype=torch.float16, requires_grad=True
    )
    v_B = torch.randn(
        hidden_dim, rank, device="cuda", dtype=torch.float16, requires_grad=True
    )
    scale = 0.5

    X.requires_grad = True

    # Test without LoRA adapters

    Q1, K1, V1 = LoRA_QKV.apply(
        X,
        q_weight,
        None,
        None,
        None,
        None,
        None,
        k_weight,
        None,
        None,
        None,
        None,
        None,
        v_weight,
        None,
        None,
        None,
        None,
        None,
        True,
    )

    assert Q1.shape == K1.shape == V1.shape == X.shape
    loss1 = (Q1 + K1 + V1).sum()
    loss1.backward()
    assert X.grad is not None

    # Clear gradients
    X.grad = None

    # Test with LoRA adapters
    Q2, K2, V2 = LoRA_QKV.apply(
        X,
        q_weight,
        None,
        None,
        q_A,
        q_B,
        scale,
        k_weight,
        None,
        None,
        k_A,
        k_B,
        scale,
        v_weight,
        None,
        None,
        v_A,
        v_B,
        scale,
        True,
    )

    assert Q2.shape == K2.shape == V2.shape == X.shape
    loss2 = (Q2 + K2 + V2).sum()
    loss2.backward()

    # Check gradients
    assert X.grad is not None
    assert q_A.grad is not None
    assert q_B.grad is not None
    assert k_A.grad is not None
    assert k_B.grad is not None
    assert v_A.grad is not None
    assert v_B.grad is not None

    # Check for NaN values
    assert not torch.isnan(X.grad).any()
    assert not torch.isnan(q_A.grad).any()
    assert not torch.isnan(q_B.grad).any()
    assert not torch.isnan(k_A.grad).any()
    assert not torch.isnan(k_B.grad).any()
    assert not torch.isnan(v_A.grad).any()
    assert not torch.isnan(v_B.grad).any()


def test_lora_o(sample_tensors):
    """Tests LoRA output projection"""
    X = sample_tensors["X"]
    W = sample_tensors["W"]
    b = sample_tensors["b"]
    scale = sample_tensors["scale"]

    shapes = sample_tensors["shapes"]
    hidden_dim = shapes["hidden"]
    out_dim = shapes["out"]
    rank = shapes["rank"]

    A = torch.randn(rank, hidden_dim, device="cuda", dtype=torch.float16)
    B = torch.randn(out_dim, rank, device="cuda", dtype=torch.float16)

    # Test forward pass
    X.requires_grad = True
    output = LoRA_O.apply(X, W, b, None, A, B, scale)

    assert output.shape == (X.shape[0], X.shape[1], W.shape[0])

    # Test backward pass
    loss = output.sum()
    loss.backward()
    assert X.grad is not None


def test_with_quantization(sample_tensors, mock_quantstate):
    """Tests LoRA with quantized weights"""
    X = sample_tensors["X"]  # [batch, seq, hidden]
    W = sample_tensors["W"]  # [out, hidden]
    b = sample_tensors["b"]  # [out]
    scale = 0.5

    shapes = sample_tensors["shapes"]
    hidden_dim = shapes["hidden"]
    out_dim = shapes["out"]
    rank = shapes["rank"]

    A = torch.randn(rank, hidden_dim, device="cuda", dtype=torch.float16)
    B = torch.randn(out_dim, rank, device="cuda", dtype=torch.float16)

    # Test matmul with quantization
    out = matmul_lora(X, W, b, mock_quantstate, A, B, scale)
    assert out.shape == (X.shape[0], X.shape[1], W.shape[0])
    assert not torch.isnan(out).any()

    # Test with different batch sizes
    X2 = torch.randn(4, 6, hidden_dim, device="cuda", dtype=torch.float16)
    out2 = matmul_lora(X2, W, b, mock_quantstate, A, B, scale)
    assert out2.shape == (4, 6, W.shape[0])
    assert not torch.isnan(out2).any()


@pytest.mark.parametrize(
    "batch,seq,hidden,rank,out",
    [
        (1, 1, 32, 4, 64),
        (2, 3, 64, 8, 128),
        (4, 5, 128, 16, 256),
    ],
)
def test_shapes_and_dimensions(batch, seq, hidden, rank, out):
    """Tests various input shapes and dimensions"""
    X = torch.randn(batch, seq, hidden, device="cuda", dtype=torch.float16)
    W = torch.randn(out, hidden, device="cuda", dtype=torch.float16)
    b = torch.randn(out, device="cuda", dtype=torch.float16)
    A = torch.randn(rank, hidden, device="cuda", dtype=torch.float16)
    B = torch.randn(out, rank, device="cuda", dtype=torch.float16)
    scale = 0.5

    result = matmul_lora(X, W, b, None, A, B, scale)
    assert result.shape == (batch, seq, out)


def test_gradient_flow(sample_tensors):
    """Tests gradient flow through LoRA layers"""
    X = sample_tensors["X"].clone()
    W = sample_tensors["W"].clone()
    b = sample_tensors["b"].clone()
    scale = sample_tensors["scale"]

    shapes = sample_tensors["shapes"]
    hidden_dim = shapes["hidden"]
    out_dim = shapes["out"]
    rank = shapes["rank"]

    A = torch.randn(rank, hidden_dim, device="cuda", dtype=torch.float16)
    B = torch.randn(out_dim, rank, device="cuda", dtype=torch.float16)

    X.requires_grad = True
    A.requires_grad = True
    B.requires_grad = True

    # Forward pass
    out = matmul_lora(X, W, b, None, A, B, scale)
    loss = out.sum()

    # Backward pass
    loss.backward()

    assert X.grad is not None
    assert A.grad is not None
    assert B.grad is not None
    assert not torch.isnan(X.grad).any()
    assert not torch.isnan(A.grad).any()
    assert not torch.isnan(B.grad).any()


@pytest.mark.parametrize(
    "apply_function",
    [apply_lora_mlp_swiglu, apply_lora_mlp_geglu],
)
def test_inplace_operations(sample_tensors, apply_function):
    """Tests inplace operation behavior"""
    X = sample_tensors["X"]
    shapes = sample_tensors["shapes"]

    # Create MLP with both inplace=True and inplace=False
    mlp = type(
        "MLPModule",
        (),
        {
            "gate_proj": nn.Linear(shapes["hidden"], shapes["out"]).to(
                device="cuda", dtype=torch.float16
            ),
            "up_proj": nn.Linear(shapes["hidden"], shapes["out"]).to(
                device="cuda", dtype=torch.float16
            ),
            "down_proj": nn.Linear(shapes["out"], shapes["hidden"]).to(
                device="cuda", dtype=torch.float16
            ),
        },
    )

    out1 = apply_function(mlp, X.clone(), inplace=True)
    out2 = apply_function(mlp, X.clone(), inplace=False)

    assert torch.allclose(out1, out2, rtol=1e-3)


================================================
FILE: tests/e2e/kernels/test_quantize.py
================================================
"""Tests for quantization utility functions."""

import torch
from bitsandbytes.functional import QuantState

from axolotl.kernels.quantize import dequantize


def test_dequantize_null_state():
    """Test that dequantize returns input unchanged when quant_state is None"""
    W = torch.randn(32, 32)
    assert torch.equal(dequantize(W, None), W)


def test_dequantize_shape_preservation():
    """Test that dequantization preserves expected shapes"""
    shape = (32, 32)
    W = torch.randn(shape, device="cuda")

    quant_state = QuantState(
        absmax=torch.ones(shape[0], device="cuda"),
        shape=shape,
        code=torch.randint(0, 15, shape, device="cuda"),
        dtype=torch.float16,
        blocksize=32,
        quant_type="nf4",
        offset=torch.zeros(shape[0], dtype=torch.int32, device="cuda"),
        state2=QuantState(
            absmax=torch.ones(shape[0], device="cuda"),
            shape=shape,
            code=torch.randint(0, 15, shape, device="cuda"),
            dtype=torch.float16,
            blocksize=32,
            quant_type="nf4",
            offset=None,
            state2=None,
        ),
    )

    result = dequantize(W, quant_state)
    assert result.shape == shape
    assert result.dtype == torch.float16
    assert result.device == W.device


def test_dequantize_transposed():
    """Test that transposed input produces transposed output"""
    shape = (32, 32)
    W = torch.randn(1, shape[1], device="cuda")  # Transposed input

    quant_state = QuantState(
        absmax=torch.ones(1),
        shape=shape,
        code=torch.randint(0, 15, shape),
        dtype=torch.float16,
        blocksize=32,
        quant_type="nf4",
        offset=torch.zeros(1, dtype=torch.int32),
        state2=QuantState(
            absmax=torch.ones(1),
            shape=shape,
            code=torch.randint(0, 15, shape),
            dtype=torch.float16,
            blocksize=32,
            quant_type="nf4",
            offset=None,
            state2=None,
        ),
    )

    result = dequantize(W, quant_state)
    assert result.shape[0] == shape[0]


def test_dequantize_output_tensor():
    """Test dequantization with provided output tensor"""
    shape = (32, 32)
    W = torch.randn(shape, device="cuda")
    out = torch.empty(shape, dtype=torch.float16, device="cuda")

    quant_state = QuantState(
        absmax=torch.ones(shape[0]),
        shape=shape,
        code=torch.randint(0, 15, shape),
        dtype=torch.float16,
        blocksize=32,
        quant_type="nf4",
        offset=torch.zeros(shape[0], dtype=torch.int32),
        state2=QuantState(
            absmax=torch.ones(shape[0]),
            shape=shape,
            code=torch.randint(0, 15, shape),
            dtype=torch.float16,
            blocksize=32,
            quant_type="nf4",
            offset=None,
            state2=None,
        ),
    )

    result = dequantize(W, quant_state, out=out)
    assert result is out


================================================
FILE: tests/e2e/kernels/test_swiglu.py
================================================
"""Tests for SwiGLU activation function Triton kernels."""

import torch
import torch.nn.functional as F

from axolotl.kernels.swiglu import swiglu_backward, swiglu_forward


def test_swiglu_forward_shape():
    """Test that SwiGLU forward pass preserves expected shapes"""
    batch, seq_len, hidden_dim = 2, 3, 64
    gate = torch.randn(batch, seq_len, hidden_dim, device="cuda")
    up = torch.randn(batch, seq_len, hidden_dim, device="cuda")

    out = swiglu_forward(gate, up)
    assert out.shape == (batch, seq_len, hidden_dim)
    assert out.dtype == gate.dtype
    assert out.device == gate.device


def test_swiglu_forward_values():
    """Test SwiGLU forward pass matches PyTorch reference implementation"""
    gate = torch.randn(2, 3, 64, device="cuda")
    up = torch.randn(2, 3, 64, device="cuda")

    # Custom implementation
    triton_out = swiglu_forward(gate.clone(), up.clone())

    # PyTorch reference
    torch_out = F.silu(gate) * up

    assert torch.allclose(triton_out, torch_out, rtol=1e-3)


def test_swiglu_backward():
    """Test SwiGLU backward pass matches PyTorch autograd"""
    gate = torch.randn(2, 3, 64, device="cuda", requires_grad=True)
    up = torch.randn(2, 3, 64, device="cuda", requires_grad=True)
    grad_output = torch.randn(2, 3, 64, device="cuda")

    # PyTorch reference - compute intermediates
    silu_gate = F.silu(gate)
    torch_out = silu_gate * up
    torch_out.backward(grad_output)

    # Custom backward pass
    gate_clone = gate.clone().detach()
    up_clone = up.clone().detach()
    grad_output_clone = grad_output.clone()

    h, our_grad_gate, our_grad_up = swiglu_backward(
        grad_output_clone, gate_clone, up_clone
    )

    # Compare outputs and gradients
    assert torch.allclose(h, torch_out, rtol=1e-3)
    assert torch.allclose(our_grad_gate, gate.grad, rtol=1e-3)
    assert torch.allclose(our_grad_up, up.grad, rtol=1e-3)


def test_swiglu_inplace_preservation():
    """Test that SwiGLU backward doesn't modify original tensors unexpectedly"""
    gate = torch.randn(2, 3, 64, device="cuda")
    up = torch.randn(2, 3, 64, device="cuda")
    grad_output = torch.randn(2, 3, 64, device="cuda")

    gate_copy = gate.clone()
    up_copy = up.clone()
    grad_copy = grad_output.clone()

    swiglu_backward(grad_output, gate, up)

    assert not torch.equal(gate, gate_copy), "Gate should be modified in-place"
    assert not torch.equal(up, up_copy), "Up should be modified in-place"
    assert not torch.equal(grad_output, grad_copy), (
        "Grad output should be modified in-place"
    )


================================================
FILE: tests/e2e/multigpu/__init__.py
================================================


================================================
FILE: tests/e2e/multigpu/patched/__init__.py
================================================


================================================
FILE: tests/e2e/multigpu/patched/test_sp.py
================================================
"""E2E tests for sequence parallelism"""

from pathlib import Path

import pytest
import yaml
from accelerate.test_utils import execute_subprocess_async
from transformers.testing_utils import get_torch_dist_unique_port

from axolotl.utils.dict import DictDefault

from ...utils import check_tensorboard


class TestSequenceParallelism:
    """Test case for training with sequence parallelism enabled"""

    def _run_sequence_parallel_test(
        self,
        temp_dir,
        sample_packing=True,
        micro_batch_size=1,
        pad_to_sequence_len=True,
        ring_attn_func=None,
        threshold=2.0,
    ):
        """Helper method to run sequence parallel tests with different configurations"""
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "load_in_8bit": False,
                "load_in_4bit": True,
                "strict": False,
                "sequence_len": 2048,
                "adapter": "qlora",
                "sample_packing": sample_packing,
                "eval_sample_packing": sample_packing,
                "pad_to_sequence_len": pad_to_sequence_len,
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "lora_modules_to_save": ["embed_tokens", "lm_head"],
                "special_tokens": {"pad_token": "<|endoftext|>"},
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 8,
                "micro_batch_size": micro_batch_size,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "loss_watchdog_threshold": 5.0,
                "loss_watchdog_patience": 3,
                "bf16": "auto",
                "warmup_steps": 1,
                "saves_per_epoch": 1,
                "logging_steps": 1,
                "weight_decay": 0.0,
                "use_tensorboard": True,
                "context_parallel_size": 2,
                "ring_attn_func": ring_attn_func,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "accelerate",
                "launch",
                "--num-processes",
                "2",
                "--main_process_port",
                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.train",
                str(Path(temp_dir) / "config.yaml"),
            ]
        )

        check_tensorboard(
            temp_dir + "/runs",
            "train/train_loss",
            threshold,
            "Train Loss (%s) is too high",
        )

    @pytest.mark.parametrize(
        "sample_packing, micro_batch_size, pad_to_sequence_len, ring_attn_func, threshold",
        [
            (True, 1, True, None, 2.5),  # defaults to varlen_llama3 ring_attn_func
            (False, 2, True, None, 2.5),  # defaults to batch_ring ring_attn_func
            # (False, 2, True, "batch_zigzag", 2.5),
            # (False, 2, False, None, 2.65),  # defaults to batch_ring ring_attn_func
        ],
        ids=[
            "sample_packing, varlen_llama3 ring_attn_func",
            "no sample_packing, pad_to_sequence_len, batch_ring ring_attn_func",
            # "no sample_packing, no pad_to_sequence_len, batch_zigzag ring_attn_func",
            # "no sample_packing, no pad_to_sequence_len, batch_ring ring_attn_func",
        ],
    )
    def test_sequence_parallel_training(
        self,
        temp_dir,
        sample_packing,
        micro_batch_size,
        pad_to_sequence_len,
        ring_attn_func,
        threshold,
    ):
        """Test sequence parallel training with different configurations"""
        self._run_sequence_parallel_test(
            temp_dir,
            sample_packing=sample_packing,
            micro_batch_size=micro_batch_size,
            pad_to_sequence_len=pad_to_sequence_len,
            ring_attn_func=ring_attn_func,
            threshold=threshold,
        )


================================================
FILE: tests/e2e/multigpu/solo/__init__.py
================================================
# Tests under this directory should get run "solo" on their own as they
# seem to cause issues when run in the same batch as other tests.


================================================
FILE: tests/e2e/multigpu/solo/test_flex.py
================================================
"""
E2E tests for multigpu lora tinyllama
"""

from pathlib import Path

import pytest
import yaml
from accelerate.test_utils import execute_subprocess_async
from huggingface_hub import snapshot_download
from transformers.testing_utils import get_torch_dist_unique_port
from transformers.utils import is_torch_bf16_gpu_available

from axolotl.utils.dict import DictDefault

from tests.e2e.utils import check_tensorboard, require_torch_2_6_0

AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent


@pytest.fixture(scope="session", autouse=True)
def download_model():
    # download the model
    snapshot_download("HuggingFaceTB/SmolLM2-135M")


class TestPackedFlex:
    """
    Test case for Packed training of llama models
    """

    @require_torch_2_6_0
    def test_loss_llama(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "sample_packing": True,
                "flex_attention": True,
                "val_set_size": 0.0,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 2,
                "use_tensorboard": True,
                "save_strategy": "no",
                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
            cfg.bf16 = True
        else:
            cfg.fp16 = True

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
        )


================================================
FILE: tests/e2e/multigpu/solo/test_gdpo.py
================================================
"""
GDPO test suite

GDPO uses TRL's multi_objective_aggregation="normalize_then_sum" for
per-reward normalization in multi-reward RL training.
"""

import os
import random
from pathlib import Path

import pytest
import yaml
from accelerate.test_utils import execute_subprocess_async
from transformers.testing_utils import get_torch_dist_unique_port

from axolotl.utils.dict import DictDefault

from tests.e2e.multigpu.solo.test_grpo import recursive_kill, start_vllm
from tests.e2e.utils import require_vllm


@pytest.mark.skip(reason="flaky vllm tests in modal")
class TestGDPO:
    """Test case for GDPO training using TRL's native multi-objective aggregation."""

    def _utils_write_yaml_and_rewards(self, cfg, temp_dir, suffix=""):
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
        with open(f"rewards_gdpo_{suffix}.py", "w", encoding="utf-8") as fout:
            fout.write(
                """import random

def format_reward(prompts, completions, **kwargs) -> list[float]:
    return [1.0 if len(c) > 10 else 0.0 for c in completions]

def correctness_reward(prompts, completions, **kwargs) -> list[float]:
    return [random.uniform(-1, 3) for _ in completions]

def safety_reward(prompts, completions, **kwargs) -> list[float]:
    return [1.0 if 'error' not in c.lower() else 0.0 for c in completions]

def single_reward(prompts, completions, **kwargs) -> list[float]:
    return [random.uniform(0, 1) for _ in completions]

def oai_gsm8k_transform(cfg, *args, **kwargs):
    def transform_fn(example, tokenizer=None):
        label = example["answer"].split("####")[-1].strip().replace(",", "")
        return {
            "prompt": [{"role": "user", "content": example["question"]}],
            "answer": label,
        }
    return transform_fn, {"remove_columns": ["question"]}
"""
            )

    @pytest.mark.parametrize("num_gpus", [1, 2])
    @require_vllm
    def test_gdpo_multi_reward_lora(self, temp_dir, num_gpus):
        """Test GDPO with multiple reward functions using LoRA."""
        rnd_suffix = str(random.randint(1000, 9999))
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "chat_template": "llama3",
                "rl": "gdpo",
                "trl": {
                    "beta": 0.001,
                    "max_completion_length": 256,
                    "use_vllm": True,
                    "num_generations": 4,
                    "reward_funcs": [
                        f"rewards_gdpo_{rnd_suffix}.format_reward",
                        f"rewards_gdpo_{rnd_suffix}.correctness_reward",
                    ],
                    "reward_weights": [1.0, 2.0],
                    "scale_rewards": True,
                },
                "vllm": {
                    "max_model_len": 800,
                    "enable_prefix_caching": True,
                },
                "datasets": [
                    {
                        "path": "openai/gsm8k",
                        "name": "main",
                        "type": f"rewards_gdpo_{rnd_suffix}.oai_gsm8k_transform",
                    },
                ],
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "flash_attention": True,
                "sequence_len": 1024,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "max_steps": 3,
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
                "warmup_steps": 10,
                "val_set_size": 0.0,
                "output_dir": temp_dir,
                "learning_rate": 0.0001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "save_safetensors": True,
                "bf16": "auto",
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )

        self._utils_write_yaml_and_rewards(cfg, temp_dir, suffix=rnd_suffix)

        current_env = os.environ.copy()
        env = {
            "NCCL_P2P_LEVEL": "LOC",
            **current_env,
            "CUDA_VISIBLE_DEVICES": "1",
        }
        vllm_process = start_vllm(
            cfg.base_model,
            env=env,
            quiet=True,
            wait=300,
            gpu_memory_utilization=0.15,
            max_model_len=cfg.vllm.max_model_len,
            enable_prefix_caching=cfg.vllm.enable_prefix_caching,
            host="0.0.0.0",
            port=8000,
        )

        try:
            execute_subprocess_async(
                [
                    "axolotl",
                    "train",
                    str(Path(temp_dir) / "config.yaml"),
                    "--num-processes",
                    str(num_gpus),
                    "--main-process-port",
                    f"{get_torch_dist_unique_port()}",
                ],
                env={
                    "NCCL_P2P_LEVEL": "LOC",
                    "NCCL_DEBUG": "INFO",
                    **current_env,
                },
            )
        finally:
            recursive_kill(vllm_process)

    @require_vllm
    def test_gdpo_three_rewards(self, temp_dir):
        """Test GDPO with three reward functions (format, correctness, safety)."""
        rnd_suffix = str(random.randint(1000, 9999))
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "chat_template": "llama3",
                "rl": "gdpo",
                "trl": {
                    "beta": 0.001,
                    "max_completion_length": 256,
                    "use_vllm": True,
                    "num_generations": 4,
                    "reward_funcs": [
                        f"rewards_gdpo_{rnd_suffix}.format_reward",
                        f"rewards_gdpo_{rnd_suffix}.correctness_reward",
                        f"rewards_gdpo_{rnd_suffix}.safety_reward",
                    ],
                    "reward_weights": [1.0, 2.0, 1.5],
                },
                "vllm": {
                    "max_model_len": 800,
                    "enable_prefix_caching": True,
                },
                "datasets": [
                    {
                        "path": "openai/gsm8k",
                        "name": "main",
                        "type": f"rewards_gdpo_{rnd_suffix}.oai_gsm8k_transform",
                    },
                ],
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "flash_attention": True,
                "sequence_len": 1024,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "max_steps": 3,
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
                "warmup_steps": 10,
                "val_set_size": 0.0,
                "output_dir": temp_dir,
                "learning_rate": 0.0001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "save_safetensors": True,
                "bf16": "auto",
            }
        )

        self._utils_write_yaml_and_rewards(cfg, temp_dir, suffix=rnd_suffix)

        current_env = os.environ.copy()
        env = {
            "NCCL_P2P_LEVEL": "LOC",
            **current_env,
            "CUDA_VISIBLE_DEVICES": "1",
        }
        vllm_process = start_vllm(
            cfg.base_model,
            env=env,
            quiet=True,
            wait=300,
            gpu_memory_utilization=0.15,
            max_model_len=cfg.vllm.max_model_len,
            enable_prefix_caching=cfg.vllm.enable_prefix_caching,
            host="0.0.0.0",
            port=8000,
        )

        try:
            execute_subprocess_async(
                [
                    "axolotl",
                    "train",
                    str(Path(temp_dir) / "config.yaml"),
                    "--num-processes",
                    "1",
                    "--main-process-port",
                    f"{get_torch_dist_unique_port()}",
                ],
                env={
                    "NCCL_P2P_LEVEL": "LOC",
                    "NCCL_DEBUG": "INFO",
                    **current_env,
                },
            )
        finally:
            recursive_kill(vllm_process)

    @require_vllm
    def test_gdpo_single_reward_fallback(self, temp_dir):
        """Test GDPO with single reward."""
        rnd_suffix = str(random.randint(1000, 9999))
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "chat_template": "llama3",
                "rl": "gdpo",
                "trl": {
                    "beta": 0.001,
                    "max_completion_length": 256,
                    "use_vllm": True,
                    "num_generations": 4,
                    "reward_funcs": [
                        f"rewards_gdpo_{rnd_suffix}.single_reward",
                    ],
                    "reward_weights": [1.0],
                },
                "vllm": {
                    "max_model_len": 800,
                    "enable_prefix_caching": True,
                },
                "datasets": [
                    {
                        "path": "openai/gsm8k",
                        "name": "main",
                        "type": f"rewards_gdpo_{rnd_suffix}.oai_gsm8k_transform",
                    },
                ],
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "flash_attention": True,
                "sequence_len": 1024,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "max_steps": 3,
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
                "warmup_steps": 10,
                "val_set_size": 0.0,
                "output_dir": temp_dir,
                "learning_rate": 0.0001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "save_safetensors": True,
                "bf16": "auto",
            }
        )

        self._utils_write_yaml_and_rewards(cfg, temp_dir, suffix=rnd_suffix)

        current_env = os.environ.copy()
        env = {
            "NCCL_P2P_LEVEL": "LOC",
            **current_env,
            "CUDA_VISIBLE_DEVICES": "1",
        }
        vllm_process = start_vllm(
            cfg.base_model,
            env=env,
            quiet=True,
            wait=300,
            gpu_memory_utilization=0.15,
            max_model_len=cfg.vllm.max_model_len,
            enable_prefix_caching=cfg.vllm.enable_prefix_caching,
            host="0.0.0.0",
            port=8000,
        )

        try:
            execute_subprocess_async(
                [
                    "axolotl",
                    "train",
                    str(Path(temp_dir) / "config.yaml"),
                    "--num-processes",
                    "1",
                    "--main-process-port",
                    f"{get_torch_dist_unique_port()}",
                ],
                env={
                    "NCCL_P2P_LEVEL": "LOC",
                    "NCCL_DEBUG": "INFO",
                    **current_env,
                },
            )
        finally:
            recursive_kill(vllm_process)

    @require_vllm
    def test_gdpo_fft(self, temp_dir):
        """Test GDPO with full fine-tuning (no adapter)."""
        rnd_suffix = str(random.randint(1000, 9999))
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "chat_template": "llama3",
                "rl": "gdpo",
                "trl": {
                    "beta": 0.001,
                    "max_completion_length": 256,
                    "use_vllm": True,
                    "num_generations": 4,
                    "reward_funcs": [
                        f"rewards_gdpo_{rnd_suffix}.format_reward",
                        f"rewards_gdpo_{rnd_suffix}.correctness_reward",
                    ],
                    "reward_weights": [1.0, 2.0],
                },
                "vllm": {
                    "max_model_len": 800,
                    "enable_prefix_caching": True,
                },
                "datasets": [
                    {
                        "path": "openai/gsm8k",
                        "name": "main",
                        "type": f"rewards_gdpo_{rnd_suffix}.oai_gsm8k_transform",
                    },
                ],
                # No adapter - full fine-tuning
                "flash_attention": True,
                "sequence_len": 1024,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "max_steps": 3,
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
                "warmup_steps": 10,
                "val_set_size": 0.0,
                "output_dir": temp_dir,
                "learning_rate": 0.0001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "save_safetensors": True,
                "bf16": "auto",
            }
        )

        self._utils_write_yaml_and_rewards(cfg, temp_dir, suffix=rnd_suffix)

        current_env = os.environ.copy()
        env = {
            "NCCL_P2P_LEVEL": "LOC",
            **current_env,
            "CUDA_VISIBLE_DEVICES": "1",
        }
        vllm_process = start_vllm(
            cfg.base_model,
            env=env,
            quiet=True,
            wait=300,
            gpu_memory_utilization=0.15,
            max_model_len=cfg.vllm.max_model_len,
            enable_prefix_caching=cfg.vllm.enable_prefix_caching,
            host="0.0.0.0",
            port=8000,
        )

        try:
            execute_subprocess_async(
                [
                    "axolotl",
                    "train",
                    str(Path(temp_dir) / "config.yaml"),
                    "--num-processes",
                    "1",
                    "--main-process-port",
                    f"{get_torch_dist_unique_port()}",
                ],
                env={
                    "NCCL_P2P_LEVEL": "LOC",
                    "NCCL_DEBUG": "INFO",
                    **current_env,
                },
            )
        finally:
            recursive_kill(vllm_process)

    @require_vllm
    def test_gdpo_sequence_parallel(self, temp_dir):
        """Test GDPO with sequence parallelism."""
        rnd_suffix = str(random.randint(1000, 9999))
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "chat_template": "llama3",
                "rl": "gdpo",
                "context_parallel_size": 2,
                "trl": {
                    "beta": 0.001,
                    "max_completion_length": 256,
                    "use_vllm": True,
                    "num_generations": 4,
                    "reward_funcs": [
                        f"rewards_gdpo_{rnd_suffix}.format_reward",
                        f"rewards_gdpo_{rnd_suffix}.correctness_reward",
                    ],
                    "reward_weights": [1.0, 2.0],
                },
                "vllm": {
                    "max_model_len": 800,
                    "enable_prefix_caching": True,
                },
                "datasets": [
                    {
                        "path": "openai/gsm8k",
                        "name": "main",
                        "type": f"rewards_gdpo_{rnd_suffix}.oai_gsm8k_transform",
                    },
                ],
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "flash_attention": True,
                "sequence_len": 1024,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "max_steps": 3,
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
                "warmup_steps": 10,
                "val_set_size": 0.0,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.0001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "save_safetensors": True,
                "bf16": "auto",
            }
        )

        self._utils_write_yaml_and_rewards(cfg, temp_dir, suffix=rnd_suffix)

        current_env = os.environ.copy()
        env = {
            "NCCL_P2P_LEVEL": "LOC",
            **current_env,
            "CUDA_VISIBLE_DEVICES": "1",
        }
        vllm_process = start_vllm(
            cfg.base_model,
            env=env,
            quiet=True,
            wait=300,
            gpu_memory_utilization=0.15,
            max_model_len=cfg.vllm.max_model_len,
            enable_prefix_caching=cfg.vllm.enable_prefix_caching,
            host="0.0.0.0",
            port=8000,
        )

        try:
            execute_subprocess_async(
                [
                    "axolotl",
                    "train",
                    str(Path(temp_dir) / "config.yaml"),
                    "--num-processes",
                    "2",
                    "--main-process-port",
                    f"{get_torch_dist_unique_port()}",
                ],
                env={
                    "NCCL_P2P_LEVEL": "LOC",
                    "NCCL_DEBUG": "INFO",
                    **current_env,
                },
            )
        finally:
            recursive_kill(vllm_process)


================================================
FILE: tests/e2e/multigpu/solo/test_grpo.py
================================================
"""
GRPO test suite
"""

import os
import random
import subprocess  # nosec B404
import sys
import tempfile
import time
from pathlib import Path

import psutil
import pytest
import requests
import yaml
from accelerate.test_utils import execute_subprocess_async
from transformers.testing_utils import get_torch_dist_unique_port

from axolotl.utils.dict import DictDefault

from tests.e2e.utils import require_vllm


def start_vllm(
    model: str, env: dict, wait: int | None = None, quiet=False, **kwargs
) -> subprocess.Popen:
    """
    helper function to start the VLLM server in the background, mostly for testing purposes
    """
    cmd = [sys.executable, "-m", "trl.scripts.vllm_serve", "--model", model]

    if tensor_parallel_size := kwargs.get("tensor_parallel_size"):
        cmd.extend(["--tensor-parallel-size", str(tensor_parallel_size)])
    if host := kwargs.get("host"):
        cmd.extend(["--host", host])
    if port := kwargs.get("port"):
        cmd.extend(["--port", str(port)])
    if gpu_memory_utilization := kwargs.get("gpu_memory_utilization"):
        cmd.extend(["--gpu-memory-utilization", str(gpu_memory_utilization)])
    if dtype := kwargs.get("dtype"):
        cmd.extend(["--dtype", dtype])
    if max_model_len := kwargs.get("max_model_len"):
        cmd.extend(["--max-model-len", str(max_model_len)])
    if kwargs.get("enable_prefix_caching"):
        cmd.extend(["--enable-prefix-caching", "True"])

    # print out the command to be executed
    print(" ".join(cmd))

    vllm_logging_json = Path(tempfile.mkdtemp()) / "vllm_logging.json"
    with open(vllm_logging_json, "w", encoding="utf-8") as temp_file:
        temp_file.write(
            """{
  "formatters": {
    "json": {
      "class": "pythonjsonlogger.jsonlogger.JsonFormatter"
    }
  },
  "handlers": {
    "file": {
      "class": "logging.FileHandler",
      "formatter": "json",
      "level": "DEBUG",
      "filename": "/tmp/vllm.log",
      "mode": "a"
    }
  },
  "loggers": {
    "vllm": {
      "handlers": ["file"],
      "level": "DEBUG",
      "propagate": false
    }
  },
  "version": 1
}"""
        )

    cmd_env = env.copy()
    cmd_env.update({"VLLM_LOGGING_CONFIG_PATH": vllm_logging_json})
    # start `trl vllm-serve` command in the background and capture the process id
    process = subprocess.Popen(
        cmd,
        env=cmd_env,
        stdout=subprocess.DEVNULL if quiet else subprocess.PIPE,
        stderr=subprocess.DEVNULL if quiet else subprocess.PIPE,
    )  # nosec B603

    # print out the process id so the user can easily kill it later
    print(f"VLLM server process started (PID: {process.pid})")

    # wait until the http server is ready, even if it 404s, but timeout after 60 seconds
    period_seconds = 5
    started = False
    if wait and host and port:
        for i in range(0, int(wait), period_seconds):
            try:
                response = requests.get(f"http://{host}:{port}", timeout=1)
                print(f"{i}: VLLM server (status: {response.status_code})")
                if int(response.status_code) in [200, 404]:
                    started = True
                    break
            except requests.exceptions.RequestException as exc:
                print(f"{i}: VLLM server failed to start: {str(exc)}")

            # also check if the process.pid is still running
            if process.poll() is not None:
                break

            time.sleep(period_seconds)

    if wait and not started:
        print(
            f"VLLM server process did not start within {wait} seconds. Please check your server logs."
        )
        recursive_kill(process)
        with open("/tmp/vllm.log", "r", encoding="utf-8") as log_file:
            print(log_file.read())
        try:
            os.remove("/tmp/vllm.log")
        except FileNotFoundError:
            pass
        raise RuntimeError(f"VLLM server process did not start within {wait} seconds.")

    # return the process
    return process


def recursive_kill(process: subprocess.Popen):
    """
    Recursively kill a process and its children
    """
    process = psutil.Process(process.pid)
    for child in psutil.Process(process.pid).children(recursive=True):
        child.terminate()
        child.kill()
        os.kill(child.pid, 9)
    process.terminate()
    process.kill()
    os.kill(process.pid, 9)


@pytest.mark.skip(reason="flaky vllm tests in modal")
class TestGRPO:
    """
    Test case for GRPO training using multiple GPUs
    """

    def _utils_write_yaml_and_rewards(self, cfg, temp_dir, suffix=""):
        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))
        with open(f"rewards_{suffix}.py", "w", encoding="utf-8") as fout:
            fout.write(
                """import random
def rand_reward_func(completions, **kwargs) -> list[float]:
    return [random.uniform(0, 1) for _ in completions]

def oai_gsm8k_transform(cfg, *args, **kwargs):
    def transform_fn(example, tokenizer=None):
        label = example["answer"].split("####")[-1].strip().replace(",", "")
        return {
            "prompt": [{"role": "user", "content": example["question"]},],
            "answer": label,
        }
    return transform_fn, {"remove_columns": ["question"]}
"""
            )

    @pytest.mark.parametrize(
        "num_gpus",
        [1, 2],
    )
    @require_vllm
    def test_llama_dora(self, temp_dir, num_gpus):
        rnd_reward_suffix = str(random.randint(1000, 9999))
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "chat_template": "llama3",
                "rl": "grpo",
                "trl": {
                    "beta": 0.001,
                    "max_completion_length": 256,
                    "use_vllm": True,
                    "num_generations": 4,
                    "reward_funcs": [f"rewards_{rnd_reward_suffix}.rand_reward_func"],
                },
                "vllm": {
                    "max_model_len": 800,
                    "enable_prefix_caching": True,
                },
                "datasets": [
                    {
                        "path": "openai/gsm8k",
                        "name": "main",
                        "type": f"rewards_{rnd_reward_suffix}.oai_gsm8k_transform",
                    },
                ],
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "peft_use_dora": True,
                "flash_attention": True,
                "sequence_len": 1024,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "max_steps": 3,
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
                "warmup_steps": 10,
                "val_set_size": 0.0,
                "output_dir": temp_dir,
                "learning_rate": 0.0001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "bf16": "auto",
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )

        self._utils_write_yaml_and_rewards(cfg, temp_dir, suffix=rnd_reward_suffix)

        current_env = os.environ.copy()
        env = {
            "NCCL_P2P_LEVEL": "LOC",
            **current_env,
            "CUDA_VISIBLE_DEVICES": "1",
        }
        vllm_process = start_vllm(
            cfg.base_model,
            env=env,
            quiet=True,
            wait=300,
            gpu_memory_utilization=0.15,
            max_model_len=cfg.vllm.max_model_len,
            enable_prefix_caching=cfg.vllm.enable_prefix_caching,
            host="0.0.0.0",
            port=8000,
        )

        try:
            execute_subprocess_async(
                [
                    "axolotl",
                    "train",
                    str(Path(temp_dir) / "config.yaml"),
                    "--num-processes",
                    str(num_gpus),
                    "--main-process-port",
                    f"{get_torch_dist_unique_port()}",
                ],
                env={
                    "NCCL_P2P_LEVEL": "LOC",
                    "NCCL_DEBUG": "INFO",
                    **current_env,
                },
            )
        finally:
            (recursive_kill(vllm_process))

    @require_vllm
    def test_llama_lora_sp(self, temp_dir):
        rnd_reward_suffix = str(random.randint(1000, 9999))
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "chat_template": "llama3",
                "rl": "grpo",
                "trl": {
                    "beta": 0.001,
                    "max_completion_length": 256,
                    "use_vllm": True,
                    "num_generations": 4,
                    "reward_funcs": [f"rewards_{rnd_reward_suffix}.rand_reward_func"],
                },
                "vllm": {
                    "max_model_len": 800,
                    "enable_prefix_caching": True,
                },
                "datasets": [
                    {
                        "path": "openai/gsm8k",
                        "name": "main",
                        "type": f"rewards_{rnd_reward_suffix}.oai_gsm8k_transform",
                    },
                ],
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "context_parallel_size": 2,
                "flash_attention": True,
                "sequence_len": 1024,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "max_steps": 3,
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
                "warmup_steps": 10,
                "val_set_size": 0.0,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.0001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "bf16": "auto",
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )

        self._utils_write_yaml_and_rewards(cfg, temp_dir, suffix=rnd_reward_suffix)

        current_env = os.environ.copy()
        env = {
            "NCCL_P2P_LEVEL": "LOC",
            **current_env,
            "CUDA_VISIBLE_DEVICES": "1",
        }
        vllm_process = start_vllm(
            cfg.base_model,
            env=env,
            quiet=True,
            wait=300,
            gpu_memory_utilization=0.15,
            max_model_len=cfg.vllm.max_model_len,
            enable_prefix_caching=cfg.vllm.enable_prefix_caching,
            host="0.0.0.0",
            port=8000,
        )

        try:
            execute_subprocess_async(
                [
                    "axolotl",
                    "train",
                    str(Path(temp_dir) / "config.yaml"),
                    "--num-processes",
                    str(2),
                    "--main-process-port",
                    f"{get_torch_dist_unique_port()}",
                ],
                env={
                    "NCCL_P2P_LEVEL": "LOC",
                    "NCCL_DEBUG": "INFO",
                    **current_env,
                },
            )
        finally:
            recursive_kill(vllm_process)

    @pytest.mark.parametrize(
        "num_gpus",
        [1, 2],
    )
    @require_vllm
    def test_llama_fft(self, temp_dir, num_gpus):
        rnd_reward_suffix = str(random.randint(1000, 9999))
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "chat_template": "llama3",
                "rl": "grpo",
                "trl": {
                    "beta": 0.001,
                    "max_completion_length": 256,
                    "use_vllm": True,
                    "num_generations": 4,
                    "reward_funcs": [f"rewards_{rnd_reward_suffix}.rand_reward_func"],
                },
                "vllm": {
                    "max_model_len": 800,
                    "enable_prefix_caching": True,
                },
                "datasets": [
                    {
                        "path": "openai/gsm8k",
                        "name": "main",
                        "type": f"rewards_{rnd_reward_suffix}.oai_gsm8k_transform",
                    },
                ],
                "flash_attention": True,
                "sequence_len": 1024,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "max_steps": 3,
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
                "warmup_steps": 10,
                "val_set_size": 0.0,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.0001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "bf16": "auto",
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )

        self._utils_write_yaml_and_rewards(cfg, temp_dir, suffix=rnd_reward_suffix)

        current_env = os.environ.copy()
        env = {
            "NCCL_P2P_LEVEL": "LOC",  # nccl can be brittle, assume P2P isn't reliable
            **current_env,
            "CUDA_VISIBLE_DEVICES": "1",
        }
        vllm_process = start_vllm(
            cfg.base_model,
            env=env,
            quiet=True,
            wait=300,
            gpu_memory_utilization=0.15,
            max_model_len=cfg.vllm.max_model_len,
            enable_prefix_caching=cfg.vllm.enable_prefix_caching,
            host="0.0.0.0",
            port=8000,
        )

        try:
            execute_subprocess_async(
                [
                    "axolotl",
                    "train",
                    str(Path(temp_dir) / "config.yaml"),
                    "--num-processes",
                    str(num_gpus),
                    "--main-process-port",
                    f"{get_torch_dist_unique_port()}",
                ],
                env={
                    "NCCL_P2P_LEVEL": "LOC",
                    "NCCL_DEBUG": "INFO",
                    **current_env,
                },
            )
        finally:
            recursive_kill(vllm_process)


================================================
FILE: tests/e2e/multigpu/test_dist_muon_fsdp2.py
================================================
"""Test module for DistMuon optimizer with FSDP2 multi-GPU functionality."""

import os
from pathlib import Path

import torch
import yaml
from accelerate.test_utils import execute_subprocess_async
from tbparse import SummaryReader
from transformers.testing_utils import get_torch_dist_unique_port

from axolotl.utils.dict import DictDefault

from tests.e2e.utils import most_recent_subdir, require_torch_2_7_0

AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent


def verify_training_success(temp_dir):
    """Verify that training completed successfully by checking artifacts and loss."""
    output_path = Path(temp_dir)

    model_files = list(output_path.glob("*.bin")) + list(
        output_path.glob("*.safetensors")
    )
    assert len(model_files) > 0, "No model files found - training may have failed"

    checkpoint_files = list(output_path.glob("checkpoint-*"))
    assert len(checkpoint_files) > 0, (
        "No checkpoint files found - training may have failed"
    )

    tb_log_path = most_recent_subdir(temp_dir + "/runs")
    if tb_log_path:
        event_files = sorted(os.listdir(tb_log_path))
        if event_files:
            event_file = os.path.join(tb_log_path, event_files[0])
            reader = SummaryReader(event_file)
            df = reader.scalars
            train_loss_df = df[df.tag == "train/train_loss"]
            if len(train_loss_df) > 0:
                final_loss = train_loss_df.value.values[-1]
                assert not torch.isnan(torch.tensor(final_loss)), (
                    f"Training loss is NaN: {final_loss}"
                )


class TestDistMuon:
    """Test class for DistMuon optimizer with FSDP2 functionality."""

    @require_torch_2_7_0
    def test_fft_sft(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "Qwen/Qwen2.5-0.5B",
                "sequence_len": 2048,
                "val_set_size": 0.01,
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.02,
                "optimizer": "muon",
                "weight_decay": 0.01,
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "fsdp_version": 2,
                "fsdp_config": {
                    "offload_params": False,
                    "cpu_ram_efficient_loading": False,
                    "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer",
                    "state_dict_type": "FULL_STATE_DICT",
                    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
                "bf16": True,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        verify_training_success(temp_dir)

    @require_torch_2_7_0
    def test_lora_sft(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "Qwen/Qwen2.5-0.5B",
                "sequence_len": 2048,
                "val_set_size": 0.01,
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.02,
                "optimizer": "muon",
                "weight_decay": 0.01,
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "fsdp_version": 2,
                "fsdp_config": {
                    "offload_params": False,
                    "cpu_ram_efficient_loading": False,
                    "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer",
                    "state_dict_type": "FULL_STATE_DICT",
                    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
                "bf16": True,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        verify_training_success(temp_dir)


================================================
FILE: tests/e2e/multigpu/test_eval.py
================================================
"""
E2E tests for multigpu eval
"""

from pathlib import Path

import yaml
from accelerate.test_utils import execute_subprocess_async
from transformers.testing_utils import get_torch_dist_unique_port

from axolotl.utils.dict import DictDefault

from ..utils import check_tensorboard

AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent


class TestMultiGPUEval:
    """
    Test case for MultiGPU Eval Sample Packing
    """

    def test_eval_sample_packing(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "load_in_8bit": False,
                "load_in_4bit": True,
                "strict": False,
                "sequence_len": 2048,
                "adapter": "qlora",
                "sample_packing": True,
                "eval_sample_packing": True,
                "pad_to_sequence_len": True,
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "lora_modules_to_save": ["embed_tokens", "lm_head"],
                "val_set_size": 0.05,
                "special_tokens": {"pad_token": "<|endoftext|>"},
                "datasets": [
                    {
                        "path": "teknium/GPT4-LLM-Cleaned",
                        "type": "alpaca",
                        "split": "train[:5%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "loss_watchdog_threshold": 5.0,
                "loss_watchdog_patience": 3,
                "bf16": "auto",
                "warmup_steps": 1,
                "evals_per_epoch": 2,
                "eval_max_new_tokens": 128,
                "saves_per_epoch": 1,
                "logging_steps": 1,
                "weight_decay": 0.0,
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "accelerate",
                "launch",
                "--num-processes",
                "2",
                "--main_process_port",
                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.train",
                str(Path(temp_dir) / "config.yaml"),
            ]
        )

        check_tensorboard(temp_dir + "/runs", "eval/loss", 2.5, "Eval Loss is too high")

    def test_eval(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "load_in_8bit": False,
                "load_in_4bit": True,
                "strict": False,
                "sequence_len": 2048,
                "adapter": "qlora",
                "sample_packing": True,
                "eval_sample_packing": False,
                "pad_to_sequence_len": True,
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "lora_modules_to_save": ["embed_tokens", "lm_head"],
                "val_set_size": 0.01,
                "special_tokens": {"pad_token": "<|endoftext|>"},
                "datasets": [
                    {
                        "path": "teknium/GPT4-LLM-Cleaned",
                        "type": "alpaca",
                        "split": "train[:5%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "loss_watchdog_threshold": 5.0,
                "loss_watchdog_patience": 3,
                "bf16": "auto",
                "warmup_steps": 1,
                "evals_per_epoch": 2,
                "eval_max_new_tokens": 128,
                "saves_per_epoch": 1,
                "logging_steps": 1,
                "weight_decay": 0.0,
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "accelerate",
                "launch",
                "--num-processes",
                "2",
                "--main_process_port",
                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.train",
                str(Path(temp_dir) / "config.yaml"),
            ]
        )

        check_tensorboard(temp_dir + "/runs", "eval/loss", 2.9, "Eval Loss is too high")


================================================
FILE: tests/e2e/multigpu/test_fp8_fsdp2.py
================================================
"""Test module for FP8 mixed precision with FSDP2 multi-GPU functionality."""

import os
from pathlib import Path

import torch
import yaml
from accelerate.test_utils import execute_subprocess_async
from tbparse import SummaryReader
from transformers.testing_utils import get_torch_dist_unique_port

from axolotl.utils.dict import DictDefault

from tests.e2e.utils import most_recent_subdir, require_torch_2_7_0, supports_fp8

AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent


def verify_fp8_training_success(temp_dir):
    """Verify that FP8 training completed successfully by checking artifacts and loss."""
    output_path = Path(temp_dir)

    model_files = list(output_path.glob("*.bin")) + list(
        output_path.glob("*.safetensors")
    )
    assert len(model_files) > 0, "No model files found - training may have failed"

    checkpoint_files = list(output_path.glob("checkpoint-*"))
    assert len(checkpoint_files) > 0, (
        "No checkpoint files found - training may have failed"
    )

    tb_log_path = most_recent_subdir(temp_dir + "/runs")
    if tb_log_path:
        event_files = sorted(os.listdir(tb_log_path))
        if event_files:
            event_file = os.path.join(tb_log_path, event_files[0])
            reader = SummaryReader(event_file)
            df = reader.scalars
            train_loss_df = df[df.tag == "train/train_loss"]
            if len(train_loss_df) > 0:
                final_loss = train_loss_df.value.values[-1]
                assert not torch.isnan(torch.tensor(final_loss)), (
                    f"Training loss is NaN: {final_loss}"
                )


class TestFP8FSDP2:
    """Test class for FP8 mixed precision with FSDP2 functionality."""

    @require_torch_2_7_0
    @supports_fp8
    def test_fp8_fsdp2_smoke(self, temp_dir):
        """Smoke test for 2-GPU FP8 + torch.compile + FSDP2 training"""
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "trust_remote_code": True,
                "sequence_len": 512,
                "val_set_size": 0.05,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 3,  # Very short smoke test
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",  # Use standard optimizer for stability
                "lr_scheduler": "cosine",
                "sdp_attention": True,
                "pad_to_seq_len": True,
                "sample_packing": True,
                # FP8 configuration
                "fp8": True,
                "fp8_enable_fsdp_float8_all_gather": True,
                "torch_compile": True,
                # FSDP2 configuration
                "fsdp_version": 2,
                "fsdp_config": {
                    "offload_params": False,
                    "cpu_ram_efficient_loading": False,
                    "transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
                    "state_dict_type": "FULL_STATE_DICT",
                    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        verify_fp8_training_success(temp_dir)


================================================
FILE: tests/e2e/multigpu/test_fsdp1.py
================================================
"""Test module for FSDP1 multi-GPU functionality."""

import os
from pathlib import Path

import pytest
import torch
import yaml
from accelerate.test_utils import execute_subprocess_async
from tbparse import SummaryReader
from transformers.testing_utils import get_torch_dist_unique_port

from axolotl.utils.dict import DictDefault

from tests.e2e.utils import most_recent_subdir

AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent


def verify_training_success(temp_dir):
    """Verify that training completed successfully by checking artifacts and loss."""
    output_path = Path(temp_dir)

    model_files = list(output_path.glob("*.bin")) + list(
        output_path.glob("*.safetensors")
    )
    assert len(model_files) > 0, "No model files found - training may have failed"

    checkpoint_files = list(output_path.glob("checkpoint-*"))
    assert len(checkpoint_files) > 0, (
        "No checkpoint files found - training may have failed"
    )

    tb_log_path = most_recent_subdir(temp_dir + "/runs")
    if tb_log_path:
        event_files = sorted(os.listdir(tb_log_path))
        if event_files:
            event_file = os.path.join(tb_log_path, event_files[0])
            reader = SummaryReader(event_file)
            df = reader.scalars
            train_loss_df = df[df.tag == "train/train_loss"]
            if len(train_loss_df) > 0:
                final_loss = train_loss_df.value.values[-1]
                assert not torch.isnan(torch.tensor(final_loss)), (
                    f"Training loss is NaN: {final_loss}"
                )


class TestFSDP1:
    """Test class for FSDP1 functionality."""

    @pytest.mark.parametrize(
        "fsdp_cpu_ram_efficient_loading",
        [True, False],
    )
    def test_fft_sft(self, temp_dir, fsdp_cpu_ram_efficient_loading):
        cfg = DictDefault(
            {
                "base_model": "Qwen/Qwen2.5-0.5B",
                "sequence_len": 2048,
                "val_set_size": 0.01,
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "fsdp_version": "1",
                "fsdp_config": {
                    "fsdp_offload_params": False,
                    "fsdp_cpu_ram_efficient_loading": fsdp_cpu_ram_efficient_loading,
                    "fsdp_transformer_layer_cls_to_wrap": "Qwen2DecoderLayer",
                    "fsdp_state_dict_type": "FULL_STATE_DICT",
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    "fsdp_sharding_strategy": "FULL_SHARD",
                    "fsdp_sync_module_states": True,
                    "fsdp_use_orig_params": False,
                },
                "use_tensorboard": True,
                "bf16": True,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        verify_training_success(temp_dir)

    @pytest.mark.parametrize(
        "adapter_config",
        [
            {
                "adapter": "lora",
                "load_in_4bit": False,
            },
            {
                "adapter": "qlora",
                "load_in_4bit": True,
            },
        ],
    )
    def test_lora_sft(self, temp_dir, adapter_config):
        cfg = DictDefault(
            {
                "base_model": "Qwen/Qwen2.5-0.5B",
                "sequence_len": 2048,
                "val_set_size": 0.01,
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "adapter": adapter_config["adapter"],
                "load_in_4bit": adapter_config["load_in_4bit"],
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "fsdp_version": "1",
                "fsdp_config": {
                    "fsdp_offload_params": False,
                    "fsdp_cpu_ram_efficient_loading": True,
                    "fsdp_transformer_layer_cls_to_wrap": "Qwen2DecoderLayer",
                    "fsdp_state_dict_type": "FULL_STATE_DICT",
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    "fsdp_sharding_strategy": "FULL_SHARD",
                    "fsdp_sync_module_states": True,
                    "fsdp_use_orig_params": False,
                },
                "use_tensorboard": True,
                "bf16": True,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        verify_training_success(temp_dir)

    @pytest.mark.skip(reason="slow test, deprecate fsdp1 asap")
    def test_dpo_fft(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "Qwen/Qwen2.5-0.5B",
                "sequence_len": 2048,
                "val_set_size": 0.01,
                "rl": "dpo",
                "chat_template": "chatml",
                "datasets": [
                    {
                        "path": "Intel/orca_dpo_pairs",
                        "split": "train",
                        "type": "chatml.intel",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "fsdp_version": "1",
                "fsdp_config": {
                    "fsdp_offload_params": False,
                    "fsdp_cpu_ram_efficient_loading": True,
                    "fsdp_transformer_layer_cls_to_wrap": "Qwen2DecoderLayer",
                    "fsdp_state_dict_type": "FULL_STATE_DICT",
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    "fsdp_sharding_strategy": "FULL_SHARD",
                    "fsdp_sync_module_states": True,
                    "fsdp_use_orig_params": False,
                },
                "use_tensorboard": True,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        verify_training_success(temp_dir)

    @pytest.mark.skip("broken in transformers v5")
    @pytest.mark.parametrize(
        "adapter_config",
        [
            {
                "adapter": "lora",
                "load_in_4bit": False,
            },
            {
                "adapter": "qlora",
                "load_in_4bit": True,
            },
        ],
    )
    def test_dpo_lora(self, temp_dir, adapter_config):
        cfg = DictDefault(
            {
                "base_model": "Qwen/Qwen2.5-0.5B",
                "load_in_4bit": adapter_config["load_in_4bit"],
                "rl": "dpo",
                "chat_template": "chatml",
                "sequence_len": 2048,
                "adapter": adapter_config["adapter"],
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.01,
                "datasets": [
                    {
                        "path": "Intel/orca_dpo_pairs",
                        "split": "train",
                        "type": "chatml.intel",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "fsdp_version": "1",
                "fsdp_config": {
                    "fsdp_offload_params": False,
                    "fsdp_cpu_ram_efficient_loading": True,
                    "fsdp_transformer_layer_cls_to_wrap": "Qwen2DecoderLayer",
                    "fsdp_state_dict_type": "FULL_STATE_DICT",
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    "fsdp_sharding_strategy": "FULL_SHARD",
                    "fsdp_sync_module_states": True,
                    "fsdp_use_orig_params": False,
                },
                "use_tensorboard": True,
                "bf16": "auto",
                "tf32": True,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        verify_training_success(temp_dir)


================================================
FILE: tests/e2e/multigpu/test_fsdp2.py
================================================
"""Test module for FSDP2 multi-GPU functionality."""

import os
from pathlib import Path

import pytest
import torch
import yaml
from accelerate.test_utils import execute_subprocess_async
from tbparse import SummaryReader
from transformers.testing_utils import get_torch_dist_unique_port

from axolotl.utils.dict import DictDefault

from tests.e2e.utils import most_recent_subdir, require_torch_2_7_0

AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent


def verify_training_success(temp_dir):
    """Verify that training completed successfully by checking artifacts and loss."""
    output_path = Path(temp_dir)

    model_files = list(output_path.glob("*.bin")) + list(
        output_path.glob("*.safetensors")
    )
    assert len(model_files) > 0, "No model files found - training may have failed"

    checkpoint_files = list(output_path.glob("checkpoint-*"))
    assert len(checkpoint_files) > 0, (
        "No checkpoint files found - training may have failed"
    )

    tb_log_path = most_recent_subdir(temp_dir + "/runs")
    if tb_log_path:
        event_files = sorted(os.listdir(tb_log_path))
        if event_files:
            event_file = os.path.join(tb_log_path, event_files[0])
            reader = SummaryReader(event_file)
            df = reader.scalars
            train_loss_df = df[df.tag == "train/train_loss"]
            if len(train_loss_df) > 0:
                final_loss = train_loss_df.value.values[-1]
                assert not torch.isnan(torch.tensor(final_loss)), (
                    f"Training loss is NaN: {final_loss}"
                )


class TestFSDP2:
    """Test class for FSDP2 functionality."""

    @require_torch_2_7_0
    @pytest.mark.parametrize(
        "fsdp_cpu_ram_efficient_loading",
        [True, False],
    )
    def test_fft_sft(self, temp_dir, fsdp_cpu_ram_efficient_loading):
        cfg = DictDefault(
            {
                "base_model": "Qwen/Qwen2.5-0.5B",
                "sequence_len": 2048,
                "val_set_size": 0.01,
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "fsdp_version": 2,
                "fsdp_config": {
                    "offload_params": False,
                    "cpu_ram_efficient_loading": fsdp_cpu_ram_efficient_loading,
                    "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer",
                    "state_dict_type": "FULL_STATE_DICT",
                    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
                "bf16": True,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        verify_training_success(temp_dir)

    @require_torch_2_7_0
    @pytest.mark.parametrize("peft_use_dora", [True, False])
    def test_lora_sft(self, temp_dir, peft_use_dora):
        cfg = DictDefault(
            {
                "base_model": "Qwen/Qwen2.5-0.5B",
                "sequence_len": 2048,
                "val_set_size": 0.01,
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "peft_use_dora": peft_use_dora,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "fsdp_version": 2,
                "fsdp_config": {
                    "offload_params": False,
                    "cpu_ram_efficient_loading": False,
                    "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer",
                    "state_dict_type": "FULL_STATE_DICT",
                    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
                "bf16": True,
                # explicitly disable LORA kernels, as they may be auto-enabled
                "lora_mlp_kernel": False,
                "lora_qkv_kernel": False,
                "lora_o_kernel": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        verify_training_success(temp_dir)

    @require_torch_2_7_0
    def test_lora_sft_kernels(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "Qwen/Qwen2.5-0.5B",
                "sequence_len": 2048,
                "val_set_size": 0.01,
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_target_linear": True,
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "fsdp_version": 2,
                "fsdp_config": {
                    "offload_params": False,
                    "cpu_ram_efficient_loading": False,
                    "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer",
                    "state_dict_type": "FULL_STATE_DICT",
                    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
                "bf16": True,
                "lora_mlp_kernel": True,
                "lora_qkv_kernel": True,
                "lora_o_kernel": True,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        verify_training_success(temp_dir)

    @require_torch_2_7_0
    def test_qlora_sft(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "Qwen/Qwen2.5-0.5B",
                "sequence_len": 2048,
                "val_set_size": 0.01,
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "load_in_4bit": True,
                "adapter": "qlora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "fsdp_version": 2,
                "fsdp_config": {
                    "offload_params": False,
                    "cpu_ram_efficient_loading": False,
                    "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer",
                    "state_dict_type": "FULL_STATE_DICT",
                    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
                "bf16": True,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        verify_training_success(temp_dir)

    @require_torch_2_7_0
    def test_qlora_sft_kernels(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "Qwen/Qwen2.5-0.5B",
                "sequence_len": 2048,
                "val_set_size": 0.01,
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "load_in_4bit": True,
                "adapter": "qlora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_target_linear": True,
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "fsdp_version": 2,
                "fsdp_config": {
                    "offload_params": False,
                    "cpu_ram_efficient_loading": False,
                    "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer",
                    "state_dict_type": "FULL_STATE_DICT",
                    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
                "bf16": True,
                "lora_mlp_kernel": True,
                "lora_qkv_kernel": True,
                "lora_o_kernel": True,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        verify_training_success(temp_dir)

    @pytest.mark.skip(reason="slow test w cu129 + torch 2.9.1 + py3.12")
    @require_torch_2_7_0
    def test_dpo_fft(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "Qwen/Qwen2.5-0.5B",
                "sequence_len": 2048,
                "val_set_size": 0.01,
                "rl": "dpo",
                "chat_template": "chatml",
                "datasets": [
                    {
                        "path": "Intel/orca_dpo_pairs",
                        "split": "train",
                        "type": "chatml.intel",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "fsdp_version": 2,
                "fsdp_config": {
                    "offload_params": False,
                    "cpu_ram_efficient_loading": False,
                    "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer",
                    "state_dict_type": "FULL_STATE_DICT",
                    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        verify_training_success(temp_dir)

    @pytest.mark.skip(reason="slow test w cu129 + torch 2.9.1 + py3.12")
    @require_torch_2_7_0
    def test_dpo_lora(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "Qwen/Qwen2.5-0.5B",
                "sequence_len": 2048,
                "rl": "dpo",
                "chat_template": "chatml",
                "datasets": [
                    {
                        "path": "Intel/orca_dpo_pairs",
                        "split": "train",
                        "type": "chatml.intel",
                    },
                ],
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "fsdp_version": 2,
                "fsdp_config": {
                    "offload_params": False,
                    "cpu_ram_efficient_loading": False,
                    "transformer_layer_cls_to_wrap": "Qwen2DecoderLayer",
                    "state_dict_type": "FULL_STATE_DICT",
                    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        verify_training_success(temp_dir)


================================================
FILE: tests/e2e/multigpu/test_gemma3.py
================================================
"""
E2E tests for multigpu lora tinyllama
"""

from pathlib import Path

import pytest
import yaml
from accelerate.test_utils import execute_subprocess_async
from huggingface_hub import snapshot_download
from transformers.testing_utils import get_torch_dist_unique_port

from axolotl.utils.dict import DictDefault

from tests.e2e.utils import check_tensorboard

AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent


@pytest.fixture(scope="session", autouse=True)
def download_model():
    # download the model
    snapshot_download("axolotl-mirrors/gemma-3-4b-pt", repo_type="model")


@pytest.mark.skip(reason="FIXME")
class TestMultiGPUGemma3:
    """
    Test case for Gemma3 models using LoRA
    """

    def test_lora_ddp_packed(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "axolotl-mirrors/gemma-3-4b-pt",
                "unfrozen_parameters": ["model.language_model.*", "lm_head"],
                "sequence_len": 2048,
                "ddp_find_unused_parameters": True,
                "sample_packing": True,
                "eval_sample_packing": False,
                "pad_to_sequence_len": True,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.0,
                "chat_template": "gemma3",
                "datasets": [
                    {
                        "path": "mlabonne/FineTome-100k",
                        "type": "chat_template",
                        "split": "train[:10%]",
                        "field_messages": "conversations",
                        "message_field_role": "from",
                        "message_field_content": "value",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 4,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {
                    "use_reentrant": False,
                },
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.0001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 1.8, "Train Loss (%s) is too high"
        )


================================================
FILE: tests/e2e/multigpu/test_llama.py
================================================
"""
E2E tests for multigpu lora tinyllama
"""

from pathlib import Path

import pytest
import transformers
import yaml
from accelerate.test_utils import execute_subprocess_async
from huggingface_hub import snapshot_download
from packaging import version
from transformers.testing_utils import get_torch_dist_unique_port

from axolotl.utils.dict import DictDefault

from tests.e2e.utils import check_tensorboard, require_torch_2_6_0

AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent


@pytest.fixture(scope="session", autouse=True)
def download_model():
    # download the model
    snapshot_download("HuggingFaceTB/SmolLM2-135M")


def transformers_version_eq(required_version):
    return version.parse(transformers.__version__) == version.parse(required_version)


class TestMultiGPULlama:
    """
    Test case for Llama models using LoRA
    """

    def test_lora_ddp(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 2048,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 2,
                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.8, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
        "gradient_accumulation_steps",
        [1, 2],
    )
    def test_lora_ddp_packed(self, temp_dir, gradient_accumulation_steps):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 2048,
                "sample_packing": True,
                "eval_sample_packing": False,
                "pad_to_sequence_len": True,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.05,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:20%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": gradient_accumulation_steps,
                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    def test_dpo_lora_ddp(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 2048,
                "sample_packing": False,
                "eval_sample_packing": False,
                "pad_to_sequence_len": True,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "rl": "dpo",
                "chat_template": "chatml",
                "datasets": [
                    {
                        "path": "fozziethebeat/alpaca_messages_2k_dpo_test",
                        "type": "chat_template.default",
                        "field_messages": "conversation",
                        "field_chosen": "chosen",
                        "field_rejected": "rejected",
                        "message_field_role": "role",
                        "message_field_content": "content",
                        "roles": {
                            "system": ["system"],
                            "user": ["user"],
                            "assistant": ["assistant"],
                        },
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                "gradient_checkpointing": False,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "warmup_steps": 0,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        loss_threshold = 2.3
        check_tensorboard(
            temp_dir + "/runs",
            "train/train_loss",
            loss_threshold,
            "Train Loss (%s) is too high",
        )

    def test_dpo_qlora_ddp(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 2048,
                "sample_packing": False,
                "eval_sample_packing": False,
                "pad_to_sequence_len": True,
                "load_in_4bit": True,
                "adapter": "qlora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "rl": "dpo",
                "chat_template": "chatml",
                "datasets": [
                    {
                        "path": "fozziethebeat/alpaca_messages_2k_dpo_test",
                        "type": "chat_template.default",
                        "field_messages": "conversation",
                        "field_chosen": "chosen",
                        "field_rejected": "rejected",
                        "message_field_role": "role",
                        "message_field_content": "content",
                        "roles": {
                            "system": ["system"],
                            "user": ["user"],
                            "assistant": ["assistant"],
                        },
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                "gradient_checkpointing": False,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "warmup_steps": 0,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        loss_threshold = 2.3
        check_tensorboard(
            temp_dir + "/runs",
            "train/train_loss",
            loss_threshold,
            "Train Loss (%s) is too high",
        )

    @pytest.mark.parametrize(
        "gradient_accumulation_steps",
        [1, 2],
    )
    def test_fsdp(self, temp_dir, gradient_accumulation_steps):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 2048,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": gradient_accumulation_steps,
                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "fsdp": [
                    "full_shard",
                    "auto_wrap",
                ],
                "fsdp_config": {
                    "fsdp_offload_params": False,
                    "fsdp_sync_module_states": True,
                    "fsdp_use_orig_params": False,
                    "fsdp_cpu_ram_efficient_loading": False,
                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
                    "fsdp_state_dict_type": "FULL_STATE_DICT",
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                },
                "use_tensorboard": True,
                "seed": 42,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
        "fsdp_state_dict_type",
        [
            "FULL_STATE_DICT",
            # "SHARDED_STATE_DICT",  # not supported since intermediate checkpoints fail with fsdp1
        ],
    )
    def test_fsdp_packed(self, temp_dir, fsdp_state_dict_type):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "sequence_len": 1024,
                "val_set_size": 0.05,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 3,
                "save_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "fsdp": [
                    "full_shard",
                    "auto_wrap",
                ],
                "fsdp_config": {
                    "fsdp_offload_params": False,
                    "fsdp_sync_module_states": True,
                    "fsdp_use_orig_params": False,
                    "fsdp_cpu_ram_efficient_loading": False,
                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
                    "fsdp_state_dict_type": fsdp_state_dict_type,
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                },
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    @require_torch_2_6_0
    @pytest.mark.parametrize(
        "attention_backend",
        ["flash", "flex"],
    )
    @pytest.mark.parametrize(
        "fsdp_reshard_after_forward",
        [True, False],
    )
    def test_fsdp2_packed(
        self, temp_dir, attention_backend, fsdp_reshard_after_forward
    ):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "sequence_len": 2048,
                "val_set_size": 0.1,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
                "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_8bit",
                "lr_scheduler": "cosine",
                "fsdp": [
                    "auto_wrap",
                ],
                "fsdp_config": {
                    "fsdp_version": 2,
                    # "fsdp_forward_prefetch": True,  # not yet implemented in accelerate
                    "fsdp_offload_params": False,
                    "fsdp_cpu_ram_efficient_loading": False,
                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
                    "fsdp_state_dict_type": "SHARDED_STATE_DICT",
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    "fsdp_reshard_after_forward": fsdp_reshard_after_forward,
                },
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )
        if attention_backend == "flash":
            cfg.flash_attention = True
        elif attention_backend == "flex":
            cfg.flex_attention = True

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
        )

    def test_fsdp_qlora_prequant_packed(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "axolotl-ai-co/SmolLM2-135M-bnb-nf4-bf16",
                "adapter": "qlora",
                "mean_resizing_embeddings": True,
                "load_in_4bit": True,
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                # "lora_modules_to_save": [
                #     "embed_tokens",
                #     "lm_head",
                # ],
                "sample_packing": True,
                "eval_sample_packing": False,
                "pad_to_sequence_len": True,
                "sequence_len": 1024,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "fsdp": [
                    "full_shard",
                    "auto_wrap",
                ],
                "fsdp_config": {
                    "fsdp_offload_params": False,
                    "fsdp_sync_module_states": True,
                    "fsdp_use_orig_params": False,
                    "fsdp_cpu_ram_efficient_loading": True,
                    "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
                    "fsdp_state_dict_type": "FULL_STATE_DICT",
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                },
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
        "gradient_accumulation_steps",
        [1, 2],
    )
    @pytest.mark.parametrize(
        "deepspeed",
        [
            "deepspeed_configs/zero3_bf16.json",
            "deepspeed_configs/zero3_bf16_cpuoffload_all.json",
            # "deepspeed_configs/zero3_bf16_cpuoffload_params.json",
        ],
    )
    @pytest.mark.parametrize(
        "qlora",
        [True, False],
    )
    def test_ds_zero3_packed(
        self, temp_dir, gradient_accumulation_steps, deepspeed, qlora
    ):
        if qlora:
            adapter = {
                "adapter": "qlora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "load_in_4bit": True,
            }
        else:
            adapter = {}
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "sequence_len": 1024,
                "val_set_size": 0.05,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": gradient_accumulation_steps,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "deepspeed": str(AXOLOTL_ROOT / deepspeed),
                "use_tensorboard": True,
                "save_first_step": False,
                **adapter,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.45, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
        "gradient_accumulation_steps",
        [1, 2],
    )
    @pytest.mark.parametrize(
        "qlora",
        [True, False],
    )
    def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps, qlora):
        if qlora:
            adapter = {
                "adapter": "qlora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "load_in_4bit": True,
            }
        else:
            adapter = {}
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "sequence_len": 1024,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": gradient_accumulation_steps,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
                "use_tensorboard": True,
                "seed": 42,
                "save_first_step": False,
                **adapter,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
        "gradient_accumulation_steps",
        [1, 2],
    )
    @pytest.mark.parametrize(
        "qlora",
        [True, False],
    )
    def test_ds_zero1_packed(self, temp_dir, gradient_accumulation_steps, qlora):
        if qlora:
            adapter = {
                "adapter": "qlora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "load_in_4bit": True,
            }
        else:
            adapter = {}
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "sequence_len": 1024,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": gradient_accumulation_steps,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
                "use_tensorboard": True,
                "save_first_step": False,
                **adapter,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
        )

    @pytest.mark.skip(
        reason="fix untrained tokens brittle with lots of edge cases in latest transformers"
    )
    def test_fix_untrained_tokens(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "fix_untrained_tokens": True,
                "sequence_len": 512,
                "val_set_size": 0.0,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                    "bos_token": "<|custom_im_start|>",
                    "eos_token": "<|custom_im_end|>",
                },
                "datasets": [
                    {
                        "chat_template": "jinja",
                        "chat_template_jinja": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|custom_im_start|>' + message['role'] + '\n' + message['content'] + '<|custom_im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|custom_im_start|>assistant\n' }}{% endif %}",
                        "path": "mlabonne/FineTome-100k",
                        "type": "chat_template",
                        "split": "train[:10%]",
                        "field_messages": "conversations",
                        "message_field_role": "from",
                        "message_field_content": "value",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                # "gradient_checkpointing": True,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "sample_packing": True,
                "bf16": True,
                # "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 4.0, "Train Loss (%s) is too high"
        )


================================================
FILE: tests/e2e/multigpu/test_locking.py
================================================
"""Tests for FileLockLoader class."""

import tempfile
import threading
import time
from pathlib import Path
from unittest.mock import MagicMock, Mock, patch

import pytest

from axolotl.utils.data.lock import FileLockLoader
from axolotl.utils.dict import DictDefault


class TestFileLockLoader:
    """Class with tests for FileLockLoader."""

    @pytest.fixture
    def temp_dir(self):
        """Create a temporary directory for testing."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            yield Path(tmp_dir)

    @pytest.fixture
    def cfg(self, temp_dir):
        """Create a test configuration."""
        return DictDefault({"dataset_prepared_path": str(temp_dir)})

    @pytest.fixture
    def loader(self, cfg):
        """Create a FileLockLoader instance for testing."""
        return FileLockLoader(cfg)

    def test_load_first_process(self, loader):
        """Test load() when no ready flag exists (first process)."""
        mock_load_fn = Mock(return_value="test_data")

        result = loader.load(mock_load_fn)

        # Should call the load function
        mock_load_fn.assert_called_once()
        assert result == "test_data"

        # Should create the ready flag
        assert loader.ready_flag_path.exists()

    def test_load_subsequent_process(self, loader):
        """Test load() when ready flag already exists (subsequent process)."""
        # Create ready flag first
        loader.ready_flag_path.touch()

        mock_load_fn = Mock(return_value="loaded_data")

        result = loader.load(mock_load_fn)

        # Should still call load function (to load the prepared data)
        mock_load_fn.assert_called_once()
        assert result == "loaded_data"

    def test_load_concurrent_processes(self, cfg):
        """Test that concurrent processes coordinate correctly."""
        results = []
        call_count = 0

        def slow_load_fn():
            nonlocal call_count
            call_count += 1
            time.sleep(0.1)  # Simulate slow loading
            return f"data_{call_count}"

        def worker():
            loader = FileLockLoader(cfg)
            result = loader.load(slow_load_fn)
            results.append(result)

        # Start multiple threads simultaneously
        threads = [threading.Thread(target=worker) for _ in range(3)]
        for t in threads:
            t.start()
        for t in threads:
            t.join()

        # Only one thread should have done the initial loading
        # All should return data, but the load function should be called
        # once by the first process and once by each subsequent process
        assert len(results) == 3
        assert all(result.startswith("data_") for result in results)

    @patch("time.sleep")
    def test_load_waiting_for_ready_flag(self, mock_sleep, loader):
        """Test that processes wait for the ready flag to appear."""
        mock_load_fn = Mock(return_value="waiting_data")
        mock_ready_flag_path = Mock()
        exists_call_count = 0

        def mock_exists():
            nonlocal exists_call_count
            exists_call_count += 1

            if exists_call_count == 1:
                # First check: ready flag exists (not first process)
                return True
            if exists_call_count <= 3:
                # While loop checks: flag doesn't exist yet
                return False
            return True

        mock_ready_flag_path.exists.side_effect = mock_exists

        # Replace the ready_flag_path with our mock
        original_path = loader.ready_flag_path
        loader.ready_flag_path = mock_ready_flag_path

        try:
            result = loader.load(mock_load_fn)
        finally:
            # Restore original path
            loader.ready_flag_path = original_path

        # Should have slept twice while waiting
        assert mock_sleep.call_count == 2
        mock_sleep.assert_called_with(1)

        # Should eventually call load function
        mock_load_fn.assert_called_once()
        assert result == "waiting_data"

    def test_complete_workflow_with_cleanup(self, loader):
        """Test the complete load -> cleanup workflow."""
        mock_load_fn = Mock(return_value="test_data")

        # First process calls load (this should set up counter)
        result = loader.load(mock_load_fn)
        assert result == "test_data"
        assert loader.ready_flag_path.exists()
        assert loader.counter_path.exists()

        # Cleanup should remove everything since there's only one process
        loader.cleanup()
        assert not loader.ready_flag_path.exists()
        assert not loader.counter_path.exists()

    def test_multiple_processes_workflow(self, loader):
        """Test workflow with multiple processes."""
        # Simulate multiple processes by manually setting up counter
        loader.ready_flag_path.touch()
        loader.counter_path.write_text("3")  # 3 processes

        # First process cleanup
        loader.cleanup()
        assert loader.ready_flag_path.exists()
        assert loader.counter_path.read_text().strip() == "2"

        # Second process cleanup
        loader.cleanup()
        assert loader.ready_flag_path.exists()
        assert loader.counter_path.read_text().strip() == "1"

        # Last process cleanup
        loader.cleanup()
        assert not loader.ready_flag_path.exists()
        assert not loader.counter_path.exists()

    def test_load_exception_handling(self, loader):
        """Test behavior when load_fn raises an exception."""

        def failing_load_fn():
            raise ValueError("Load failed")

        with pytest.raises(ValueError, match="Load failed"):
            loader.load(failing_load_fn)

        # Ready flag should not be created on failure
        assert not loader.ready_flag_path.exists()

    def test_file_lock_called(self, loader):
        """Test that FileLock is properly used."""
        mock_load_fn = Mock(return_value="locked_data")

        with patch("axolotl.utils.data.lock.FileLock") as mock_filelock:
            mock_context = MagicMock()
            mock_filelock.return_value.__enter__ = Mock(return_value=mock_context)
            mock_filelock.return_value.__exit__ = Mock(return_value=None)

            loader.load(mock_load_fn)

            # Verify FileLock was called with correct path
            mock_filelock.assert_called_once_with(str(loader.lock_file_path))

            # Verify context manager was used
            mock_filelock.return_value.__enter__.assert_called_once()
            mock_filelock.return_value.__exit__.assert_called_once()


================================================
FILE: tests/e2e/multigpu/test_ray.py
================================================
"""
E2E tests for multigpu post-training use Ray Train
"""

from pathlib import Path

import pytest
import yaml
from accelerate.test_utils import execute_subprocess_async

from axolotl.utils.dict import DictDefault

from tests.e2e.utils import (
    check_tensorboard,
    require_torch_2_7_0,
)

AXOLOTL_ROOT = Path(__file__).parent.parent.parent.parent


class TestMultiGPURay:
    """
    Test cases for AnyScale Ray post training
    """

    @require_torch_2_7_0
    def test_lora_ddp(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.05,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "use_tensorboard": True,
                "use_ray": True,
                "ray_num_workers": 2,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--use-ray",
                "--ray-num-workers",
                "2",
            ]
        )

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    @require_torch_2_7_0
    @pytest.mark.parametrize(
        "gradient_accumulation_steps",
        [1, 2],
    )
    def test_ds_zero2_packed(self, temp_dir, gradient_accumulation_steps):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "sequence_len": 1024,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": gradient_accumulation_steps,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero2.json"),
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--use-ray",
                "--ray-num-workers",
                "2",
            ]
        )

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )

    @require_torch_2_7_0
    @pytest.mark.parametrize(
        "gradient_accumulation_steps",
        [1, 2],
    )
    def test_sft_fsdp2_packed(self, temp_dir, gradient_accumulation_steps):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "sequence_len": 1024,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": gradient_accumulation_steps,
                "output_dir": temp_dir,
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "fsdp_version": 2,
                "fsdp_config": {
                    "offload_params": False,
                    "cpu_ram_efficient_loading": False,
                    "transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
                    "state_dict_type": "FULL_STATE_DICT",
                    "auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    "reshard_after_forward": True,
                },
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--use-ray",
                "--ray-num-workers",
                "2",
            ]
        )

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.3, "Train Loss (%s) is too high"
        )


================================================
FILE: tests/e2e/multigpu/test_tp.py
================================================
"""multigpu e2e test for tensor parallelism."""

from pathlib import Path

import pytest
import yaml
from accelerate.test_utils import execute_subprocess_async, get_torch_dist_unique_port

from axolotl.utils.dict import DictDefault

from tests.e2e.utils import check_tensorboard, require_torch_2_7_0


class TestTensorParallel:
    """Test class for Tensor Parallel functionality."""

    @pytest.mark.skip(
        reason="TP doesn't work with models with tied weights (embeddings)"
    )
    @require_torch_2_7_0
    def test_fft_sft(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "Qwen/Qwen2.5-0.5B",
                "sequence_len": 2048,
                "val_set_size": 0.01,
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch",
                "tensor_parallel_size": 2,
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "train",
                str(Path(temp_dir) / "config.yaml"),
                "--num-processes",
                "2",
                "--main-process-port",
                f"{get_torch_dist_unique_port()}",
            ]
        )

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 1.0, "Train Loss (%s) is too high"
        )


================================================
FILE: tests/e2e/patched/__init__.py
================================================


================================================
FILE: tests/e2e/patched/lora_kernels/__init__.py
================================================


================================================
FILE: tests/e2e/patched/lora_kernels/test_lora_kernel_patching.py
================================================
"""Integration tests for LoRA activation and attention kernels."""

from pathlib import Path

import pytest
import torch
import yaml
from accelerate.state import PartialState
from peft import PeftModelForCausalLM, get_peft_config
from transformers import AutoModelForCausalLM, LlamaForCausalLM
from transformers.models.llama.configuration_llama import LlamaConfig
from transformers.models.llama.modeling_llama import LlamaAttention
from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeAttention

from axolotl.cli.config import load_cfg
from axolotl.kernels.lora import (
    apply_lora_mlp_geglu,
    apply_lora_mlp_swiglu,
    apply_lora_o,
    apply_lora_qkv,
)
from axolotl.loaders.model import ModelLoader
from axolotl.loaders.tokenizer import load_tokenizer
from axolotl.monkeypatch.lora_kernels import (
    apply_lora_kernel_patches,
    find_self_attn_in_layer,
    get_attention_cls_from_config,
    get_layers,
    patch_self_attn_lora,
)
from axolotl.utils.dict import DictDefault

MODEL_CONFIGS = [
    {
        "name": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
        "expected_activation": apply_lora_mlp_swiglu,
        "dtype": torch.float16,
    },
    {
        "name": "trl-internal-testing/tiny-Qwen2ForCausalLM-2.5",
        "expected_activation": apply_lora_mlp_swiglu,
        "dtype": torch.float16,
    },
    {
        "name": "HuggingFaceTB/SmolLM2-135M",
        "expected_activation": apply_lora_mlp_swiglu,
        "dtype": torch.float32,
    },
    {
        "name": "trl-internal-testing/tiny-Gemma2ForCausalLM",
        "expected_activation": apply_lora_mlp_geglu,
        "dtype": torch.float16,
    },
]


@pytest.fixture(autouse=True)
def init_accelerate():
    """Initialize Accelerate state before tests."""
    _ = PartialState()


@pytest.fixture
def small_llama_model():
    """Create a small LLaMA model for testing."""
    config = {
        "vocab_size": 100,
        "hidden_size": 128,
        "intermediate_size": 256,
        "num_hidden_layers": 2,
        "num_attention_heads": 4,
    }

    return LlamaForCausalLM(LlamaConfig(**config))


@pytest.mark.parametrize(
    "model_name,attention_cls",
    [
        ("HuggingFaceTB/SmolLM2-135M", LlamaAttention),
        ("Qwen/Qwen3-30B-A3B", Qwen3MoeAttention),
    ],
)
def test_attention_patching_integration(model_name, attention_cls):
    """Test attention patching in integration context."""
    cfg = DictDefault({"base_model": model_name})

    # Store the original implementation
    original_forward = attention_cls.forward

    # Apply patch
    patch_self_attn_lora(cfg)

    # Get the new forward method
    patched_forward = attention_cls.forward

    # Check the forward method was replaced
    assert original_forward is not patched_forward
    assert patched_forward.__name__ == "axolotl_attn_forward"

    # Check original implementation was stored
    assert hasattr(attention_cls, "_original_forward")

    # Clean up
    attention_cls.forward = original_forward
    delattr(attention_cls, "_original_forward")


def test_swiglu_mlp_integration(small_llama_model):
    """Test SwiGLU activation in LoRA MLP context."""
    peft_config = get_peft_config(
        {
            "peft_type": "LORA",
            "task_type": "CAUSAL_LM",
            "r": 8,
            "lora_alpha": 16,
            "target_modules": ["gate_proj", "up_proj", "down_proj"],
            "lora_dropout": 0,
            "bias": "none",
        }
    )
    model = PeftModelForCausalLM(small_llama_model, peft_config).to("cuda")
    cfg = DictDefault({"lora_mlp_kernel": True})

    # Apply patches
    patched_model = apply_lora_kernel_patches(model, cfg)

    # Verify patches
    layer = patched_model.model.model.layers[0]
    assert layer.mlp.forward.__func__ is apply_lora_mlp_swiglu

    # Test forward pass
    batch_size, seq_len = 2, 10
    hidden_states = torch.randn(
        batch_size, seq_len, model.config.hidden_size, device=model.device
    )
    position_ids = (
        torch.arange(seq_len, device=model.device).unsqueeze(0).expand(batch_size, -1)
    )
    cos, sin = model.model.model.rotary_emb(hidden_states, position_ids)

    inputs = {
        "hidden_states": hidden_states,
        "attention_mask": None,
        "position_embeddings": (cos, sin),
        "output_attentions": False,
        "use_cache": False,
        "past_key_value": None,
    }

    # Compare outputs
    with torch.no_grad():
        original_output = model.model.model.layers[0](**inputs)[0]
        patched_output = layer(**inputs)[0]

    assert torch.allclose(original_output, patched_output, rtol=1e-4)


def test_geglu_model_integration():
    """Test GeGLU activation with Gemma model."""
    model = AutoModelForCausalLM.from_pretrained(
        "trl-internal-testing/tiny-Gemma2ForCausalLM",
        dtype=torch.float16,
        device_map="cuda:0",
    )
    peft_config = get_peft_config(
        {
            "peft_type": "LORA",
            "task_type": "CAUSAL_LM",
            "r": 8,
            "lora_alpha": 16,
            "target_modules": ["gate_proj", "up_proj", "down_proj"],
            "lora_dropout": 0,
            "bias": "none",
        }
    )
    model = PeftModelForCausalLM(model, peft_config)

    cfg = DictDefault({"lora_mlp_kernel": True})
    patched_model = apply_lora_kernel_patches(model, cfg)

    # Verify patches
    layer = patched_model.model.model.layers[0]
    assert layer.mlp.forward.__func__ is apply_lora_mlp_geglu

    # Test end-to-end
    inputs = torch.randint(0, 100, (1, 20), device=model.device, dtype=torch.long)
    with torch.no_grad():
        original_output = model(inputs).logits
        patched_output = patched_model(inputs).logits

    assert torch.allclose(original_output, patched_output, rtol=1e-4)


@pytest.mark.parametrize(
    "model_name,expected_activation",
    [
        ("HuggingFaceTB/SmolLM2-135M", apply_lora_mlp_swiglu),
        ("mhenrichsen/gemma-2b", apply_lora_mlp_geglu),
    ],
)
def test_model_specific_activation(model_name, expected_activation):
    """Test that each model type gets the correct activation function."""
    model = AutoModelForCausalLM.from_pretrained(model_name)
    peft_config = get_peft_config(
        {
            "peft_type": "LORA",
            "task_type": "CAUSAL_LM",
            "r": 8,
            "lora_alpha": 16,
            "target_modules": ["gate_proj", "up_proj", "down_proj"],
            "lora_dropout": 0,
            "bias": "none",
        }
    )
    model = PeftModelForCausalLM(model, peft_config)
    cfg = DictDefault({"lora_mlp_kernel": True})

    patched_model = apply_lora_kernel_patches(model, cfg)
    layer = patched_model.model.model.layers[0]
    assert layer.mlp.forward.__func__ is expected_activation


def test_kernel_patch_conditions():
    """Test various conditions that should prevent kernel patching."""
    test_configs = [
        # Dropout prevents patching
        {
            "peft_type": "LORA",
            "task_type": "CAUSAL_LM",
            "r": 8,
            "lora_alpha": 16,
            "target_modules": ["gate_proj", "up_proj", "down_proj"],
            "lora_dropout": 0.1,
            "bias": "none",
        },
        # Bias prevents patching
        {
            "peft_type": "LORA",
            "task_type": "CAUSAL_LM",
            "r": 8,
            "lora_alpha": 16,
            "target_modules": ["gate_proj", "up_proj", "down_proj"],
            "lora_dropout": 0,
            "bias": "lora_only",
        },
    ]

    for config in test_configs:
        model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM2-135M")
        peft_config = get_peft_config(config)
        model = PeftModelForCausalLM(model, peft_config)
        cfg = DictDefault({"lora_mlp_kernel": True})

        # Should not patch
        patched_model = apply_lora_kernel_patches(model, cfg)
        layer = patched_model.model.model.layers[0].mlp

        # Verify no patches applied
        assert layer.forward.__func__ is not apply_lora_mlp_swiglu
        assert layer.forward.__func__ is not apply_lora_mlp_geglu


def test_kernel_config_options():
    """Test that kernel configuration options are respected."""
    # Test different configurations
    test_configs = [
        (
            {"lora_mlp_kernel": True, "lora_qkv_kernel": False, "lora_o_kernel": False},
            lambda layer: (
                layer.mlp.forward.__func__ is apply_lora_mlp_swiglu
                and layer.self_attn.apply_qkv.__func__ is not apply_lora_qkv
                and layer.self_attn.apply_o.__func__ is not apply_lora_o
            ),
        ),
        (
            {"lora_mlp_kernel": False, "lora_qkv_kernel": True, "lora_o_kernel": False},
            lambda layer: (
                layer.mlp.forward.__func__ is not apply_lora_mlp_swiglu
                and layer.self_attn.apply_qkv.__func__ is apply_lora_qkv
                and layer.self_attn.apply_o.__func__ is not apply_lora_o
            ),
        ),
        (
            {"lora_mlp_kernel": False, "lora_qkv_kernel": False, "lora_o_kernel": True},
            lambda layer: (
                layer.mlp.forward.__func__ is not apply_lora_mlp_swiglu
                and layer.self_attn.apply_qkv.__func__ is not apply_lora_qkv
                and layer.self_attn.apply_o.__func__ is apply_lora_o
            ),
        ),
    ]

    for config_dict, check_fn in test_configs:
        # Create fresh model for each test
        config = {
            "vocab_size": 100,
            "hidden_size": 128,
            "intermediate_size": 256,
            "num_hidden_layers": 2,
            "num_attention_heads": 4,
        }
        small_llama_model = LlamaForCausalLM(LlamaConfig(**config))

        peft_config = get_peft_config(
            {
                "peft_type": "LORA",
                "task_type": "CAUSAL_LM",
                "r": 8,
                "lora_alpha": 16,
                "target_modules": [
                    "gate_proj",
                    "up_proj",
                    "down_proj",
                    "q_proj",
                    "k_proj",
                    "v_proj",
                    "o_proj",
                ],
                "lora_dropout": 0,
                "bias": "none",
            }
        )
        model = PeftModelForCausalLM(small_llama_model, peft_config).to("cuda")
        cfg = DictDefault(config_dict)
        patched_model = apply_lora_kernel_patches(model, cfg)

        # Verify only requested optimizations were applied
        for layer in patched_model.model.model.layers:
            assert check_fn(layer), f"Failed for config: {config_dict}"

        # Clean up
        del model
        del small_llama_model
        del patched_model


def get_lora_config():
    """Get standard LoRA configuration for testing."""
    return {
        "peft_type": "LORA",
        "task_type": "CAUSAL_LM",
        "r": 8,
        "lora_alpha": 16,
        "target_modules": ["gate_proj", "up_proj", "down_proj"],
        "lora_dropout": 0,
        "bias": "none",
    }


def get_test_inputs(model, seq_length=20):
    """Generate test inputs for model evaluation."""
    return torch.randint(
        0,
        model.config.vocab_size,
        (1, seq_length),
        device=model.device,
        dtype=torch.long,
    )


@pytest.mark.parametrize("model_config", MODEL_CONFIGS)
def test_model_architecture(model_config):
    """Test LoRA kernel patches across different model architectures."""
    # Load model with appropriate dtype
    model = AutoModelForCausalLM.from_pretrained(
        model_config["name"], torch_dtype=model_config["dtype"], device_map="cuda:0"
    )

    # Apply LoRA configuration
    peft_config = get_peft_config(get_lora_config())
    model = PeftModelForCausalLM(model, peft_config)

    # Apply kernel patches
    cfg = DictDefault({"lora_mlp_kernel": True})
    patched_model = apply_lora_kernel_patches(model, cfg)

    # Verify correct activation function
    layer = patched_model.model.model.layers[0]
    assert layer.mlp.forward.__func__ is model_config["expected_activation"], (
        f"Wrong activation for {model_config['name']}"
    )

    # Test forward pass
    inputs = get_test_inputs(model)
    with torch.no_grad():
        original_output = model(inputs).logits
        patched_output = patched_model(inputs).logits

    # Check outputs match
    assert torch.allclose(original_output, patched_output, rtol=1e-4), (
        f"Outputs don't match for {model_config['name']}"
    )


def test_kernel_training_integration(temp_dir):
    """Test model loading with kernel patches enabled."""
    from axolotl.cli.utils import load_model_and_tokenizer

    # Create minimal config
    cfg = DictDefault(
        {
            "base_model": "HuggingFaceTB/SmolLM2-135M",
            "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
            "learning_rate": 0.000001,
            "datasets": [
                {
                    "path": "mhenrichsen/alpaca_2k_test",
                    "type": "alpaca",
                }
            ],
            "micro_batch_size": 1,
            "gradient_accumulation_steps": 1,
            "adapter": "lora",
            "lora_r": 8,
            "lora_alpha": 16,
            "lora_dropout": 0.0,
            "lora_target_linear": True,
            "sequence_len": 1024,
            "lora_mlp_kernel": True,
            "lora_qkv_kernel": True,
            "lora_o_kernel": True,
        }
    )

    # Write cfg to yaml file
    path = Path(temp_dir) / "config.yaml"
    with open(path, "w", encoding="utf-8") as fout:
        fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

    # Load config
    cfg = load_cfg(str(path))

    # Load model
    model, _, _ = load_model_and_tokenizer(cfg=cfg)

    # Verify correct activation function
    layer = model.model.model.layers[0]
    assert layer.mlp.forward.__func__ is apply_lora_mlp_swiglu


def test_kernel_training_integration_auto_enable(temp_dir):
    """Test model loading with auto-enabled kernel patches."""
    # Create minimal config without explicitly setting kernel options
    cfg = DictDefault(
        {
            "base_model": "HuggingFaceTB/SmolLM2-135M",
            "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
            "learning_rate": 0.000001,
            "datasets": [
                {
                    "path": "mhenrichsen/alpaca_2k_test",
                    "type": "alpaca",
                }
            ],
            "micro_batch_size": 1,
            "gradient_accumulation_steps": 1,
            "adapter": "lora",
            "lora_r": 8,
            "lora_alpha": 16,
            "lora_dropout": 0.0,
            "lora_target_linear": True,
            "sequence_len": 1024,
        }
    )

    # Write cfg to yaml file
    path = Path(temp_dir) / "config.yaml"
    with open(path, "w", encoding="utf-8") as fout:
        fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

    # Load config
    cfg = load_cfg(str(path))

    # Verify kernel options were auto-enabled in the config
    assert cfg.lora_mlp_kernel is True
    assert cfg.lora_qkv_kernel is True
    assert cfg.lora_o_kernel is True

    # Get the attention class before patching to check for side effects
    attention_cls = get_attention_cls_from_config(cfg)

    # Store original state before patching
    original_forward_method = attention_cls.forward

    # Load the model (this should trigger the patches)
    tokenizer = load_tokenizer(cfg)
    model, _ = ModelLoader(cfg, tokenizer).load()

    # Test side effects of patch_self_attn_lora
    assert hasattr(attention_cls, "_original_forward")
    assert attention_cls.forward != original_forward_method

    # Find at least one self-attention module and verify it has the patched methods
    found_patched_attn = False
    for layer in model.model.model.layers:
        if hasattr(layer, "self_attn"):
            self_attn = layer.self_attn
            if all(
                hasattr(self_attn, proj)
                for proj in ["q_proj", "k_proj", "v_proj", "o_proj"]
            ):
                # These methods should be added by apply_lora_kernel_patches
                assert hasattr(self_attn, "apply_qkv") and callable(self_attn.apply_qkv)
                assert hasattr(self_attn, "apply_o") and callable(self_attn.apply_o)

                found_patched_attn = True
                break

    assert found_patched_attn


def test_kernel_training_integration_dropout_non_zero(temp_dir):
    """Test model loading with dropout non-zero should not patch."""

    from axolotl.cli.utils import load_model_and_tokenizer

    # Create minimal config
    cfg = DictDefault(
        {
            "base_model": "HuggingFaceTB/SmolLM2-135M",
            "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
            "learning_rate": 0.000001,
            "datasets": [
                {
                    "path": "mhenrichsen/alpaca_2k_test",
                    "type": "alpaca",
                }
            ],
            "micro_batch_size": 1,
            "gradient_accumulation_steps": 1,
            "adapter": "lora",
            "lora_r": 8,
            "lora_alpha": 16,
            "lora_dropout": 0.1,
            "lora_target_linear": True,
            "sequence_len": 1024,
        }
    )

    # Write cfg to yaml file
    path = Path(temp_dir) / "config.yaml"
    with open(path, "w", encoding="utf-8") as fout:
        fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

    # Load config
    cfg = load_cfg(str(path))

    # Get original attention class
    attention_cls = get_attention_cls_from_config(cfg)

    # Store original state before patching
    original_forward_method = attention_cls.forward

    # Load model
    model, tokenizer, _ = load_model_and_tokenizer(cfg=cfg)

    # We call modelloader as that's where the patches are applied
    # despite the fact that we're not using it to load the model
    model_loader = ModelLoader(cfg, tokenizer)

    # Apply patch
    model_loader.patch_manager._apply_self_attention_lora_patch()

    # Verify patch was not applied
    assert attention_cls.forward == original_forward_method

    # Apply apply_lora_kernel_patches
    model_loader.patch_manager._apply_lora_kernel_patch(model)

    # Verify patch was not applied
    layers = get_layers(model)
    for layer in layers:
        for self_attn in find_self_attn_in_layer(layer):
            assert not hasattr(self_attn, "apply_qkv")
            assert not hasattr(self_attn, "apply_o")


================================================
FILE: tests/e2e/patched/test_4d_multipack_llama.py
================================================
"""
E2E tests for multipack fft llama using 4d attention masks
"""

import unittest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from ..utils import check_model_output_exists, with_temp_dir


class Test4dMultipackLlama(unittest.TestCase):
    """
    Test case for Llama models using 4d attention with multipack
    """

    @with_temp_dir
    def test_sdp_lora_packing(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "flash_attention": False,
                "sdp_attention": True,
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 32,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "sequence_len": 1024,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "save_steps": 3,
                "eval_steps": 4,
                "fp16": True,
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    def test_torch_lora_packing(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "flash_attention": False,
                "sdp_attention": False,
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 32,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "save_steps": 3,
                "eval_steps": 4,
                "fp16": True,
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/patched/test_activation_checkpointing.py
================================================
"""
E2E tests for activation checkpointing
"""

import pytest
import transformers
from torch.utils.checkpoint import checkpoint

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from ..utils import check_model_output_exists


@pytest.fixture()
def fix_checkpoint_after_test():
    yield
    transformers.modeling_utils.checkpoint = checkpoint


class TestActivationCheckpointing:
    """
    E2E tests for activation checkpointing
    """

    @pytest.mark.parametrize(
        "gradient_checkpointing",
        ["offload", "offload_disk"],
    )
    def test_activation_checkpointing_offload(
        self,
        temp_dir,
        fix_checkpoint_after_test,
        gradient_checkpointing,
    ):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "val_set_size": 0.0,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                    "eos_token": "<|im_end|>",
                },
                "datasets": [
                    {
                        "chat_template": "chatml",
                        "path": "mlabonne/FineTome-100k",
                        "type": "chat_template",
                        "split": "train[:10%]",
                        "field_messages": "conversations",
                        "message_field_role": "from",
                        "message_field_content": "value",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 5,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "sample_packing": True,
                "bf16": True,
                "gradient_checkpointing": gradient_checkpointing,
                "save_first_step": False,
                "dataset_num_proc": 4,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/patched/test_cli_integrations.py
================================================
"""
test cases to make sure the plugin args are loaded from the config file
"""

from pathlib import Path

import yaml

from axolotl.cli.config import load_cfg
from axolotl.utils.dict import DictDefault


class TestPluginArgs:
    """
    test class for plugin args loaded from the config file
    """

    def test_liger_plugin_args(self, temp_dir):
        test_cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "learning_rate": 0.000001,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "plugins": ["axolotl.integrations.liger.LigerPlugin"],
                "liger_layer_norm": True,
                "liger_rope": True,
                "liger_rms_norm": False,
                "liger_glu_activation": True,
                "liger_fused_linear_cross_entropy": True,
            }
        )

        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(test_cfg.to_dict()))
        cfg = load_cfg(str(Path(temp_dir) / "config.yaml"))
        assert cfg.liger_layer_norm is True
        assert cfg.liger_rope is True
        assert cfg.liger_rms_norm is False
        assert cfg.liger_glu_activation is True
        assert cfg.liger_fused_linear_cross_entropy is True


================================================
FILE: tests/e2e/patched/test_fa_xentropy.py
================================================
"""
E2E tests for lora llama
"""

import pytest
from transformers.utils import is_torch_bf16_gpu_available

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from ..utils import check_model_output_exists, check_tensorboard


class TestFAXentropyLlama:
    """
    Test case for Llama models using LoRA w multipack
    """

    @pytest.mark.parametrize(
        "gradient_accumulation_steps",
        [1, 4],
    )
    def test_lora_packing_fa_cross_entropy(self, temp_dir, gradient_accumulation_steps):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "sample_packing": True,
                "flash_attention": True,
                "flash_attn_cross_entropy": True,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.05,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "chat_template": "chatml",
                "datasets": [
                    {
                        "path": "mlabonne/FineTome-100k",
                        "field_messages": "conversations",
                        "message_field_content": "value",
                        "message_field_role": "from",
                        "type": "chat_template",
                        "split": "train[:2%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 5,
                "save_steps": 5,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": gradient_accumulation_steps,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
            cfg.bf16 = True
        else:
            cfg.fp16 = True

        cfg = validate_config(cfg)
        normalize_config(cfg)

        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
        )


================================================
FILE: tests/e2e/patched/test_falcon_samplepack.py
================================================
"""
E2E tests for falcon
"""

import unittest

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from ..utils import check_model_output_exists, with_temp_dir


class TestFalconPatched(unittest.TestCase):
    """
    Test case for Falcon models
    """

    @pytest.mark.skip(reason="no tiny models for testing with safetensors")
    @with_temp_dir
    def test_qlora(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "illuin/tiny-random-FalconForCausalLM",
                "flash_attention": True,
                "sample_packing": True,
                "sequence_len": 2048,
                "load_in_4bit": True,
                "adapter": "qlora",
                "lora_r": 16,
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "lora_target_linear": True,
                "lora_modules_to_save": ["word_embeddings", "lm_head"],
                "val_set_size": 0.05,
                "special_tokens": {
                    "bos_token": "<|endoftext|>",
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    @pytest.mark.skip(reason="no tiny models for testing with safetensors")
    @with_temp_dir
    def test_ft(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "illuin/tiny-random-FalconForCausalLM",
                "flash_attention": True,
                "sample_packing": True,
                "sequence_len": 2048,
                "val_set_size": 0.05,
                "special_tokens": {
                    "bos_token": "<|endoftext|>",
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/patched/test_flattening.py
================================================
"""
E2E tests for flattening batches
"""

import pytest
from transformers.utils import is_torch_bf16_gpu_available

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from ..utils import check_model_output_exists, check_tensorboard


class TestFAFlattening:
    """
    Test case for Llama models using LoRA w batch flattening
    """

    @pytest.mark.parametrize(
        "gradient_accumulation_steps",
        [1, 4],
    )
    def test_lora_packing_flattening(self, temp_dir, gradient_accumulation_steps):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "batch_flattening": True,
                "flash_attention": True,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.05,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "chat_template": "chatml",
                "datasets": [
                    {
                        "path": "mlabonne/FineTome-100k",
                        "field_messages": "conversations",
                        "message_field_content": "value",
                        "message_field_role": "from",
                        "type": "chat_template",
                        "split": "train[:2%]",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 5,
                "save_steps": 5,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": gradient_accumulation_steps,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
            cfg.bf16 = True
        else:
            cfg.fp16 = True

        cfg = validate_config(cfg)
        normalize_config(cfg)

        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 1.5, "Train Loss (%s) is too high"
        )


================================================
FILE: tests/e2e/patched/test_fsdp2_qlora.py
================================================
"""Integration tests for FSDP2 Params4bit patches."""

import pytest
from torch.distributed.fsdp._fully_shard._fsdp_param import FSDPParam


class TestFSDPPatchIntegration:
    """Test FSDP patch integration."""

    @pytest.mark.integration
    def test_fsdp2_init_patches(self):
        """Test that all patches can be applied together."""
        from axolotl.monkeypatch.fsdp2_qlora import (
            apply_init_sharded_param_patch,
            apply_init_unsharded_param_patch,
        )

        original_init_sharded = FSDPParam._init_sharded_param
        original_init_unsharded = FSDPParam.init_unsharded_param

        # Apply patches
        apply_init_sharded_param_patch()
        apply_init_unsharded_param_patch()

        assert FSDPParam._init_sharded_param != original_init_sharded, (
            "_init_sharded_param was not patched"
        )
        assert FSDPParam.init_unsharded_param != original_init_unsharded, (
            "init_unsharded_param was not patched"
        )


================================================
FILE: tests/e2e/patched/test_fused_llama.py
================================================
"""
E2E tests for lora llama
"""

import unittest

import pytest
from transformers.utils import is_torch_bf16_gpu_available

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from ..utils import check_model_output_exists, with_temp_dir


@pytest.mark.skip("FIXME, mostly underused functionality")
class TestFusedLlama(unittest.TestCase):
    """
    Test case for Llama models using Fused layers
    """

    @with_temp_dir
    def test_fft_packing(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "flash_attention": True,
                "pad_to_sequence_len": True,
                "flash_attn_fuse_mlp": True,
                "sample_packing": True,
                "sequence_len": 1024,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 10,
                "save_steps": 5,
                "eval_steps": 5,
                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
            cfg.bf16 = True
        else:
            cfg.fp16 = True
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/patched/test_llama_s2_attention.py
================================================
"""
E2E tests for llama w/ S2 attn
"""

import unittest

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from ..utils import check_model_output_exists, with_temp_dir


@pytest.mark.skip(reason="FIXME?")
class TestLlamaShiftedSparseAttention(unittest.TestCase):
    """
    Test case for Llama models using S2 Attn
    """

    @with_temp_dir
    def test_lora_s2_attn(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 16384,
                "sample_packing": False,
                "flash_attention": True,
                "s2_attention": True,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 32,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "Yukang/LongAlpaca-12k",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 10,
                "save_steps": 5,
                "eval_steps": 5,
                "bf16": "auto",
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    def test_fft_s2_attn(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 16384,
                "sample_packing": False,
                "flash_attention": True,
                "s2_attention": True,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "Yukang/LongAlpaca-12k",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 10,
                "save_steps": 5,
                "eval_steps": 5,
                "bf16": "auto",
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/patched/test_lora_llama_multipack.py
================================================
"""
E2E tests for lora llama
"""

import unittest

from transformers.utils import is_torch_bf16_gpu_available

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from ..utils import check_model_output_exists, with_temp_dir


class TestLoraLlama(unittest.TestCase):
    """
    Test case for Llama models using LoRA w multipack
    """

    @with_temp_dir
    def test_lora_packing(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "sample_packing": True,
                "flash_attention": True,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 32,
                "lora_alpha": 64,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.2,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "max_steps": 20,
                "save_steps": 10,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
            cfg.bf16 = True
        else:
            cfg.fp16 = True

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/patched/test_mistral_samplepack.py
================================================
"""
E2E tests for lora llama
"""

import unittest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from ..utils import check_model_output_exists, require_torch_2_6_0, with_temp_dir


class TestMistral(unittest.TestCase):
    """
    Test case for Llama models using LoRA
    """

    @require_torch_2_6_0
    @with_temp_dir
    def test_lora_packing(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
                "flash_attention": True,
                "sample_packing": True,
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 32,
                "lora_alpha": 64,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.05,
                "special_tokens": {
                    "unk_token": "<unk>",
                    "bos_token": "<s>",
                    "eos_token": "</s>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "save_steps": 3,
                "eval_steps": 4,
                "bf16": "auto",
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    def test_ft_packing(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
                "flash_attention": True,
                "sample_packing": True,
                "sequence_len": 1024,
                "val_set_size": 0.05,
                "special_tokens": {
                    "unk_token": "<unk>",
                    "bos_token": "<s>",
                    "eos_token": "</s>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "save_steps": 3,
                "eval_steps": 4,
                "bf16": "auto",
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/patched/test_mixtral_samplepack.py
================================================
"""
E2E tests for mixtral
"""

import unittest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from ..utils import check_model_output_exists, with_temp_dir


class TestMixtral(unittest.TestCase):
    """
    Test case for Llama models using LoRA
    """

    @with_temp_dir
    def test_qlora(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "hf-internal-testing/Mixtral-tiny",
                "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF",
                "flash_attention": True,
                "sample_packing": True,
                "sequence_len": 2048,
                "load_in_4bit": True,
                "adapter": "qlora",
                "lora_r": 16,
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "lora_target_linear": True,
                "val_set_size": 0.05,
                "special_tokens": {},
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "save_steps": 3,
                "eval_steps": 4,
                "bf16": "auto",
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    def test_ft(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "hf-internal-testing/Mixtral-tiny",
                "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF",
                "flash_attention": True,
                "sample_packing": True,
                "sequence_len": 2048,
                "val_set_size": 0.05,
                "special_tokens": {},
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "save_steps": 3,
                "eval_steps": 4,
                "bf16": "auto",
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/patched/test_model_patches.py
================================================
"""
E2E smoke tests to check that the monkeypatches are in place for certain configurations
"""

import unittest

import transformers

from axolotl.loaders import ModelLoader, load_tokenizer
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from ..utils import with_temp_dir


class TestModelPatches(unittest.TestCase):
    """
    TestCases for the multipack monkey patches
    """

    @with_temp_dir
    def test_mixtral_multipack(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "hf-internal-testing/Mixtral-tiny",
                "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF",
                "flash_attention": True,
                "sample_packing": True,
                "sequence_len": 2048,
                "val_set_size": 0.02,
                "special_tokens": {},
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        tokenizer = load_tokenizer(cfg)
        ModelLoader(cfg, tokenizer, inference=False).load()

    @with_temp_dir
    def test_mistral_multipack(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
                "flash_attention": True,
                "sample_packing": True,
                "sequence_len": 2048,
                "val_set_size": 0.02,
                "special_tokens": {},
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        tokenizer = load_tokenizer(cfg)
        ModelLoader(cfg, tokenizer, inference=False).load()

        assert (
            "torch.jit"
            in transformers.modeling_flash_attention_utils._get_unpad_data.__module__
        )


================================================
FILE: tests/e2e/patched/test_peft_embeddings.py
================================================
"""
Test case for handling embeddings when using peft
"""

import torch

from axolotl.train import setup_model_and_tokenizer
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault


class TestLlamaPeftEmbeddings:
    """
    test class for handling embeddings when using peft
    """

    def test_peft_embeddings_upcast(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "load_in_4bit": True,
                "adapter": "qlora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_target_linear": True,
                "trust_remote_code": True,
                "sequence_len": 512,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "sample_packing": False,
                "bf16": "auto",
                "embeddings_skip_upcast": True,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)

        model, _, _, _ = setup_model_and_tokenizer(cfg)

        # Check if the embeddings are upcast correctly
        # only embed_tokens is a parameter that may be upcast
        assert model.base_model.model.model.embed_tokens.weight.dtype == torch.bfloat16
        assert model.base_model.model.lm_head.weight.dtype == torch.bfloat16


================================================
FILE: tests/e2e/patched/test_phi_multipack.py
================================================
"""
E2E tests for lora llama
"""

import unittest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from ..utils import check_model_output_exists, with_temp_dir


class TestPhiMultipack(unittest.TestCase):
    """
    Test case for Phi2 models
    """

    @with_temp_dir
    def test_ft_packed(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "microsoft/phi-1_5",
                "model_type": "PhiForCausalLM",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "sample_packing": True,
                "flash_attention": True,
                "pad_to_sequence_len": True,
                "load_in_8bit": False,
                "adapter": None,
                "val_set_size": 0.05,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "dataset_shard_num": 10,
                "dataset_shard_idx": 0,
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "eval_steps": 3,
                "save_steps": 4,
                "bf16": "auto",
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    def test_qlora_packed(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "microsoft/phi-1_5",
                "model_type": "PhiForCausalLM",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "sample_packing": True,
                "flash_attention": True,
                "pad_to_sequence_len": True,
                "load_in_4bit": True,
                "adapter": "qlora",
                "lora_r": 64,
                "lora_alpha": 32,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "dataset_shard_num": 10,
                "dataset_shard_idx": 0,
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "eval_steps": 3,
                "save_steps": 4,
                "bf16": "auto",
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/patched/test_resume.py
================================================
"""
E2E tests for resuming training
"""

import os
import re
import subprocess

from transformers.utils import is_torch_bf16_gpu_available

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.callbacks.tokens_per_second import TOKENS_STATE_FILE
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from ..utils import check_model_output_exists, most_recent_subdir, require_torch_2_6_0


class TestResumeLlama:
    """
    Test case for resuming training of llama models
    """

    @require_torch_2_6_0
    def test_resume_lora_packed(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "sample_packing": True,
                "flash_attention": True,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.001,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "save_steps": 3,
                "save_total_limit": 5,
                "max_steps": 15,
                "use_tensorboard": True,
                "save_first_step": False,
                "include_tkps": True,
            }
        )
        if is_torch_bf16_gpu_available():
            cfg.bf16 = True
        else:
            cfg.fp16 = True
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        initial_total_num_tokens = cfg.total_num_tokens
        assert initial_total_num_tokens is not None, (
            "total_num_tokens should be calculated during load_datasets"
        )

        train(cfg=cfg, dataset_meta=dataset_meta)

        checkpoint_path = f"{temp_dir}/checkpoint-9"
        tokens_state_path = os.path.join(checkpoint_path, TOKENS_STATE_FILE)
        assert os.path.isfile(tokens_state_path), (
            f"{TOKENS_STATE_FILE} should exist in checkpoint at {tokens_state_path}"
        )

        resume_cfg = cfg | DictDefault(
            {
                "resume_from_checkpoint": f"{temp_dir}/checkpoint-9/",
            }
        )
        normalize_config(resume_cfg)

        assert resume_cfg.total_num_tokens == initial_total_num_tokens, (
            f"total_num_tokens should be preserved on resume. "
            f"Expected {initial_total_num_tokens}, got {resume_cfg.total_num_tokens}"
        )

        resume_dataset_meta = load_datasets(cfg=resume_cfg)

        assert resume_cfg.total_num_tokens == initial_total_num_tokens, (
            f"total_num_tokens should not be recalculated when resuming. "
            f"Expected {initial_total_num_tokens}, got {resume_cfg.total_num_tokens}"
        )

        train(cfg=resume_cfg, dataset_meta=resume_dataset_meta)

        assert resume_cfg.total_num_tokens == initial_total_num_tokens, (
            f"total_num_tokens should remain unchanged after resume training. "
            f"Expected {initial_total_num_tokens}, got {resume_cfg.total_num_tokens}"
        )
        check_model_output_exists(temp_dir, cfg)

        tb_log_path_1 = most_recent_subdir(temp_dir + "/runs")
        cmd = f"tensorboard --inspect  --logdir {tb_log_path_1}"
        res = subprocess.run(
            cmd, shell=True, text=True, capture_output=True, check=True
        )
        pattern = r"first_step\s+(\d+)"
        first_steps = int(re.findall(pattern, res.stdout)[0])
        assert first_steps == 10


================================================
FILE: tests/e2e/patched/test_unsloth_integration.py
================================================
"""Test module for checking whether the integration of Unsloth with Hugging Face Transformers is working as expected."""

import unittest

import pytest


@pytest.mark.skip(
    reason="Unsloth integration will be broken going into latest transformers"
)
class TestUnslothIntegration(unittest.TestCase):
    """Unsloth monkeypatch integration tests."""

    def test_is_self_attn_patchable(self):
        from axolotl.monkeypatch.unsloth_ import check_self_attn_is_patchable

        # ensures the current version of transformers has loss code that matches our patching code
        self.assertTrue(
            check_self_attn_is_patchable(),
            "HF transformers self attention code has changed and isn't patchable",
        )


================================================
FILE: tests/e2e/patched/test_unsloth_qlora.py
================================================
"""
e2e tests for unsloth qlora
"""

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from ..utils import check_model_output_exists, check_tensorboard


@pytest.mark.skip(
    reason="Unsloth integration will be broken going into latest transformers"
)
class TestUnslothQLoRA:
    """
    Test class for Unsloth QLoRA Llama models
    """

    @pytest.mark.parametrize(
        "sample_packing",
        [True, False],
    )
    def test_unsloth_llama_qlora_fa2(self, temp_dir, sample_packing):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "sample_packing": sample_packing,
                "flash_attention": True,
                "unsloth_lora_mlp": True,
                "unsloth_lora_qkv": True,
                "unsloth_lora_o": True,
                "load_in_4bit": True,
                "adapter": "qlora",
                "lora_r": 16,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.05,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 5,
                "save_steps": 10,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
                "bf16": "auto",
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
        )

    def test_unsloth_llama_qlora_unpacked(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "unsloth_lora_mlp": True,
                "unsloth_lora_qkv": True,
                "unsloth_lora_o": True,
                "sample_packing": False,
                "load_in_4bit": True,
                "adapter": "qlora",
                "lora_r": 16,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.05,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 5,
                "save_steps": 10,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
                "bf16": "auto",
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
        )

    @pytest.mark.parametrize(
        "sdp_attention",
        [True, False],
    )
    def test_unsloth_llama_qlora_unpacked_no_fa2_fp16(self, temp_dir, sdp_attention):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "unsloth_lora_mlp": True,
                "unsloth_lora_qkv": True,
                "unsloth_lora_o": True,
                "sample_packing": False,
                "load_in_4bit": True,
                "adapter": "qlora",
                "lora_r": 16,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.05,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 5,
                "save_steps": 10,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 2,
                "sdp_attention": sdp_attention,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
                "fp16": True,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
        )


================================================
FILE: tests/e2e/solo/__init__.py
================================================


================================================
FILE: tests/e2e/solo/test_flex.py
================================================
"""
E2E tests for packed training w/ flex attention
"""

import unittest

from transformers.utils import is_torch_bf16_gpu_available

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from ..utils import check_tensorboard, require_torch_2_6_0, with_temp_dir


class TestPackedFlex(unittest.TestCase):
    """
    Test case for Packed training of llama models
    """

    @require_torch_2_6_0
    @with_temp_dir
    def test_loss_llama(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "sample_packing": True,
                "flex_attention": True,
                "val_set_size": 0.0,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
            cfg.bf16 = True
        else:
            cfg.fp16 = True

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.1, "Train Loss (%s) is too high"
        )


================================================
FILE: tests/e2e/solo/test_relora_llama.py
================================================
"""
E2E tests for relora llama
"""

import unittest
from pathlib import Path

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from ..utils import check_model_output_exists, check_tensorboard, with_temp_dir


class TestReLoraLlama(unittest.TestCase):
    """
    Test case for Llama models using LoRA
    """

    @with_temp_dir
    def test_relora(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 2048,
                "sample_packing": True,
                "pad_to_sequence_len": True,
                "flash_attention": True,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_modules": ["q_proj", "v_proj"],
                "relora": True,
                "jagged_restart_steps": 50,
                "jagged_restart_warmup_steps": 10,
                "jagged_restart_anneal_steps": 10,
                "relora_prune_ratio": 0.9,
                "relora_cpu_offload": True,
                "val_set_size": 0.0,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "chat_template": "chatml",
                "datasets": [
                    {
                        "path": "mlabonne/FineTome-100k",
                        "type": "chat_template",
                        "split": "train[:10%]",
                        "field_messages": "conversations",
                        "message_field_role": "from",
                        "message_field_content": "value",
                    },
                ],
                "warmup_steps": 10,
                "num_epochs": 2,
                "max_steps": 105,  # at least 2x relora_steps
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(Path(temp_dir) / "checkpoint-100/adapter", cfg)
        assert (Path(temp_dir) / "checkpoint-100/relora/model.safetensors").exists(), (
            "Relora model checkpoint not found"
        )

        check_tensorboard(
            temp_dir + "/runs", "train/grad_norm", 0.2, "grad_norm is too high"
        )


================================================
FILE: tests/e2e/test_activation_offloading.py
================================================
"""
E2E tests for activation offloading
"""

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import check_model_output_exists


class TestActivationOffloading:
    """
    E2E test cases for activation offloading
    """

    @pytest.mark.parametrize(
        "adapter",
        ["lora", "qlora", None],
    )
    def test_activation_offloading(
        self,
        temp_dir,
        adapter,
    ):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "val_set_size": 0.0,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                    "eos_token": "<|im_end|>",
                },
                "datasets": [
                    {
                        "chat_template": "chatml",
                        "path": "mlabonne/FineTome-100k",
                        "type": "chat_template",
                        "split": "train[:10%]",
                        "field_messages": "conversations",
                        "message_field_role": "from",
                        "message_field_content": "value",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 2,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "sample_packing": True,
                "bf16": "auto",
                "gradient_checkpointing": True,
                "activation_offloading": True,
                "save_first_step": False,
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_target_linear": True,
            }
        )
        if adapter == "lora":
            cfg["adapter"] = "lora"
        if adapter == "qlora":
            cfg["adapter"] = "qlora"
            cfg["load_in_4bit"] = True

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/test_deepseekv3.py
================================================
"""
E2E tests for deepseekv3
"""

from pathlib import Path

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from tests.hf_offline_utils import enable_hf_offline


class TestDeepseekV3:
    """
    Test case for DeepseekV3 models
    """

    @enable_hf_offline
    @pytest.mark.parametrize(
        "sample_packing",
        [True, False],
    )
    def test_lora_deepseekv3(self, temp_dir, sample_packing):
        cfg = DictDefault(
            {
                "base_model": "axolotl-ai-co/DeepSeek-V3-11M",
                "trust_remote_code": True,
                "sample_packing": sample_packing,
                "flash_attention": True,
                "sequence_len": 2048,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0,
                "datasets": [
                    {
                        "path": "mlabonne/FineTome-100k",
                        "type": "chat_template",
                        "field_messages": "conversations",
                        "message_property_mappings": {
                            "role": "from",
                            "content": "value",
                        },
                        "drop_system_message": True,
                        "split": "train[:1%]",
                    },
                ],
                "special_tokens": {
                    "bos_token": "<｜begin▁of▁sentence｜>",
                    "eos_token": "<｜end▁of▁sentence｜>",
                },
                "chat_template": "deepseek_v3",
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "bf16": True,
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "adapter_model.safetensors").exists()

    @enable_hf_offline
    @pytest.mark.parametrize(
        "sample_packing",
        [True, False],
    )
    def test_fft_deepseekv3(self, temp_dir, sample_packing):
        cfg = DictDefault(
            {
                "base_model": "axolotl-ai-co/DeepSeek-V3-11M",
                "trust_remote_code": True,
                "sample_packing": sample_packing,
                "flash_attention": True,
                "sequence_len": 2048,
                "val_set_size": 0,
                "datasets": [
                    {
                        "path": "mlabonne/FineTome-100k",
                        "type": "chat_template",
                        "field_messages": "conversations",
                        "message_field_role": "from",
                        "message_field_content": "value",
                        "split": "train[:1%]",
                    },
                ],
                "chat_template": "deepseek_v3",
                "special_tokens": {
                    "bos_token": "<｜begin▁of▁sentence｜>",
                    "eos_token": "<｜end▁of▁sentence｜>",
                },
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "bf16": True,
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "model.safetensors").exists()


================================================
FILE: tests/e2e/test_diffusion.py
================================================
"""E2E smoke test for diffusion training plugin."""

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from tests.e2e.utils import check_model_output_exists


class TestDiffusion:
    """Test case for diffusion training plugin."""

    def test_diffusion_smoke_test(self, temp_dir):
        """
        Smoke test for diffusion training to ensure the plugin loads and trains without
        error.
        """
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "trust_remote_code": True,
                "sequence_len": 256,
                "val_set_size": 0.1,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 3,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.0001,
                "optimizer": "adamw_torch",
                "lr_scheduler": "cosine",
                "bf16": True,
                "save_first_step": False,
                "logging_steps": 1,
                "eval_steps": 3,
                # Diffusion-specific config
                "plugins": ["axolotl.integrations.diffusion.DiffusionPlugin"],
                "diffusion": {
                    # sample generation
                    "generate_samples": True,
                    "generation_interval": 1,
                    "num_generation_samples": 1,
                    "generation_steps": 2,
                    "generation_max_length": 32,
                    "generation_temperature": 0.0,
                    # training-specific
                    "mask_token_id": 16,
                    "eps": 1e-3,
                    "importance_weighting": False,
                },
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    def test_diffusion_sft_labels(self, temp_dir):
        """Test that diffusion training properly handles SFT data with labels."""
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "trust_remote_code": True,
                "sequence_len": 256,
                "val_set_size": 0.1,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 3,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.0001,
                "optimizer": "adamw_torch",
                "lr_scheduler": "cosine",
                "bf16": True,
                "save_first_step": False,
                "logging_steps": 1,
                "eval_steps": 2,
                # Diffusion-specific config
                "plugins": ["axolotl.integrations.diffusion.DiffusionPlugin"],
                "diffusion": {
                    # sample generation
                    "generate_samples": True,
                    "generation_interval": 1,
                    "num_generation_samples": 1,
                    "generation_steps": 2,
                    "generation_max_length": 32,
                    "generation_temperature": 0.0,
                    # training-specific
                    "mask_token_id": 16,
                    "eps": 1e-3,
                    "importance_weighting": True,
                },
                # Ensure we have proper SFT labels
                "train_on_inputs": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        # Verify that the dataset has labels
        sample = dataset_meta.train_dataset[0]
        assert "labels" in sample, "SFT dataset should have labels"

        # Check that some labels are -100 (prompt tokens)
        labels = sample["labels"]
        if hasattr(labels, "tolist"):
            labels = labels.tolist()
        assert -100 in labels, "SFT dataset should have -100 labels for prompt tokens"

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/test_dpo.py
================================================
"""E2E tests for lora llama"""

import unittest
from pathlib import Path

import pytest

from axolotl.cli.args import TrainerCliArgs
from axolotl.common.datasets import load_preference_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import check_model_output_exists, with_temp_dir


class TestDPOLlamaLora(unittest.TestCase):
    """
    Test case for DPO Llama models using LoRA
    """

    @with_temp_dir
    def test_dpo_lora(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 64,
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "lora_target_linear": True,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "rl": "dpo",
                "datasets": [
                    {
                        "path": "arcee-ai/distilabel-intel-orca-dpo-pairs-binarized",
                        "type": "chatml.ultra",
                        "split": "train",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "paged_adamw_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(Path(temp_dir) / "checkpoint-20", cfg)

    @with_temp_dir
    def test_dpo_nll_lora(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 64,
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "lora_target_linear": True,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "rl": "dpo",
                "rpo_alpha": 0.5,
                "datasets": [
                    {
                        "path": "arcee-ai/distilabel-intel-orca-dpo-pairs-binarized",
                        "type": "chatml.ultra",
                        "split": "train",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "paged_adamw_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(Path(temp_dir) / "checkpoint-20", cfg)

    @with_temp_dir
    def test_dpo_use_weighting(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 64,
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "lora_target_linear": True,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "rl": "dpo",
                "dpo_use_weighting": True,
                "datasets": [
                    {
                        "path": "arcee-ai/distilabel-intel-orca-dpo-pairs-binarized",
                        "type": "chatml.ultra",
                        "split": "train",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "paged_adamw_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(Path(temp_dir) / "checkpoint-20", cfg)

    @pytest.mark.skip("kto_pair no longer supported in trl")
    @with_temp_dir
    def test_kto_pair_lora(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 64,
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "lora_target_linear": True,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "rl": "kto_pair",
                "datasets": [
                    {
                        "path": "arcee-ai/distilabel-intel-orca-dpo-pairs-binarized",
                        "type": "chatml.ultra",
                        "split": "train",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "paged_adamw_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(Path(temp_dir) / "checkpoint-20", cfg)

    @with_temp_dir
    def test_ipo_lora(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 64,
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "lora_target_linear": True,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "rl": "ipo",
                "datasets": [
                    {
                        "path": "arcee-ai/distilabel-intel-orca-dpo-pairs-binarized",
                        "type": "chatml.ultra",
                        "split": "train",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "paged_adamw_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(Path(temp_dir) / "checkpoint-20", cfg)

    @with_temp_dir
    def test_orpo_lora(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 64,
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "lora_target_linear": True,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "rl": "orpo",
                "orpo_alpha": 0.1,
                "remove_unused_columns": False,
                "chat_template": "chatml",
                "datasets": [
                    {
                        "path": "argilla/distilabel-capybara-dpo-7k-binarized",
                        "type": "chat_template.argilla",
                        "split": "train",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "paged_adamw_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(Path(temp_dir) / "checkpoint-20", cfg)

    @pytest.mark.skip(reason="Fix the implementation")
    @with_temp_dir
    def test_kto_lora(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 64,
                "lora_alpha": 32,
                "lora_dropout": 0.1,
                "lora_target_linear": True,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "rl": "kto",
                "rl_beta": 0.5,
                "kto_desirable_weight": 1.0,
                "kto_undesirable_weight": 1.0,
                "remove_unused_columns": False,
                "datasets": [
                    # {
                    #     "path": "argilla/kto-mix-15k",
                    #     "type": "chatml.argilla_chat",
                    #     "split": "train",
                    # },
                    {
                        "path": "argilla/ultrafeedback-binarized-preferences-cleaned-kto",
                        "type": "chatml.ultra",
                        "split": "train",
                    },
                    # {
                    #     "path": "argilla/kto-mix-15k",
                    #     "type": "llama3.argilla_chat",
                    #     "split": "train",
                    # },
                    {
                        "path": "argilla/ultrafeedback-binarized-preferences-cleaned-kto",
                        "type": "llama3.ultra",
                        "split": "train",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "paged_adamw_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "warmup_steps": 5,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": True},
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_preference_datasets(cfg=cfg, cli_args=cli_args)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(Path(temp_dir) / "checkpoint-20", cfg)


================================================
FILE: tests/e2e/test_embeddings_lr.py
================================================
"""
E2E tests for llama pretrain
"""

import unittest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import check_model_output_exists, check_tensorboard, with_temp_dir


class TestEmbeddingsLrScale(unittest.TestCase):
    """
    Test case for embedding_lr*
    """

    @with_temp_dir
    def test_train_w_embedding_lr_scale(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "flash_attention": True,
                "sequence_len": 1024,
                "sample_packing": True,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "max_steps": 5,
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "val_set_size": 0.0,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "embedding_lr_scale": 0.5,
                "lr_scheduler": "cosine",
                "bf16": "auto",
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
        )

    @with_temp_dir
    def test_train_w_embedding_lr(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "flash_attention": True,
                "sequence_len": 1024,
                "sample_packing": True,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "max_steps": 5,
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "val_set_size": 0.0,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "embedding_lr": 0.000005,
                "lr_scheduler": "cosine",
                "bf16": "auto",
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.0, "Loss is too high"
        )


================================================
FILE: tests/e2e/test_evaluate.py
================================================
"""E2E smoke test for evaluate CLI command"""

from pathlib import Path

import yaml
from accelerate.test_utils import execute_subprocess_async
from transformers.testing_utils import get_torch_dist_unique_port

from axolotl.utils.dict import DictDefault


class TestE2eEvaluate:
    """Test cases for evaluate CLI"""

    def test_evaluate(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "accelerate",
                "launch",
                "--num-processes",
                "2",
                "--main_process_port",
                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.evaluate",
                str(Path(temp_dir) / "config.yaml"),
            ]
        )


================================================
FILE: tests/e2e/test_falcon.py
================================================
"""
E2E tests for falcon
"""

import unittest

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import check_model_output_exists, with_temp_dir


class TestFalcon(unittest.TestCase):
    """
    Test case for falcon
    """

    @pytest.mark.skip(reason="no tiny models for testing with safetensors")
    @with_temp_dir
    def test_lora(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "illuin/tiny-random-FalconForCausalLM",
                "flash_attention": True,
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 32,
                "lora_alpha": 64,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "lora_modules_to_save": [
                    "word_embeddings",
                    "lm_head",
                ],
                "val_set_size": 0.02,
                "special_tokens": {
                    "bos_token": "<|endoftext|>",
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    @pytest.mark.skip(reason="no tiny models for testing with safetensors")
    @with_temp_dir
    def test_lora_added_vocab(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "illuin/tiny-random-FalconForCausalLM",
                "flash_attention": True,
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 32,
                "lora_alpha": 64,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "lora_modules_to_save": [
                    "word_embeddings",
                    "lm_head",
                ],
                "val_set_size": 0.02,
                "special_tokens": {
                    "bos_token": "<|endoftext|>",
                    "pad_token": "<|endoftext|>",
                },
                "tokens": [
                    "<|im_start|>",
                    "<|im_end|>",
                ],
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    @pytest.mark.skip(reason="no tiny models for testing with safetensors")
    @with_temp_dir
    def test_ft(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "illuin/tiny-random-FalconForCausalLM",
                "flash_attention": True,
                "sequence_len": 1024,
                "val_set_size": 0.02,
                "special_tokens": {
                    "bos_token": "<|endoftext|>",
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/test_gemma2.py
================================================
"""
E2E tests for gemma2
"""

from pathlib import Path

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault


class TestGemma2:
    """
    Test case for Gemma2 models
    """

    @pytest.mark.parametrize(
        "sample_packing",
        [True, False],
    )
    def test_lora_gemma2(self, temp_dir, sample_packing):
        cfg = DictDefault(
            {
                "base_model": "axolotl-ai-co/gemma-2-33M",
                "trust_remote_code": True,
                "sample_packing": sample_packing,
                "flash_attention": True,
                "sequence_len": 2048,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0,
                "datasets": [
                    {
                        "path": "mlabonne/FineTome-100k",
                        "type": "chat_template",
                        "field_messages": "conversations",
                        "message_property_mappings": {
                            "role": "from",
                            "content": "value",
                        },
                        "drop_system_message": True,
                        "split": "train[:1%]",
                    },
                ],
                "special_tokens": {
                    "bos_token": "<bos>",
                    "eos_token": "<eos>",
                },
                "chat_template": "gemma",  # gemma2's template is same as gemma
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "bf16": True,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "adapter_model.safetensors").exists()

    @pytest.mark.parametrize(
        "sample_packing",
        [True, False],
    )
    def test_fft_gemma2(self, temp_dir, sample_packing):
        cfg = DictDefault(
            {
                "base_model": "axolotl-ai-co/gemma-2-33M",
                "trust_remote_code": True,
                "sample_packing": sample_packing,
                "flash_attention": True,
                "sequence_len": 2048,
                "val_set_size": 0,
                "datasets": [
                    {
                        "path": "mlabonne/FineTome-100k",
                        "type": "chat_template",
                        "field_messages": "conversations",
                        "message_property_mappings": {
                            "role": "from",
                            "content": "value",
                        },
                        "split": "train[:1%]",
                        "drop_system_message": True,
                    },
                ],
                "chat_template": "gemma",  # gemma2's template is same as gemma
                "special_tokens": {
                    "bos_token": "<bos>",
                    "eos_token": "<eos>",
                },
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "bf16": True,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "model.safetensors").exists()


================================================
FILE: tests/e2e/test_gemma3_text.py
================================================
"""
E2E tests for gemma3_text
"""

from pathlib import Path

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault


class TestGemma3Text:
    """
    Test case for Gemma3Text models
    """

    @pytest.mark.parametrize(
        "sample_packing",
        [True, False],
    )
    def test_lora_gemma3_text(self, temp_dir, sample_packing):
        cfg = DictDefault(
            {
                "base_model": "axolotl-ai-co/gemma-3-34M",
                "trust_remote_code": True,
                "sample_packing": sample_packing,
                "flash_attention": True,
                "sequence_len": 2048,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0,
                "datasets": [
                    {
                        "path": "mlabonne/FineTome-100k",
                        "type": "chat_template",
                        "field_messages": "conversations",
                        "message_property_mappings": {
                            "role": "from",
                            "content": "value",
                        },
                        "split": "train[:1%]",
                    },
                ],
                "special_tokens": {
                    "bos_token": "<bos>",
                    "eos_token": "<eos>",
                },
                "chat_template": "gemma3",
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "bf16": True,
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "adapter_model.safetensors").exists()

    @pytest.mark.parametrize(
        "sample_packing",
        [True, False],
    )
    def test_fft_gemma3_text(self, temp_dir, sample_packing):
        cfg = DictDefault(
            {
                "base_model": "axolotl-ai-co/gemma-3-34M",
                "trust_remote_code": True,
                "sample_packing": sample_packing,
                "flash_attention": True,
                "sequence_len": 2048,
                "val_set_size": 0,
                "datasets": [
                    {
                        "path": "mlabonne/FineTome-100k",
                        "type": "chat_template",
                        "field_messages": "conversations",
                        "message_property_mappings": {
                            "role": "from",
                            "content": "value",
                        },
                        "split": "train[:1%]",
                    },
                ],
                "chat_template": "gemma3",
                "special_tokens": {
                    "bos_token": "<bos>",
                    "eos_token": "<eos>",
                },
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "bf16": True,
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "model.safetensors").exists()


================================================
FILE: tests/e2e/test_imports.py
================================================
"""
test module to import various submodules that have historically broken due to dependency issues
"""

import unittest


class TestImports(unittest.TestCase):
    """
    Test class to import various submodules that have historically broken due to dependency issues
    """

    def test_import_causal_trainer(self):
        pass

    def test_import_rl_trainer(self):
        pass


================================================
FILE: tests/e2e/test_llama.py
================================================
"""
E2E tests for llama
"""

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from tests.e2e.utils import check_model_output_exists


class TestLlama:
    """
    Test case for Llama models
    """

    def test_fft_trust_remote_code(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "trust_remote_code": True,
                "sequence_len": 512,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 5,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "sample_packing": True,
                "bf16": True,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    def test_fix_untrained_tokens(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "fix_untrained_tokens": True,
                "sequence_len": 512,
                "val_set_size": 0.0,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                    "bos_token": "<|custom_im_start|>",
                    "eos_token": "<|custom_im_end|>",
                },
                "datasets": [
                    {
                        "chat_template": "jinja",
                        "chat_template_jinja": "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|custom_im_start|>' + message['role'] + '\n' + message['content'] + '<|custom_im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|custom_im_start|>assistant\n' }}{% endif %}",
                        "path": "mlabonne/FineTome-100k",
                        "type": "chat_template",
                        "split": "train[:10%]",
                        "field_messages": "conversations",
                        "message_field_role": "from",
                        "message_field_content": "value",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 5,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "sample_packing": True,
                "bf16": True,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    def test_fix_untrained_tokens_already_trained(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "fix_untrained_tokens": True,
                "sequence_len": 512,
                "val_set_size": 0.0,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "chat_template": "chatml",
                "datasets": [
                    {
                        "path": "mlabonne/FineTome-100k",
                        "type": "chat_template",
                        "split": "train[:10%]",
                        "field_messages": "conversations",
                        "message_field_role": "from",
                        "message_field_content": "value",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 5,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "sample_packing": True,
                "bf16": True,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    @pytest.mark.parametrize("tf32", ["auto", False])
    def test_batch_flattening(self, tf32, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "trust_remote_code": True,
                "sequence_len": 512,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 5,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "sample_packing": False,
                "batch_flattening": True,
                "bf16": True,
                "tf32": tf32,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/test_llama_pretrain.py
================================================
"""E2E tests for llama pretrain"""

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import check_model_output_exists, check_tensorboard


class TestPretrainLlama:
    """Test case for Llama models w pretraining"""

    @pytest.mark.parametrize(
        ("sample_packing", "pretrain_multipack_attn"),
        [
            (False, False),
            (True, True),
            (True, False),
        ],
    )
    def test_pretrain(self, temp_dir, sample_packing, pretrain_multipack_attn):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "flash_attention": True,
                "sequence_len": 1024,
                "sample_packing": sample_packing,
                "pretrain_multipack_attn": pretrain_multipack_attn,
                "dataset_num_proc": 1,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "pretraining_dataset": [
                    {
                        "path": "allenai/c4",
                        "name": "en",
                        "type": "pretrain",
                    }
                ],
                "max_steps": 5,
                "num_epochs": 1,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "val_set_size": 0.0,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "bf16": "auto",
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)
        loss_threshold = 3.6
        if sample_packing and not pretrain_multipack_attn:
            loss_threshold = 6.5
        check_tensorboard(
            temp_dir + "/runs",
            "train/train_loss",
            loss_threshold,
            "Train Loss (%s) is too high",
        )


================================================
FILE: tests/e2e/test_llama_vision.py
================================================
"""
E2E tests for lora llama
"""

import unittest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import check_model_output_exists, with_temp_dir


class TestLlamaVision(unittest.TestCase):
    """
    Test case for Llama Vision models
    """

    @with_temp_dir
    def test_lora_llama_vision_text_only_dataset(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "axolotl-ai-co/Llama-3.2-39M-Vision",
                "processor_type": "AutoProcessor",
                "skip_prepare_dataset": True,
                "remove_unused_columns": False,
                "sample_packing": False,
                "sequence_len": 1024,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_modules": r"model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj",
                "val_set_size": 0,
                "chat_template": "llama3_2_vision",
                "datasets": [
                    {
                        "path": "LDJnr/Puffin",
                        "type": "chat_template",
                        "field_messages": "conversations",
                        "message_field_role": "from",
                        "message_field_content": "value",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "bf16": True,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    def test_lora_llama_vision_multimodal_dataset(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "axolotl-ai-co/Llama-3.2-39M-Vision",
                "processor_type": "AutoProcessor",
                "skip_prepare_dataset": True,
                "remove_unused_columns": False,
                "sample_packing": False,
                "sequence_len": 1024,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_modules": r"model.language_model.layers.[\d]+.(mlp|cross_attn|self_attn).(up|down|gate|q|k|v|o)_proj",
                "val_set_size": 0,
                "chat_template": "llama3_2_vision",
                "datasets": [
                    {
                        "path": "axolotl-ai-co/llava-instruct-mix-vsft-small",
                        "type": "chat_template",
                        "split": "train",
                        "field_messages": "messages",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "bf16": True,
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/test_load_model.py
================================================
"""Module for testing ModelLoader."""

import shutil
import tempfile

import pytest
import torch

from axolotl.loaders import ModelLoader, load_tokenizer
from axolotl.utils.dict import DictDefault


@pytest.fixture(name="temp_dir")
def fixture_temp_dir():
    temp_dir = tempfile.mkdtemp()
    yield temp_dir
    shutil.rmtree(temp_dir)


class TestLoadModelUtils:
    """
    Testing module testing ModelLoader.
    """

    def setup_method(self):
        # load config
        self.cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "load_in_8bit": False,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "tensor_parallel_size": 1,
                "context_parallel_size": 1,
            }
        )
        self.model_loader = ModelLoader(
            cfg=self.cfg,
            tokenizer="",
            inference=False,
            reference_model=True,
        )

    @pytest.mark.parametrize("embedding_modules", ["embed_tokens", "lm_head"])
    @pytest.mark.parametrize(
        "dist_dtype", [torch.bfloat16, torch.float16, torch.float32]
    )
    @pytest.mark.parametrize("before_kbit_train_or_finetune", [True, False])
    def test_convert_embedding_modules_dtype(
        self, temp_dir, embedding_modules, dist_dtype, before_kbit_train_or_finetune
    ):
        self.cfg.output_dir = temp_dir
        self.model_loader.tokenizer = load_tokenizer(self.cfg)
        self.model_loader.load()
        self.model_loader._convert_embedding_modules_dtype(
            embedding_modules, dist_dtype, before_kbit_train_or_finetune
        )
        for name, module in self.model_loader.model.named_modules():
            if (
                "norm" in name
                or (before_kbit_train_or_finetune and name.endswith(".gate"))
                or (
                    any(m in name for m in embedding_modules)
                    and hasattr(module, "weight")
                )
            ):
                for _, param in module.named_parameters():
                    assert param.dtype == dist_dtype


================================================
FILE: tests/e2e/test_lora_llama.py
================================================
"""
E2E tests for lora llama
"""

import unittest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import check_model_output_exists, with_temp_dir


class TestLoraLlama(unittest.TestCase):
    """
    Test case for Llama models using LoRA
    """

    @with_temp_dir
    def test_lora(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/test_mamba.py
================================================
"""
E2E tests for lora llama
"""

import unittest

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import check_model_output_exists, with_temp_dir


@pytest.mark.skip(reason="skipping until upstreamed into transformers")
class TestMamba(unittest.TestCase):
    """
    Test case for Mamba models
    """

    @with_temp_dir
    def test_fft(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "state-spaces/mamba-130m",
                "model_type": "MambaLMHeadModel",
                "tokenizer_type": "AutoTokenizer",
                "tokenizer_config": "EleutherAI/gpt-neox-20b",
                "flash_attention": False,
                "sequence_len": 1024,
                "load_in_8bit": False,
                "val_set_size": 0.0,
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "gradient_checkpointing": False,
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": None,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/test_mistral.py
================================================
"""
E2E tests for lora llama
"""

import unittest

from transformers.utils import is_torch_bf16_gpu_available

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import check_model_output_exists, with_temp_dir


class TestMistral(unittest.TestCase):
    """
    Test case for Llama models using LoRA
    """

    @with_temp_dir
    def test_lora(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
                "flash_attention": True,
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 32,
                "lora_alpha": 64,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.02,
                "special_tokens": {
                    "unk_token": "<unk>",
                    "bos_token": "<s>",
                    "eos_token": "</s>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    def test_ft(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "trl-internal-testing/tiny-MistralForCausalLM-0.2",
                "flash_attention": True,
                "sequence_len": 1024,
                "val_set_size": 0.02,
                "special_tokens": {
                    "unk_token": "<unk>",
                    "bos_token": "<s>",
                    "eos_token": "</s>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
            cfg.bf16 = True
        else:
            cfg.fp16 = True

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/test_mixtral.py
================================================
"""
E2E tests for mixtral
"""

import unittest

import torch
from transformers.utils import is_torch_bf16_gpu_available

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import check_model_output_exists, with_temp_dir


class TestMixtral(unittest.TestCase):
    """
    Test case for Llama models using LoRA
    """

    @with_temp_dir
    def test_qlora_w_fa2(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "hf-internal-testing/Mixtral-tiny",
                "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF",
                "flash_attention": True,
                "sequence_len": 1024,
                "load_in_4bit": True,
                "adapter": "qlora",
                "lora_r": 4,
                "lora_alpha": 8,
                "lora_dropout": 0.1,
                "lora_target_modules": [
                    "o_proj",
                    "w3",
                    "k_proj",
                    "v_proj",
                    "w1",
                    "q_proj",
                    "w2",
                ],
                "val_set_size": 0.02,
                "special_tokens": {},
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        model, _, _ = train(cfg=cfg, dataset_meta=dataset_meta)
        assert (
            model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype
            == torch.float32
        )
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    def test_qlora_wo_fa2(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "hf-internal-testing/Mixtral-tiny",
                "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF",
                "flash_attention": False,
                "sequence_len": 1024,
                "load_in_4bit": True,
                "adapter": "qlora",
                "lora_r": 4,
                "lora_alpha": 8,
                "lora_dropout": 0.1,
                "lora_target_modules": [
                    "o_proj",
                    "w3",
                    "k_proj",
                    "v_proj",
                    "w1",
                    "q_proj",
                    "w2",
                ],
                "val_set_size": 0.02,
                "special_tokens": {},
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        model, _, _ = train(cfg=cfg, dataset_meta=dataset_meta)
        assert (
            model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype
            == torch.float32
        )
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    def test_16bit_lora_w_fa2(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "hf-internal-testing/Mixtral-tiny",
                "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF",
                "flash_attention": True,
                "sequence_len": 1024,
                "adapter": "lora",
                "lora_r": 4,
                "lora_alpha": 8,
                "lora_dropout": 0.1,
                "lora_target_modules": [
                    "o_proj",
                    "w3",
                    "k_proj",
                    "v_proj",
                    "w1",
                    "q_proj",
                    "w2",
                ],
                "val_set_size": 0.02,
                "special_tokens": {},
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
            cfg.bf16 = True
        else:
            cfg.fp16 = True

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        model, _, _ = train(cfg=cfg, dataset_meta=dataset_meta)
        assert (
            model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype
            == torch.float32
        )
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    def test_16bit_lora_wo_fa2(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "hf-internal-testing/Mixtral-tiny",
                "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF",
                "flash_attention": False,
                "sequence_len": 1024,
                "adapter": "lora",
                "lora_r": 4,
                "lora_alpha": 8,
                "lora_dropout": 0.1,
                "lora_target_modules": [
                    "o_proj",
                    "w3",
                    "k_proj",
                    "v_proj",
                    "w1",
                    "q_proj",
                    "w2",
                ],
                "val_set_size": 0.02,
                "special_tokens": {},
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        if is_torch_bf16_gpu_available():
            cfg.bf16 = True
        else:
            cfg.fp16 = True
        dataset_meta = load_datasets(cfg=cfg)

        model, _, _ = train(cfg=cfg, dataset_meta=dataset_meta)
        assert (
            model.base_model.model.model.layers[0].block_sparse_moe.gate.weight.dtype
            == torch.float32
        )
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    def test_ft(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "hf-internal-testing/Mixtral-tiny",
                "tokenizer_config": "LoneStriker/Mixtral-8x7B-v0.1-HF",
                "flash_attention": True,
                "sequence_len": 1024,
                "val_set_size": 0.02,
                "special_tokens": {},
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 2,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 20,
                "save_steps": 10,
                "eval_steps": 10,
                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
            cfg.bf16 = True
        else:
            cfg.fp16 = True

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/test_optimizers.py
================================================
"""
E2E tests for custom optimizers using Llama
"""

import unittest

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import (
    check_model_output_exists,
    require_torch_2_5_1,
    require_torch_2_6_0,
    require_torch_2_7_0,
    with_temp_dir,
)


class TestCustomOptimizers(unittest.TestCase):
    """
    Test case for Llama models using LoRA
    """

    @with_temp_dir
    def test_optimi_adamw(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "model_type": "AutoModelForCausalLM",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "optimi_adamw",
                "max_steps": 5,
                "lr_scheduler": "cosine",
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        _, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)
        assert trainer.optimizer.optimizer.__class__.__name__ == "AdamW"

    @with_temp_dir
    @require_torch_2_5_1
    def test_adopt_adamw(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "model_type": "AutoModelForCausalLM",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 5,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adopt_adamw",
                "lr_scheduler": "cosine",
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        _, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)
        assert "ADOPT" in trainer.optimizer.optimizer.__class__.__name__

    @with_temp_dir
    @require_torch_2_5_1
    def test_muon(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "model_type": "AutoModelForCausalLM",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 5,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "muon",
                "lr_scheduler": "cosine",
                "weight_decay": 0.01,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        _, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)
        assert "Muon" in trainer.optimizer.optimizer.__class__.__name__

    @with_temp_dir
    @require_torch_2_7_0
    def test_dion(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "model_type": "AutoModelForCausalLM",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "val_set_size": 0.0,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 5,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "dion",
                "dion_lr": 0.01,
                "dion_momentum": 0.95,
                "lr_scheduler": "cosine",
                "weight_decay": 0.01,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        _, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)
        assert "Dion" in trainer.optimizer.optimizer.__class__.__name__

    @with_temp_dir
    def test_fft_schedule_free_adamw(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "model_type": "AutoModelForCausalLM",
                "sequence_len": 1024,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "schedule_free_adamw",
                "lr_scheduler": "constant",
                "max_steps": 10,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    @require_torch_2_6_0
    def test_came_pytorch(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "JackFram/llama-68m",
                "tokenizer_type": "LlamaTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.1,
                "special_tokens": {
                    "unk_token": "<unk>",
                    "bos_token": "<s>",
                    "eos_token": "</s>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "came_pytorch",
                "adam_beta3": 0.9999,
                "adam_epsilon2": 1e-16,
                "max_steps": 5,
                "lr_scheduler": "cosine",
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


@require_torch_2_7_0
@pytest.mark.parametrize(
    "optimizer_name,expected_class,learning_rate",
    [
        ("flash_adamw", "FlashAdamW", 0.00001),
        ("flash_adam", "FlashAdam", 0.00001),
        ("flash_sgd", "FlashSGD", 0.01),
        ("flash_sgdw", "FlashSGDW", 0.01),
        ("flash_lion", "FlashLion", 0.0001),
    ],
)
def test_flash_optimizers(tmp_path, optimizer_name, expected_class, learning_rate):
    temp_dir = str(tmp_path)
    cfg = DictDefault(
        {
            "base_model": "HuggingFaceTB/SmolLM2-135M",
            "model_type": "AutoModelForCausalLM",
            "tokenizer_type": "AutoTokenizer",
            "sequence_len": 1024,
            "load_in_8bit": True,
            "adapter": "lora",
            "lora_r": 8,
            "lora_alpha": 16,
            "lora_dropout": 0.05,
            "lora_target_linear": True,
            "val_set_size": 0.02,
            "special_tokens": {
                "pad_token": "<|endoftext|>",
            },
            "datasets": [
                {
                    "path": "mhenrichsen/alpaca_2k_test",
                    "type": "alpaca",
                },
            ],
            "num_epochs": 1,
            "micro_batch_size": 8,
            "gradient_accumulation_steps": 1,
            "output_dir": temp_dir,
            "learning_rate": learning_rate,
            "optimizer": optimizer_name,
            "max_steps": 5,
            "lr_scheduler": "cosine",
            "save_first_step": False,
        }
    )

    cfg = validate_config(cfg)
    normalize_config(cfg)
    dataset_meta = load_datasets(cfg=cfg)

    _, _, trainer = train(cfg=cfg, dataset_meta=dataset_meta)
    check_model_output_exists(temp_dir, cfg)
    assert trainer.optimizer.optimizer.__class__.__name__ == expected_class


================================================
FILE: tests/e2e/test_packing_loss.py
================================================
"""
E2E tests for packed training
"""

import unittest

from transformers.utils import is_torch_bf16_gpu_available

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import check_tensorboard, with_temp_dir


class TestPackedLlama(unittest.TestCase):
    """
    Test case for Packed training of llama models
    """

    @with_temp_dir
    def test_loss_packed(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 1024,
                "sample_packing": True,
                "flash_attention": True,
                "val_set_size": 0.0,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )
        if is_torch_bf16_gpu_available():
            cfg.bf16 = True
        else:
            cfg.fp16 = True

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)

        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.0, "Train Loss (%s) is too high"
        )


================================================
FILE: tests/e2e/test_phi.py
================================================
"""
E2E tests for lora llama
"""

import unittest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import check_model_output_exists, with_temp_dir


class TestPhi(unittest.TestCase):
    """
    Test case for Phi2 models
    """

    @with_temp_dir
    def test_phi_ft(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "microsoft/phi-1_5",
                "model_type": "AutoModelForCausalLM",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 2048,
                "sample_packing": False,
                "load_in_8bit": False,
                "adapter": None,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "dataset_shard_num": 10,
                "dataset_shard_idx": 0,
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "paged_adamw_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "max_steps": 10,
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

    @with_temp_dir
    def test_phi_qlora(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "microsoft/phi-1_5",
                "model_type": "AutoModelForCausalLM",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 2048,
                "sample_packing": False,
                "load_in_4bit": True,
                "adapter": "qlora",
                "lora_r": 64,
                "lora_alpha": 32,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "dataset_shard_num": 10,
                "dataset_shard_idx": 0,
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "paged_adamw_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "max_steps": 10,
                "save_steps": 10,
                "eval_steps": 10,
                "bf16": "auto",
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/test_preprocess.py
================================================
"""E2E Test the preprocess cli"""

from pathlib import Path

import yaml
from accelerate.test_utils import execute_subprocess_async

from axolotl.utils.dict import DictDefault

AXOLOTL_ROOT = Path(__file__).parent.parent.parent


class TestPreprocess:
    """test cases for preprocess"""

    def test_w_deepspeed(self, temp_dir):
        """make sure preprocess doesn't choke when using deepspeed in the config"""

        cfg = DictDefault(
            {
                "base_model": "Qwen/Qwen2.5-0.5B",
                "sequence_len": 2048,
                "val_set_size": 0.01,
                "datasets": [
                    {
                        "path": "tatsu-lab/alpaca",
                        "type": "alpaca",
                        "split": "train[:10%]",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "bf16": "auto",
                "deepspeed": str(AXOLOTL_ROOT / "deepspeed_configs/zero1.json"),
                "dataset_prepared_path": temp_dir + "/last_run_prepared",
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "axolotl",
                "preprocess",
                str(Path(temp_dir) / "config.yaml"),
            ]
        )

        assert (Path(temp_dir) / "last_run_prepared").exists()


================================================
FILE: tests/e2e/test_process_reward_model_smollm2.py
================================================
"""
E2E tests for process reward model w/ lora llama
"""

import unittest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import check_model_output_exists, check_tensorboard, with_temp_dir


class TestProcessRewardSmolLM2(unittest.TestCase):
    """
    Test case for Llama process reward models using LoRA
    """

    @with_temp_dir
    def test_prm(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "model_type": "AutoModelForTokenClassification",
                "num_labels": 2,
                "process_reward_model": True,
                "sequence_len": 512,
                "val_set_size": 0.0,
                "datasets": [
                    {
                        "path": "trl-lib/math_shepherd",
                        "type": "stepwise_supervised",
                        "step_separator": "\n",
                        "split": "train[:10%]",
                    },
                ],
                "max_steps": 100,
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.0005,
                "optimizer": "adamw_torch",
                "lr_scheduler": "cosine",
                "gradient_checkpointing": True,
                "warmup_ratio": 0.1,
                "use_tensorboard": True,
                "special_tokens": {"pad_token": "<|endoftext|>"},
                "seed": 42,
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.7, "Train Loss (%s) is too high"
        )

        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/test_profiler.py
================================================
"""
e2e gpu test for the pytorch profiler callback
"""

from pathlib import Path

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault


@pytest.fixture(name="profiler_base_cfg")
def fixture_profiler_base_cfg():
    cfg = DictDefault(
        base_model="HuggingFaceTB/SmolLM2-135M",
        tokenizer_type="AutoTokenizer",
        sequence_len=1024,
        load_in_8bit=True,
        adapter="lora",
        lora_r=8,
        lora_alpha=16,
        lora_dropout=0.05,
        lora_target_linear=True,
        val_set_size=0.02,
        special_tokens={"pad_token": "<|endoftext|>"},
        datasets=[
            {
                "path": "mhenrichsen/alpaca_2k_test",
                "type": "alpaca",
            },
        ],
        num_epochs=1,
        micro_batch_size=2,
        gradient_accumulation_steps=1,
        learning_rate=0.00001,
        optimizer="adamw_torch_fused",
        lr_scheduler="cosine",
    )
    return cfg


class TestProfiler:
    """
    test cases for the pytorch profiler callback
    """

    def test_profiler_saves(self, profiler_base_cfg, temp_dir):
        cfg = profiler_base_cfg | DictDefault(
            output_dir=temp_dir,
            max_steps=5,
            profiler_steps=3,
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "snapshot.pickle").exists()

    def test_profiler_saves_w_start(self, profiler_base_cfg, temp_dir):
        cfg = profiler_base_cfg | DictDefault(
            output_dir=temp_dir,
            max_steps=5,
            profiler_steps=3,
            profiler_steps_start=1,
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "snapshot.pickle").exists()

    @pytest.mark.parametrize(
        "profiler_steps_start",
        [3, 5],
    )
    def test_profiler_saves_past_end(
        self, profiler_base_cfg, temp_dir, profiler_steps_start
    ):
        cfg = profiler_base_cfg | DictDefault(
            output_dir=temp_dir,
            max_steps=5,
            profiler_steps=3,
            profiler_steps_start=profiler_steps_start,
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        assert (Path(temp_dir) / "snapshot.pickle").exists()

    def test_profiler_never_started(self, profiler_base_cfg, temp_dir):
        cfg = profiler_base_cfg | DictDefault(
            output_dir=temp_dir,
            max_steps=5,
            profiler_steps=3,
            profiler_steps_start=6,
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        assert not (Path(temp_dir) / "snapshot.pickle").exists()


================================================
FILE: tests/e2e/test_qat.py
================================================
"""
E2E tests for QAT
"""

from pathlib import Path

from axolotl.common.datasets import load_datasets, load_preference_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault
from axolotl.utils.schemas.enums import TorchAOQuantDType
from axolotl.utils.schemas.quantization import QATConfig, validate_ao_dtype

from .utils import check_model_output_exists, check_tensorboard


class TestQATLlama:
    """
    Test case for QAT Llama models
    """

    def test_qat(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mlabonne/FineTome-100k",
                        "type": "chat_template",
                        "field_messages": "conversations",
                        "message_property_mappings": {
                            "role": "from",
                            "content": "value",
                        },
                        "drop_system_message": True,
                        "split": "train[:1%]",
                    },
                ],
                "chat_template": "chatml",
                "qat": {
                    "quantize_embedding": True,
                    "activation_dtype": "int8",
                    "weight_dtype": "int4",
                    "group_size": 8,
                },
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "max_steps": 5,
                "bf16": True,
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(Path(temp_dir) / "checkpoint-5", cfg)

    def test_qat_dpo(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "sequence_len": 2048,
                "sample_packing": False,
                "eval_sample_packing": False,
                "pad_to_sequence_len": True,
                "val_set_size": 0.01,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "rl": "dpo",
                "chat_template": "chatml",
                "datasets": [
                    {
                        "path": "fozziethebeat/alpaca_messages_2k_dpo_test",
                        "type": "chat_template.default",
                        "field_messages": "conversation",
                        "field_chosen": "chosen",
                        "field_rejected": "rejected",
                        "message_field_role": "role",
                        "message_field_content": "content",
                        "roles": {
                            "system": ["system"],
                            "user": ["user"],
                            "assistant": ["assistant"],
                        },
                    },
                ],
                "num_epochs": 1,
                "max_steps": 5,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "warmup_steps": 0,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "use_tensorboard": True,
                "bf16": True,
                "qat": {
                    "quantize_embedding": True,
                    "activation_dtype": "int8",
                    "weight_dtype": "int4",
                    "group_size": 8,
                },
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_preference_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(Path(temp_dir) / "checkpoint-5", cfg)

        loss_threshold = 2.3
        check_tensorboard(
            temp_dir + "/runs",
            "train/train_loss",
            loss_threshold,
            "Train Loss (%s) is too high",
        )


class TestMXFP4Schema:
    """Test MXFP4 schema validation"""

    def test_validate_mxfp4_dtype(self):
        result = validate_ao_dtype("mxfp4")
        assert result == TorchAOQuantDType.mxfp4

    def test_qat_config_with_mxfp4(self):
        """Test QATConfig accepts mxfp4 weight_dtype"""
        config = QATConfig(
            weight_dtype="mxfp4",
            group_size=32,
            quantize_embedding=False,
        )
        assert config.weight_dtype == TorchAOQuantDType.mxfp4
        assert config.group_size == 32

    def test_qat_config_mxfp4_invalid_group_size(self):
        """Test that invalid group_size raises appropriate error during quantization"""
        # Note: Schema validation doesn't check group_size compatibility,
        # that happens in get_quantization_config
        config = QATConfig(
            weight_dtype="mxfp4",
            group_size=16,  # Invalid for mxfp4, but schema allows it
        )
        assert config.group_size == 16  # Schema accepts it
        # Actual validation happens at runtime in get_quantization_config


================================================
FILE: tests/e2e/test_quantization.py
================================================
"""
Tests for axolotl.utils.quantization
"""

import pytest
import torch
from torch import nn
from torchao.prototype.qat import MXFakeQuantizeConfig
from torchao.quantization import LinearActivationQuantizedTensor
from torchao.quantization.qat.embedding import FakeQuantizedEmbedding
from torchao.quantization.qat.linear import FakeQuantizedLinear
from torchao.quantization.quant_api import (
    Float8DynamicActivationFloat8WeightConfig,
    Float8DynamicActivationInt4WeightConfig,
    Int8DynamicActivationInt4WeightConfig,
)
from torchao.quantization.quantize_.workflows.int4.int4_tensor import Int4Tensor
from transformers import AutoModelForCausalLM
from transformers.trainer_callback import TrainerState

from axolotl.utils.callbacks.qat import QATCallback
from axolotl.utils.quantization import (
    convert_qat_model,
    get_quantization_config,
    prepare_model_for_qat,
    quantize_model,
)
from axolotl.utils.schemas.enums import TorchAOQuantDType
from axolotl.utils.schemas.quantization import QATConfig

from tests.e2e.utils import (
    require_torch_2_8_0,
    requires_cuda_ge_8_9,
    requires_sm_ge_100,
)


@pytest.fixture()
def model():
    dummy_model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen2-0.5B",
        device_map="auto",
        dtype=torch.bfloat16,
    )
    with torch.device(dummy_model.device):
        dummy_model.model.embed_tokens = torch.nn.Embedding(
            dummy_model.model.embed_tokens.weight.shape[0],
            dummy_model.model.embed_tokens.weight.shape[1],
            dtype=dummy_model.model.embed_tokens.weight.dtype,
        )
    yield dummy_model
    del dummy_model


ptq_config_test_cases = [
    # weight_dtype, activation_dtype, group_size, expected_type
    (
        TorchAOQuantDType.int4,
        TorchAOQuantDType.int8,
        None,
        Int8DynamicActivationInt4WeightConfig,
    ),
    (
        TorchAOQuantDType.float8_e4m3fn,
        TorchAOQuantDType.float8_e4m3fn,
        None,
        Float8DynamicActivationFloat8WeightConfig,
    ),
    (
        TorchAOQuantDType.int4,
        TorchAOQuantDType.float8_e4m3fn,
        None,
        Float8DynamicActivationInt4WeightConfig,
    ),
]

ptq_test_cases = [
    # weight_dtype, activation_dtype, group_size, quantize_embedding, expected_exception, expected_tensor_class
    (TorchAOQuantDType.int4, None, 4, True, None, Int4Tensor),
    (
        TorchAOQuantDType.int4,
        TorchAOQuantDType.int8,
        8,
        False,
        None,
        LinearActivationQuantizedTensor,
    ),
    # (
    #     TorchAOQuantDType.int4,
    #     TorchAOQuantDType.float8_e4m3fn,
    #     None,
    #     False,
    #     None,
    #     Int4Tensor,
    # ),
    (TorchAOQuantDType.int4, None, None, False, None, Int4Tensor),
    # Deprecated configs
    (TorchAOQuantDType.int8, None, 8, False, ValueError, None),
    (TorchAOQuantDType.int4, TorchAOQuantDType.int4, 8, False, ValueError, None),
    (TorchAOQuantDType.int8, TorchAOQuantDType.int8, 8, True, ValueError, None),
]


class TestQuantization:
    """
    Test quantization utilities
    """

    @pytest.mark.parametrize(
        "weight_dtype,activation_dtype,group_size,expected_type",
        ptq_config_test_cases,
    )
    @requires_cuda_ge_8_9
    @require_torch_2_8_0
    def test_get_ptq_config(
        self, weight_dtype, activation_dtype, group_size, expected_type
    ):
        config = get_quantization_config(weight_dtype, activation_dtype, group_size)
        assert isinstance(config, expected_type)

    @require_torch_2_8_0
    @requires_sm_ge_100
    def test_get_ptq_config_mxfp4(self):
        config = get_quantization_config(TorchAOQuantDType.mxfp4, None, 32)
        assert isinstance(config, MXFakeQuantizeConfig)
        assert config.block_size == 32

    @require_torch_2_8_0
    @requires_sm_ge_100
    def test_get_ptq_config_mxfp4_invalid_group_size(self):
        with pytest.raises(
            ValueError, match="MXFP4 quantization must use a block_size"
        ):
            get_quantization_config(TorchAOQuantDType.mxfp4, None, 16)

    @requires_cuda_ge_8_9
    @require_torch_2_8_0
    def test_get_ptq_config_int4_weight_only(self):
        from torchao.quantization.quant_api import Int4WeightOnlyConfig

        config = get_quantization_config(TorchAOQuantDType.int4, None, 4)
        assert isinstance(config, Int4WeightOnlyConfig)

    @pytest.mark.parametrize(
        "weight_dtype,activation_dtype,group_size,quantize_embedding,expected_exception,expected_tensor_class",
        ptq_test_cases,
    )
    @requires_cuda_ge_8_9
    @require_torch_2_8_0
    def test_quantize_model_for_ptq(
        self,
        model,
        weight_dtype,
        activation_dtype,
        group_size,
        quantize_embedding,
        expected_exception,
        expected_tensor_class,
    ):
        if expected_exception:
            with pytest.raises(expected_exception):
                quantize_model(
                    model,
                    weight_dtype,
                    group_size,
                    activation_dtype,
                    quantize_embedding,
                )
        else:
            quantize_model(
                model, weight_dtype, group_size, activation_dtype, quantize_embedding
            )
            if quantize_embedding:
                assert isinstance(
                    model.model.embed_tokens.weight, expected_tensor_class
                ), "Embedding weight should be quantized"
            for child in list(model.children()):
                if isinstance(child, torch.nn.Linear):
                    assert isinstance(child.weight, expected_tensor_class)

    @require_torch_2_8_0
    @requires_sm_ge_100
    def test_quantize_model_for_ptq_fp8(
        self,
        model,
    ):
        from torchao.quantization.quantize_.workflows.float8.float8_tensor import (
            Float8Tensor,
            QuantizeTensorToFloat8Kwargs,
        )

        quantize_model(
            model,
            TorchAOQuantDType.float8_e4m3fn,
            None,
            TorchAOQuantDType.float8_e4m3fn,
        )
        for child in list(model.children()):
            if isinstance(child, torch.nn.Linear):
                assert isinstance(child.weight, Float8Tensor)
                assert child.weight.act_quant_kwargs is not None and isinstance(
                    child.weight.act_quant_kwargs, QuantizeTensorToFloat8Kwargs
                )

    @require_torch_2_8_0
    @requires_sm_ge_100
    def test_quantize_model_for_ptq_nvfp4(
        self,
        model,
    ):
        from torchao.prototype.mx_formats.nvfp4_tensor import (
            NVFP4Tensor,
            QuantizeTensorToNVFP4Kwargs,
        )

        quantize_model(model, TorchAOQuantDType.nvfp4, 16, TorchAOQuantDType.nvfp4)
        for child in list(model.children()):
            if isinstance(child, torch.nn.Linear):
                assert isinstance(child.weight, NVFP4Tensor)
                assert child.weight.act_quant_kwargs is not None and isinstance(
                    child.weight.act_quant_kwargs, QuantizeTensorToNVFP4Kwargs
                )

    @pytest.mark.parametrize(
        "weight_dtype,activation_dtype,group_size,quantize_embedding",
        [
            (TorchAOQuantDType.int4, None, 8, False),
            (TorchAOQuantDType.int4, None, 16, True),
            (TorchAOQuantDType.int4, TorchAOQuantDType.int8, 8, False),
            (TorchAOQuantDType.int4, TorchAOQuantDType.int8, 16, True),
            (
                TorchAOQuantDType.float8_e4m3fn,
                TorchAOQuantDType.float8_e4m3fn,
                None,
                False,
            ),
            (TorchAOQuantDType.int4, TorchAOQuantDType.float8_e4m3fn, None, True),
        ],
    )
    @require_torch_2_8_0
    @requires_cuda_ge_8_9
    def test_prepare_model_for_qat(
        self, model, weight_dtype, activation_dtype, group_size, quantize_embedding
    ):
        prepare_model_for_qat(
            model,
            weight_dtype,
            group_size,
            activation_dtype,
            quantize_embedding,
        )
        if quantize_embedding:
            assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
            assert hasattr(model.model.embed_tokens, "weight_fake_quantizer")
            assert (
                model.model.embed_tokens.weight_fake_quantizer.config.dtype
                == weight_dtype.value
            )
            if group_size:
                assert (
                    model.model.embed_tokens.weight_fake_quantizer.config.group_size
                    == group_size
                )

        for child in list(model.children()):
            if isinstance(child, torch.nn.Linear):
                assert isinstance(child, FakeQuantizedLinear)
                assert hasattr(child, "weight_fake_quantizer")
                assert child.weight_fake_quantizer.config.dtype == weight_dtype.value
                if group_size:
                    assert child.weight_fake_quantizer.config.group_size == group_size
                if activation_dtype:
                    assert hasattr(child, "activation_fake_quantizer")
                    assert (
                        child.activation_fake_quantizer.config.dtype
                        == activation_dtype.value
                    )
                else:
                    assert child.activation_fake_quantizer is None

    @pytest.mark.parametrize(
        "weight_dtype,activation_dtype,group_size,quantize_embedding",
        [
            (TorchAOQuantDType.mxfp4, None, 32, False),
            (TorchAOQuantDType.mxfp4, None, 32, True),
        ],
    )
    @require_torch_2_8_0
    @requires_sm_ge_100
    def test_prepare_model_for_qat_mxfp4(
        self, model, weight_dtype, activation_dtype, group_size, quantize_embedding
    ):
        prepare_model_for_qat(
            model,
            weight_dtype,
            group_size,
            activation_dtype,
            quantize_embedding,
        )

        if quantize_embedding:
            assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
            assert hasattr(model.model.embed_tokens, "weight_fake_quantizer")

        for child in list(model.children()):
            if isinstance(child, torch.nn.Linear):
                assert isinstance(child, FakeQuantizedLinear)
                assert hasattr(child, "weight_fake_quantizer")

    @require_torch_2_8_0
    @requires_cuda_ge_8_9
    def test_convert_qat_model(self, model):
        config = QATConfig(
            weight_dtype="int4",
            activation_dtype="int8",
            group_size=8,
            quantize_embedding=True,
        )

        # quantize model for qat
        prepare_model_for_qat(
            model,
            config.weight_dtype,
            config.group_size,
            config.activation_dtype,
            config.quantize_embedding,
        )

        assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
        assert isinstance(model.lm_head, FakeQuantizedLinear)

        # apply conversion
        convert_qat_model(
            model,
            config.quantize_embedding,
        )
        # ensure modules have been swapped out
        assert not isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
        assert not isinstance(model.lm_head, FakeQuantizedLinear)

        # ensure weights have been quantized
        assert isinstance(model.model.embed_tokens.weight, nn.Parameter)
        assert isinstance(model.lm_head.weight, nn.Parameter)


class TestQuantizationCallback:
    """
    Test QATCallback
    """

    @pytest.fixture()
    def trainer_state(self):
        return TrainerState(
            global_step=0,
        )

    @require_torch_2_8_0
    def test_qat_callback_fake_quant_after_n_steps(self, model, trainer_state):
        cfg = QATConfig(
            weight_dtype="int4",
            activation_dtype="int8",
            group_size=8,
            quantize_embedding=True,
            fake_quant_after_n_steps=100,
        )

        prepare_model_for_qat(
            model,
            cfg.weight_dtype,
            cfg.group_size,
            cfg.activation_dtype,
            cfg.quantize_embedding,
        )

        # ensure model has been quantized
        assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
        assert model.model.embed_tokens.weight_fake_quantizer.enabled
        assert isinstance(model.lm_head, FakeQuantizedLinear)
        assert model.lm_head.weight_fake_quantizer.enabled

        qat_callback = QATCallback(cfg)

        # simulate first training step
        qat_callback.on_step_begin(
            args=None,
            state=trainer_state,
            control=None,
            model=model,
        )

        # quantization should have been disabled
        assert not model.model.embed_tokens.weight_fake_quantizer.enabled
        assert not model.lm_head.weight_fake_quantizer.enabled

        trainer_state.global_step = 100
        qat_callback.on_step_begin(
            args=None,
            state=trainer_state,
            control=None,
            model=model,
        )

        # quantization should have been enabled
        assert model.model.embed_tokens.weight_fake_quantizer.enabled
        assert model.lm_head.weight_fake_quantizer.enabled

    @require_torch_2_8_0
    def test_qat_callback_fake_quant_after_n_steps_is_none(self, model, trainer_state):
        cfg = QATConfig(
            weight_dtype="int4",
            activation_dtype="int8",
            group_size=8,
            quantize_embedding=True,
            fake_quant_after_n_steps=None,
        )

        prepare_model_for_qat(
            model,
            cfg.weight_dtype,
            cfg.group_size,
            cfg.activation_dtype,
            cfg.quantize_embedding,
        )

        # ensure model has been quantized
        assert isinstance(model.model.embed_tokens, FakeQuantizedEmbedding)
        assert model.model.embed_tokens.weight_fake_quantizer.enabled
        assert isinstance(model.lm_head, FakeQuantizedLinear)
        assert model.lm_head.weight_fake_quantizer.enabled

        qat_callback = QATCallback(cfg)
        # simulate first training step
        qat_callback.on_step_begin(
            args=None,
            state=trainer_state,
            control=None,
            model=model,
        )

        # quantization should be enabled from the get-go
        assert model.model.embed_tokens.weight_fake_quantizer.enabled
        assert model.lm_head.weight_fake_quantizer.enabled


================================================
FILE: tests/e2e/test_qwen.py
================================================
"""
E2E tests for qwen
"""

from pathlib import Path

import pytest
import yaml
from accelerate.test_utils import execute_subprocess_async
from transformers.testing_utils import get_torch_dist_unique_port

from axolotl.utils.dict import DictDefault


class TestE2eQwen:
    """
    Test cases for qwen models
    """

    @pytest.mark.parametrize("base_model", ["Qwen/Qwen2-0.5B", "Qwen/Qwen2.5-0.5B"])
    def test_dpo(self, base_model, temp_dir):
        cfg = DictDefault(
            {
                "base_model": base_model,
                "rl": "dpo",
                "chat_template": "qwen_25",
                "sequence_len": 2048,
                "val_set_size": 0.0,
                "datasets": [
                    {
                        "path": "fozziethebeat/alpaca_messages_2k_dpo_test",
                        "split": "train",
                        "type": "chat_template.default",
                        "field_messages": "conversation",
                        "field_chosen": "chosen",
                        "field_rejected": "rejected",
                        "message_property_mappings": {
                            "role": "role",
                            "content": "content",
                        },
                        "roles": {
                            "system": ["system"],
                            "user": ["user"],
                            "assistant": ["assistant"],
                        },
                    },
                ],
                "num_epochs": 1,
                "max_steps": 5,
                "warmup_steps": 20,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 2,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "bf16": "auto",
                "tf32": True,
                "gradient_checkpointing": True,
                "save_first_step": False,
            }
        )

        # write cfg to yaml file
        Path(temp_dir).mkdir(parents=True, exist_ok=True)
        with open(Path(temp_dir) / "config.yaml", "w", encoding="utf-8") as fout:
            fout.write(yaml.dump(cfg.to_dict(), Dumper=yaml.Dumper))

        execute_subprocess_async(
            [
                "accelerate",
                "launch",
                "--num-processes",
                "2",
                "--main_process_port",
                f"{get_torch_dist_unique_port()}",
                "-m",
                "axolotl.cli.train",
                str(Path(temp_dir) / "config.yaml"),
            ]
        )


================================================
FILE: tests/e2e/test_reward_model_smollm2.py
================================================
"""
E2E tests for reward model lora llama
"""

import unittest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import check_model_output_exists, check_tensorboard, with_temp_dir


class TestRewardModelLoraSmolLM2(unittest.TestCase):
    """
    Test case for Llama reward models using LoRA
    """

    @with_temp_dir
    def test_rm_lora(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "model_type": "AutoModelForSequenceClassification",
                "num_labels": 1,
                "chat_template": "alpaca",
                "reward_model": True,
                "sequence_len": 2048,
                "pad_to_sequence_len": True,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.0,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "argilla/distilabel-intel-orca-dpo-pairs",
                        "type": "bradley_terry.chat_template",
                        "split": "train[:10%]",
                    },
                ],
                "lora_modules_to_save": ["embed_tokens", "lm_head"],
                "remove_unused_columns": False,
                "max_steps": 10,
                "num_epochs": 1,
                "micro_batch_size": 4,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch",
                "lr_scheduler": "cosine",
                "gradient_checkpointing": True,
                "warmup_ratio": 0.1,
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_tensorboard(
            temp_dir + "/runs", "train/train_loss", 2.5, "Train Loss (%s) is too high"
        )
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/test_save_first_step.py
================================================
"""
E2E tests for relora llama
"""

import unittest
from pathlib import Path

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import check_model_output_exists, with_temp_dir


class TestSaveFirstStepCallback(unittest.TestCase):
    """Test cases for save_first_step callback config."""

    @with_temp_dir
    def test_save_first_step(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 512,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 3,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "sample_packing": True,
                "bf16": True,
                "save_first_step": True,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(str(Path(temp_dir) / "checkpoint-1"), cfg)

    @with_temp_dir
    def test_no_save_first_step(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 512,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "max_steps": 3,
                "micro_batch_size": 2,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_bnb_8bit",
                "lr_scheduler": "cosine",
                "flash_attention": True,
                "sample_packing": True,
                "bf16": True,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        with pytest.raises(AssertionError):
            check_model_output_exists(str(Path(temp_dir) / "checkpoint-1"), cfg)


================================================
FILE: tests/e2e/test_schedulers.py
================================================
"""
E2E tests for custom schedulers using Llama
"""

import unittest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import check_model_output_exists, with_temp_dir


class TestCustomSchedulers(unittest.TestCase):
    """
    Test case for Llama models using LoRA
    """

    @with_temp_dir
    def test_rex_scheduler(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "load_in_8bit": True,
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.02,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "num_epochs": 1,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "max_steps": 20,
                "lr_scheduler": "rex",
                "warmup_steps": 5,
                "cosine_min_lr_ratio": 0.05,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)


================================================
FILE: tests/e2e/test_streaming.py
================================================
"""E2E tests for streaming dataset functionality"""

# pylint: disable=duplicate-code

import pytest

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from .utils import check_model_output_exists, check_tensorboard


class TestStreamingDatasets:
    """Test case for streaming datasets"""

    @pytest.mark.parametrize(
        "sample_packing",
        [True, False],
    )
    def test_streaming_dataset(self, temp_dir, sample_packing):
        """Test streaming datasets"""

        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "flash_attention": True,
                "sequence_len": 1024,
                "sample_packing": sample_packing,
                "pretrain_multipack_attn": sample_packing,
                "streaming_multipack_buffer_size": 10000,
                "dataset_num_proc": 1,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                # Streaming config
                "streaming": True,
                "max_steps": 3,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "val_set_size": 0.0,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "bf16": "auto",
                "use_tensorboard": True,
                "save_first_step": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        dataset_meta = load_datasets(cfg=cfg)

        train(cfg=cfg, dataset_meta=dataset_meta)
        check_model_output_exists(temp_dir, cfg)

        # Verify training actually happened by checking loss decrease
        check_tensorboard(
            temp_dir + "/runs",
            "train/train_loss",
            3.0,
            "Train Loss (%s) is too high",
        )


================================================
FILE: tests/e2e/test_tokenizer.py
================================================
"""
e2e test for saving the tokenizer
"""

from unittest.mock import patch

from axolotl.common.datasets import load_datasets
from axolotl.train import train
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from tests.e2e.utils import check_model_output_exists


def test_tokenizer_no_save_jinja_files(temp_dir):
    # pylint: disable=duplicate-code
    cfg = DictDefault(
        {
            "base_model": "HuggingFaceTB/SmolLM2-135M",
            "tokenizer_type": "AutoTokenizer",
            "sequence_len": 1024,
            "load_in_8bit": True,
            "adapter": "lora",
            "lora_r": 8,
            "lora_alpha": 16,
            "lora_dropout": 0.05,
            "lora_target_linear": True,
            "val_set_size": 0.02,
            "special_tokens": {
                "pad_token": "<|endoftext|>",
            },
            "chat_template": "chatml",
            "datasets": [
                {
                    "path": "mhenrichsen/alpaca_2k_test",
                    "type": "alpaca",
                },
            ],
            "num_epochs": 1,
            "micro_batch_size": 2,
            "gradient_accumulation_steps": 1,
            "output_dir": temp_dir,
            "learning_rate": 0.00001,
            "optimizer": "adamw_torch_fused",
            "lr_scheduler": "cosine",
            "max_steps": 5,
            "save_first_step": False,
            "fp16": False,
            "tokenizer_save_jinja_files": False,
        }
    )

    cfg = validate_config(cfg)
    normalize_config(cfg)
    dataset_meta = load_datasets(cfg=cfg)

    with patch("axolotl.train.execute_training"):
        train(cfg=cfg, dataset_meta=dataset_meta)

    check_model_output_exists(temp_dir, cfg)
    with open(f"{temp_dir}/tokenizer_config.json", "r", encoding="utf-8") as f:
        tokenizer_config = f.read()
        assert "chat_template" in tokenizer_config


================================================
FILE: tests/e2e/utils.py
================================================
"""
helper utils for tests
"""

import importlib.util
import os
import shutil
import tempfile
import unittest
from functools import wraps
from pathlib import Path

import torch
from packaging import version
from tbparse import SummaryReader

from axolotl.utils.dict import DictDefault


def with_temp_dir(test_func):
    @wraps(test_func)
    def wrapper(*args, **kwargs):
        # Create a temporary directory
        temp_dir = tempfile.mkdtemp()
        try:
            # Pass the temporary directory to the test function
            test_func(*args, temp_dir=temp_dir, **kwargs)
        finally:
            # Clean up the directory after the test
            shutil.rmtree(temp_dir)

    return wrapper


def most_recent_subdir(path):
    base_path = Path(path)
    subdirectories = [d for d in base_path.iterdir() if d.is_dir()]
    if not subdirectories:
        return None
    subdir = max(subdirectories, key=os.path.getctime)

    return subdir


def require_torch_2_4_1(test_case):
    """
    Decorator marking a test that requires torch >= 2.5.1
    """

    def is_min_2_4_1():
        torch_version = version.parse(torch.__version__)
        return torch_version >= version.parse("2.4.1")

    return unittest.skipUnless(is_min_2_4_1(), "test requires torch>=2.4.1")(test_case)


def require_torch_2_5_1(test_case):
    """
    Decorator marking a test that requires torch >= 2.5.1
    """

    def is_min_2_5_1():
        torch_version = version.parse(torch.__version__)
        return torch_version >= version.parse("2.5.1")

    return unittest.skipUnless(is_min_2_5_1(), "test requires torch>=2.5.1")(test_case)


def require_torch_2_6_0(test_case):
    """
    Decorator marking a test that requires torch >= 2.6.0
    """

    def is_min_2_6_0():
        torch_version = version.parse(torch.__version__)
        return torch_version >= version.parse("2.6.0")

    return unittest.skipUnless(is_min_2_6_0(), "test requires torch>=2.6.0")(test_case)


def require_torch_2_7_0(test_case):
    """
    Decorator marking a test that requires torch >= 2.7.0
    """

    def is_min_2_7_0():
        torch_version = version.parse(torch.__version__)
        return torch_version >= version.parse("2.7.0")

    return unittest.skipUnless(is_min_2_7_0(), "test requires torch>=2.7.0")(test_case)


def require_torch_2_8_0(test_case):
    """
    Decorator marking a test that requires torch >= 2.7.0
    """

    def is_min_2_8_0():
        torch_version = version.parse(torch.__version__)
        return torch_version >= version.parse("2.8.0")

    return unittest.skipUnless(is_min_2_8_0(), "test requires torch>=2.8.0")(test_case)


def require_torch_lt_2_6_0(test_case):
    """
    Decorator marking a test that requires torch < 2.6.0
    """

    def is_max_2_6_0():
        torch_version = version.parse(torch.__version__)
        return torch_version < version.parse("2.6.0")

    return unittest.skipUnless(is_max_2_6_0(), "test requires torch<2.6.0")(test_case)


def require_vllm(test_case):
    """
    Decorator marking a test that requires a vllm to be installed
    """

    def is_vllm_installed():
        return importlib.util.find_spec("vllm") is not None

    return unittest.skipUnless(
        is_vllm_installed(), "test requires vllm to be installed"
    )(test_case)


def require_llmcompressor(test_case):
    """
    Decorator marking a test that requires a llmcompressor to be installed
    """

    def is_llmcompressor_installed():
        return importlib.util.find_spec("llmcompressor") is not None

    return unittest.skipUnless(
        is_llmcompressor_installed(), "test requires llmcompressor to be installed"
    )(test_case)


def requires_sm_ge_100(test_case):
    is_sm_ge_100 = (
        torch.cuda.is_available()
        and torch.version.cuda
        and torch.cuda.get_device_capability() >= (10, 0)
    )
    return unittest.skipUnless(is_sm_ge_100, "test requires sm>=100")(test_case)


def requires_cuda_ge_8_9(test_case):
    is_cuda_ge_8_9 = (
        torch.cuda.is_available()
        and torch.version.cuda
        and torch.cuda.get_device_capability() >= (8, 9)
    )
    return unittest.skipUnless(is_cuda_ge_8_9, "test requires cuda>=8.9")(test_case)


def is_hopper():
    compute_capability = torch.cuda.get_device_capability()
    return compute_capability == (9, 0)


def require_hopper(test_case):
    return unittest.skipUnless(is_hopper(), "test requires h100/hopper GPU")(test_case)


def supports_fp8(test_case):
    compute_capability = torch.cuda.get_device_capability()
    return unittest.skipUnless(
        compute_capability >= (9, 0), "test requires h100 or newer GPU"
    )(test_case)


def check_tensorboard(
    temp_run_dir: str,
    tag: str,
    lt_val: float,
    assertion_err: str,
    rtol: float = 0.02,
    gt_zero: bool = True,
) -> None:
    """
    helper function to parse and check tensorboard logs
    """
    tb_log_path = most_recent_subdir(temp_run_dir)
    event_file = os.path.join(tb_log_path, sorted(os.listdir(tb_log_path))[0])
    reader = SummaryReader(event_file)
    df = reader.scalars
    df = df[(df.tag == tag)]
    lt_val = (1 + rtol) * lt_val
    if "%s" in assertion_err:
        assert df.value.values[-1] < lt_val, assertion_err % df.value.values[-1]
    else:
        assert df.value.values[-1] < lt_val, assertion_err
    if gt_zero:
        assert df.value.values[-1] > 1e-5, "Expected loss to be greater than zero"


def check_model_output_exists(temp_dir: str, cfg: DictDefault) -> None:
    """
    helper function to check if a model output file exists after training

    checks based on adapter or not (always safetensors in Transformers V5)
    """

    if not cfg.adapter:
        assert (Path(temp_dir) / "model.safetensors").exists()
    else:
        assert (Path(temp_dir) / "adapter_model.safetensors").exists()


================================================
FILE: tests/fixtures/alpaca/alpaca.json
================================================
[
  {
    "instruction": "You will be given a series of words. Output these words in reverse order, with each word on its own line.",
    "input": "Words: ['Hello', 'world'].",
    "output": "['world', 'Hello']"
  },
  {
    "instruction": "In this task, you're given a short description of an event. Your job is to order the steps involved in the event from first to last. Note that there may be multiple correct answers for each event.",
    "input": "Description: A man walks into a bar and orders a drink. He pays for his drink and leaves the bar.",
    "output": "1. The man walks into the bar.\n2. He orders a drink.\n3. He pays for his drink.\n4. He leaves the bar."
  }
]


================================================
FILE: tests/fixtures/conversation.json
================================================
{
  "conversations": [
    {
      "from": "human",
      "value": "how could i implement a minesweeper algorithm that utilises algebraic topology to solve boards?"
    },
    {
      "from": "gpt",
      "value": "It's possible to use algebraic topology to solve Minesweeper, but it would likely be quite complex. Algebraic topology is a branch of mathematics that studies the properties of spaces that are preserved under continuous deformations, such as stretching and bending. In the context of Minesweeper, this could be used to analyze the structure of the game board and determine the locations of mines based on certain topological invariants. However, implementing this would require a deep understanding of both algebraic topology and the game of Minesweeper. It's not something that can be explained in a simple answer."
    },
    {
      "from": "human",
      "value": "could you give me the gist of how it could though?"
    },
    {
      "from": "gpt",
      "value": "Sure. The basic idea would be to use algebraic topology to analyze the structure of the Minesweeper game board and determine the locations of mines based on certain topological invariants. This could involve representing the game board as a topological space and using topological invariants such as homotopy groups and Betti numbers to infer information about the location of mines on the board. However, as I mentioned before, implementing this would require a deep understanding of both algebraic topology and the game of Minesweeper, so it's not something that can be explained in a simple answer."
    },
    {
      "from": "human",
      "value": "give me a specific example"
    },
    {
      "from": "gpt",
      "value": "As a specific example, suppose we have a Minesweeper game board with the following configuration:\n```\n 1  1  1  1  1\n 1  ?  ?  ?  1\n 1  ?  ?  ?  1\n 1  ?  ?  ?  1\n 1  1  1  1  1\n```\nIn this case, the game board can be represented as a topological space, where each square on the board corresponds to a point in the space. The mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space. Using algebraic topology, we can analyze the structure of this space and determine the locations of the mines based on certain topological invariants.\n\nFor instance, one way to do this is to compute the homotopy groups of the space. Homotopy groups are a way of describing the holes in a space, and they can be used to infer information about the locations of the mines on the Minesweeper board. In the case of the board above, we can compute the first homotopy group of the space, which describes the number and \"shape\" of the holes in the space. In this case, the first homotopy group would tell us that there are four holes in the space, arranged in a square shape.\n\nBased on this information, we can infer that the mines in the game are located in the four corners of the board, as shown below:\n```\n 1  1  1  1  1\n 1  M  ?  ?  1\n 1  ?  ?  ?  1\n 1  ?  ?  M  1\n 1  1  1  1  1\n```\nOf course, this is just one example of how algebraic topology could be used to solve Minesweeper. There are many other possible approaches, and the specific details of how to implement them would depend on the specific game board and the topological invariants being used. As I mentioned before, implementing a solution like this would require a deep understanding of both algebraic topology and the game of Minesweeper."
    },
    {
      "from": "human",
      "value": "assuming i was someone with a deep understanding of both, could you explain the same concept to me with that assumption?"
    },
    {
      "from": "gpt",
      "value": "Sure. Assuming you have a deep understanding of both algebraic topology and the game of Minesweeper, here is how you might use algebraic topology to solve a Minesweeper game board:\n\n1. Represent the Minesweeper game board as a topological space. Each square on the board corresponds to a point in the space, and the mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space.\n2. Use topological invariants to infer information about the locations of the mines on the board. There are many possible topological invariants that could be used for this, such as homotopy groups, Betti numbers, or persistent homology.\n3. Use the information inferred from the topological invariants to determine the locations of the mines on the board. For example, if the first homotopy group of the space indicates that there are four holes in the space, arranged in a square shape, then we can infer that the mines are located in the four corners of the board.\n4. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you can use this information to avoid them and clear the board.\n\nOf course, this is just one possible approach to using algebraic topology to solve Minesweeper. There may be other ways to do it, depending on the specific game board and the topological invariants being used. However, this should give you a general idea of how algebraic topology can be used to solve Minesweeper."
    },
    {
      "from": "human",
      "value": "how would i do it with persistent homology or any of the other topological invariants?"
    },
    {
      "from": "gpt",
      "value": "To use persistent homology or any of the other topological invariants to solve a Minesweeper game board, you would follow the same basic steps as described above, but with some differences in the specific details.\n\nFor instance, here is how you might use persistent homology to solve a Minesweeper game board:\n\n1. Represent the Minesweeper game board as a topological space. Each square on the board corresponds to a point in the space, and the mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space.\n2. Compute the persistent homology of the space. Persistent homology is a way of describing the holes in a space over time, and it can be used to infer information about the locations of the mines on the Minesweeper board.\n3. Use the information from the persistent homology to determine the locations of the mines on the board. For example, if the persistent homology indicates that there are four holes in the space, arranged in a square shape, then we can infer that the mines are located in the four corners of the board.\n4. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you can use this information to avoid them and clear the board.\n\nOf course, this is just one possible approach to using persistent homology to solve Minesweeper. There may be other ways to do it, depending on the specific game board and the specific details of the persistent homology computation. However, this should give you a general idea of how persistent homology can be used to solve Minesweeper."
    },
    {
      "from": "human",
      "value": "how could i implement this in python?"
    },
    {
      "from": "gpt",
      "value": "To implement a Minesweeper solver that uses algebraic topology in Python, you would need to do the following:\n\n1. Install a Python package for algebraic topology. There are several options available, such as Gudhi, Dionysus, or scikit-tda. These packages provide algorithms and data structures for computing topological invariants, such as homotopy groups, Betti numbers, or persistent homology.\n2. Represent the Minesweeper game board as a topological space. This could be done using a data structure provided by the algebraic topology package, such as a simplicial complex or a Vietoris-Rips complex. Each square on the board would correspond to a point in the space, and the mines on the board would be represented as \"holes\" in the space.\n3. Compute the topological invariants of the space. This could be done using algorithms provided by the algebraic topology package. For example, you could compute the persistent homology of the space using the persistent homology algorithm.\n4. Use the information from the topological invariants to determine the locations of the mines on the board. This could be done using a combination of mathematical reasoning and programming. For example, if the persistent homology indicates that there are four holes in the space, arranged in a square shape, then you could use a piece of code to infer that the mines are located in the four corners of the board.\n5. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you could use a piece of code to automatically click on the safe squares and clear the board.\n\nOf course, this is just one possible approach to implementing a Minesweeper solver that uses algebraic topology in Python. There may be other ways to do it, depending on the specific details of the implementation. However, this should give you a general idea of how to get started with such a project."
    }
  ]
}


================================================
FILE: tests/fixtures/conversation.missingturns.json
================================================
{
  "conversations": [
    {
      "from": "human",
      "value": "how could i implement a minesweeper algorithm that utilises algebraic topology to solve boards?"
    },
    {
      "from": "gpt",
      "value": ""
    },
    {
      "from": "human",
      "value": "could you give me the gist of how it could though?"
    },
    {
      "from": "gpt",
      "value": "Sure. The basic idea would be to use algebraic topology to analyze the structure of the Minesweeper game board and determine the locations of mines based on certain topological invariants. This could involve representing the game board as a topological space and using topological invariants such as homotopy groups and Betti numbers to infer information about the location of mines on the board. However, as I mentioned before, implementing this would require a deep understanding of both algebraic topology and the game of Minesweeper, so it's not something that can be explained in a simple answer."
    },
    {
      "from": "human",
      "value": "give me a specific example"
    },
    {
      "from": "gpt",
      "value": "As a specific example, suppose we have a Minesweeper game board with the following configuration:\n```\n 1  1  1  1  1\n 1  ?  ?  ?  1\n 1  ?  ?  ?  1\n 1  ?  ?  ?  1\n 1  1  1  1  1\n```\nIn this case, the game board can be represented as a topological space, where each square on the board corresponds to a point in the space. The mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space. Using algebraic topology, we can analyze the structure of this space and determine the locations of the mines based on certain topological invariants.\n\nFor instance, one way to do this is to compute the homotopy groups of the space. Homotopy groups are a way of describing the holes in a space, and they can be used to infer information about the locations of the mines on the Minesweeper board. In the case of the board above, we can compute the first homotopy group of the space, which describes the number and \"shape\" of the holes in the space. In this case, the first homotopy group would tell us that there are four holes in the space, arranged in a square shape.\n\nBased on this information, we can infer that the mines in the game are located in the four corners of the board, as shown below:\n```\n 1  1  1  1  1\n 1  M  ?  ?  1\n 1  ?  ?  ?  1\n 1  ?  ?  M  1\n 1  1  1  1  1\n```\nOf course, this is just one example of how algebraic topology could be used to solve Minesweeper. There are many other possible approaches, and the specific details of how to implement them would depend on the specific game board and the topological invariants being used. As I mentioned before, implementing a solution like this would require a deep understanding of both algebraic topology and the game of Minesweeper."
    },
    {
      "from": "human",
      "value": "assuming i was someone with a deep understanding of both, could you explain the same concept to me with that assumption?"
    },
    {
      "from": "gpt",
      "value": "Sure. Assuming you have a deep understanding of both algebraic topology and the game of Minesweeper, here is how you might use algebraic topology to solve a Minesweeper game board:\n\n1. Represent the Minesweeper game board as a topological space. Each square on the board corresponds to a point in the space, and the mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space.\n2. Use topological invariants to infer information about the locations of the mines on the board. There are many possible topological invariants that could be used for this, such as homotopy groups, Betti numbers, or persistent homology.\n3. Use the information inferred from the topological invariants to determine the locations of the mines on the board. For example, if the first homotopy group of the space indicates that there are four holes in the space, arranged in a square shape, then we can infer that the mines are located in the four corners of the board.\n4. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you can use this information to avoid them and clear the board.\n\nOf course, this is just one possible approach to using algebraic topology to solve Minesweeper. There may be other ways to do it, depending on the specific game board and the topological invariants being used. However, this should give you a general idea of how algebraic topology can be used to solve Minesweeper."
    },
    {
      "from": "human",
      "value": "how would i do it with persistent homology or any of the other topological invariants?"
    },
    {
      "from": "gpt",
      "value": "To use persistent homology or any of the other topological invariants to solve a Minesweeper game board, you would follow the same basic steps as described above, but with some differences in the specific details.\n\nFor instance, here is how you might use persistent homology to solve a Minesweeper game board:\n\n1. Represent the Minesweeper game board as a topological space. Each square on the board corresponds to a point in the space, and the mines on the board can be thought of as \"holes\" in the space, which are points that are not part of the space.\n2. Compute the persistent homology of the space. Persistent homology is a way of describing the holes in a space over time, and it can be used to infer information about the locations of the mines on the Minesweeper board.\n3. Use the information from the persistent homology to determine the locations of the mines on the board. For example, if the persistent homology indicates that there are four holes in the space, arranged in a square shape, then we can infer that the mines are located in the four corners of the board.\n4. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you can use this information to avoid them and clear the board.\n\nOf course, this is just one possible approach to using persistent homology to solve Minesweeper. There may be other ways to do it, depending on the specific game board and the specific details of the persistent homology computation. However, this should give you a general idea of how persistent homology can be used to solve Minesweeper."
    },
    {
      "from": "human",
      "value": "how could i implement this in python?"
    },
    {
      "from": "gpt",
      "value": "To implement a Minesweeper solver that uses algebraic topology in Python, you would need to do the following:\n\n1. Install a Python package for algebraic topology. There are several options available, such as Gudhi, Dionysus, or scikit-tda. These packages provide algorithms and data structures for computing topological invariants, such as homotopy groups, Betti numbers, or persistent homology.\n2. Represent the Minesweeper game board as a topological space. This could be done using a data structure provided by the algebraic topology package, such as a simplicial complex or a Vietoris-Rips complex. Each square on the board would correspond to a point in the space, and the mines on the board would be represented as \"holes\" in the space.\n3. Compute the topological invariants of the space. This could be done using algorithms provided by the algebraic topology package. For example, you could compute the persistent homology of the space using the persistent homology algorithm.\n4. Use the information from the topological invariants to determine the locations of the mines on the board. This could be done using a combination of mathematical reasoning and programming. For example, if the persistent homology indicates that there are four holes in the space, arranged in a square shape, then you could use a piece of code to infer that the mines are located in the four corners of the board.\n5. Use this information to solve the Minesweeper game. Once you know the locations of the mines, you could use a piece of code to automatically click on the safe squares and clear the board.\n\nOf course, this is just one possible approach to implementing a Minesweeper solver that uses algebraic topology in Python. There may be other ways to do it, depending on the specific details of the implementation. However, this should give you a general idea of how to get started with such a project."
    }
  ]
}


================================================
FILE: tests/fixtures/conversation.tokenized.json
================================================
{"input_ids": [1, 319, 13563, 1546, 263, 12758, 1404, 322, 385, 23116, 21082, 20255, 29889, 450, 20255, 4076, 8444, 29892, 13173, 29892, 322, 1248, 568, 6089, 304, 278, 1404, 29915, 29879, 5155, 29889, 29871, 3148, 1001, 29901, 920, 1033, 474, 2334, 263, 29086, 705, 11356, 5687, 393, 3667, 4637, 21531, 20159, 304, 4505, 1045, 3163, 29973, 29871, 319, 1799, 9047, 13566, 29901, 739, 29915, 29879, 1950, 304, 671, 21531, 20159, 304, 4505, 341, 1475, 705, 11356, 29892, 541, 372, 723, 5517, 367, 3755, 4280, 29889, 838, 5869, 293, 20159, 338, 263, 5443, 310, 23964, 393, 11898, 278, 4426, 310, 8162, 393, 526, 21634, 1090, 9126, 316, 689, 800, 29892, 1316, 408, 16116, 292, 322, 289, 2548, 29889, 512, 278, 3030, 310, 341, 1475, 705, 11356, 29892, 445, 1033, 367, 1304, 304, 27599, 278, 3829, 310, 278, 3748, 7613, 322, 8161, 278, 14354, 310, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 2398, 29892, 16049, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29889, 739, 29915, 29879, 451, 1554, 393, 508, 367, 10824, 297, 263, 2560, 1234, 21106, 29879, 29958, 2, 3148, 1001, 29901, 1033, 366, 2367, 592, 278, 330, 391, 310, 920, 372, 1033, 2466, 29973, 29871, 319, 1799, 9047, 13566, 29901, 18585, 29889, 450, 6996, 2969, 723, 367, 304, 671, 21531, 20159, 304, 27599, 278, 3829, 310, 278, 341, 1475, 705, 11356, 3748, 7613, 322, 8161, 278, 14354, 310, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 910, 1033, 25135, 15783, 278, 3748, 7613, 408, 263, 25002, 2913, 322, 773, 25002, 15939, 1934, 1316, 408, 3632, 327, 2270, 6471, 322, 350, 9890, 3694, 304, 10115, 2472, 1048, 278, 4423, 310, 29086, 373, 278, 7613, 29889, 2398, 29892, 408, 306, 5276, 1434, 29892, 16049, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29892, 577, 372, 29915, 29879, 451, 1554, 393, 508, 367, 10824, 297, 263, 2560, 1234, 21106, 29879, 29958, 2, 3148, 1001, 29901, 2367, 592, 263, 2702, 1342, 29871, 319, 1799, 9047, 13566, 29901, 1094, 263, 2702, 1342, 29892, 7755, 591, 505, 263, 341, 1475, 705, 11356, 3748, 7613, 411, 278, 1494, 5285, 29901, 13, 28956, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 28956, 13, 797, 445, 1206, 29892, 278, 3748, 7613, 508, 367, 9875, 408, 263, 25002, 2913, 29892, 988, 1269, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29889, 450, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 5293, 21531, 20159, 29892, 591, 508, 27599, 278, 3829, 310, 445, 2913, 322, 8161, 278, 14354, 310, 278, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 13, 13, 2831, 2777, 29892, 697, 982, 304, 437, 445, 338, 304, 10272, 278, 3632, 327, 2270, 6471, 310, 278, 2913, 29889, 15089, 327, 2270, 6471, 526, 263, 982, 310, 20766, 278, 26532, 297, 263, 2913, 29892, 322, 896, 508, 367, 1304, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 341, 1475, 705, 11356, 7613, 29889, 512, 278, 1206, 310, 278, 7613, 2038, 29892, 591, 508, 10272, 278, 937, 3632, 327, 2270, 2318, 310, 278, 2913, 29892, 607, 16612, 278, 1353, 322, 376, 12181, 29908, 310, 278, 26532, 297, 278, 2913, 29889, 512, 445, 1206, 29892, 278, 937, 3632, 327, 2270, 2318, 723, 2649, 502, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29889, 13, 13, 29933, 1463, 373, 445, 2472, 29892, 591, 508, 10115, 393, 278, 29086, 297, 278, 3748, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29892, 408, 4318, 2400, 29901, 13, 28956, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 29871, 29896, 29871, 341, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 341, 259, 29896, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 28956, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1342, 310, 920, 21531, 20159, 1033, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 526, 1784, 916, 1950, 13501, 29892, 322, 278, 2702, 4902, 310, 920, 304, 2334, 963, 723, 8839, 373, 278, 2702, 3748, 7613, 322, 278, 25002, 15939, 1934, 1641, 1304, 29889, 1094, 306, 5276, 1434, 29892, 16049, 263, 1650, 763, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 21106, 29879, 29958, 2, 3148, 1001, 29901, 10241, 474, 471, 4856, 411, 263, 6483, 8004, 310, 1716, 29892, 1033, 366, 5649, 278, 1021, 6964, 304, 592, 411, 393, 11833, 29973, 29871, 319, 1799, 9047, 13566, 29901, 18585, 29889, 17090, 366, 505, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29892, 1244, 338, 920, 366, 1795, 671, 21531, 20159, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29901, 13, 13, 29896, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 7806, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 13, 29906, 29889, 4803, 25002, 15939, 1934, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1670, 526, 1784, 1950, 25002, 15939, 1934, 393, 1033, 367, 1304, 363, 445, 29892, 1316, 408, 3632, 327, 2270, 6471, 29892, 350, 9890, 3694, 29892, 470, 28152, 3632, 3002, 29889, 13, 29941, 29889, 4803, 278, 2472, 10115, 1127, 515, 278, 25002, 15939, 1934, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1152, 1342, 29892, 565, 278, 937, 3632, 327, 2270, 2318, 310, 278, 2913, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 591, 508, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29946, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 508, 671, 445, 2472, 304, 4772, 963, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 773, 21531, 20159, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 3748, 7613, 322, 278, 25002, 15939, 1934, 1641, 1304, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 21531, 20159, 508, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 21106, 29879, 29958, 2, 3148, 1001, 29901, 920, 723, 474, 437, 372, 411, 28152, 3632, 3002, 470, 738, 310, 278, 916, 25002, 15939, 1934, 29973, 29871, 319, 1799, 9047, 13566, 29901, 1763, 671, 28152, 3632, 3002, 470, 738, 310, 278, 916, 25002, 15939, 1934, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29892, 366, 723, 1101, 278, 1021, 6996, 6576, 408, 5439, 2038, 29892, 541, 411, 777, 12651, 297, 278, 2702, 4902, 29889, 13, 13, 2831, 2777, 29892, 1244, 338, 920, 366, 1795, 671, 28152, 3632, 3002, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29901, 13, 13, 29896, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 7806, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 13, 29906, 29889, 11796, 29872, 278, 28152, 3632, 3002, 310, 278, 2913, 29889, 9034, 9696, 3632, 3002, 338, 263, 982, 310, 20766, 278, 26532, 297, 263, 2913, 975, 931, 29892, 322, 372, 508, 367, 1304, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 341, 1475, 705, 11356, 7613, 29889, 13, 29941, 29889, 4803, 278, 2472, 515, 278, 28152, 3632, 3002, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1152, 1342, 29892, 565, 278, 28152, 3632, 3002, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 591, 508, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29946, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 508, 671, 445, 2472, 304, 4772, 963, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 773, 28152, 3632, 3002, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 3748, 7613, 322, 278, 2702, 4902, 310, 278, 28152, 3632, 3002, 16287, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 28152, 3632, 3002, 508, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 21106, 29879, 29958, 2, 3148, 1001, 29901, 920, 1033, 474, 2334, 445, 297, 3017, 29973, 29871, 319, 1799, 9047, 13566, 29901, 1763, 2334, 263, 341, 1475, 705, 11356, 899, 369, 393, 3913, 21531, 20159, 297, 5132, 29892, 366, 723, 817, 304, 437, 278, 1494, 29901, 13, 13, 29896, 29889, 16052, 263, 5132, 3577, 363, 21531, 20159, 29889, 1670, 526, 3196, 3987, 3625, 29892, 1316, 408, 402, 566, 2918, 29892, 360, 291, 952, 375, 29892, 470, 4560, 7354, 29899, 29873, 1388, 29889, 4525, 9741, 3867, 14009, 322, 848, 12286, 363, 20602, 25002, 15939, 1934, 29892, 1316, 408, 3632, 327, 2270, 6471, 29892, 350, 9890, 3694, 29892, 470, 28152, 3632, 3002, 29889, 13, 29906, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 910, 1033, 367, 2309, 773, 263, 848, 3829, 4944, 491, 278, 21531, 20159, 3577, 29892, 1316, 408, 263, 3053, 506, 616, 4280, 470, 263, 478, 2035, 29367, 29899, 29934, 4512, 4280, 29889, 7806, 6862, 373, 278, 7613, 723, 3928, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 723, 367, 9875, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29889, 13, 29941, 29889, 11796, 29872, 278, 25002, 15939, 1934, 310, 278, 2913, 29889, 910, 1033, 367, 2309, 773, 14009, 4944, 491, 278, 21531, 20159, 3577, 29889, 1152, 1342, 29892, 366, 1033, 10272, 278, 28152, 3632, 3002, 310, 278, 2913, 773, 278, 28152, 3632, 3002, 5687, 29889, 13, 29946, 29889, 4803, 278, 2472, 515, 278, 25002, 15939, 1934, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 910, 1033, 367, 2309, 773, 263, 10296, 310, 19475, 24481, 322, 8720, 29889, 1152, 1342, 29892, 565, 278, 28152, 3632, 3002, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 366, 1033, 671, 263, 8424, 310, 775, 304, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29945, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 1033, 671, 263, 8424, 310, 775, 304, 6336, 2828, 373, 278, 9109, 25256, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 16049, 263, 341, 1475, 705, 11356, 899, 369, 393, 3913, 21531, 20159, 297, 5132, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 4902, 310, 278, 5314, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 304, 679, 4687, 411, 1316, 263, 2060, 21106, 29879, 29958, 2], "attention_mask": [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], "labels": [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 739, 29915, 29879, 1950, 304, 671, 21531, 20159, 304, 4505, 341, 1475, 705, 11356, 29892, 541, 372, 723, 5517, 367, 3755, 4280, 29889, 838, 5869, 293, 20159, 338, 263, 5443, 310, 23964, 393, 11898, 278, 4426, 310, 8162, 393, 526, 21634, 1090, 9126, 316, 689, 800, 29892, 1316, 408, 16116, 292, 322, 289, 2548, 29889, 512, 278, 3030, 310, 341, 1475, 705, 11356, 29892, 445, 1033, 367, 1304, 304, 27599, 278, 3829, 310, 278, 3748, 7613, 322, 8161, 278, 14354, 310, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 2398, 29892, 16049, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29889, 739, 29915, 29879, 451, 1554, 393, 508, 367, 10824, 297, 263, 2560, 1234, 21106, 29879, 29958, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 18585, 29889, 450, 6996, 2969, 723, 367, 304, 671, 21531, 20159, 304, 27599, 278, 3829, 310, 278, 341, 1475, 705, 11356, 3748, 7613, 322, 8161, 278, 14354, 310, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 910, 1033, 25135, 15783, 278, 3748, 7613, 408, 263, 25002, 2913, 322, 773, 25002, 15939, 1934, 1316, 408, 3632, 327, 2270, 6471, 322, 350, 9890, 3694, 304, 10115, 2472, 1048, 278, 4423, 310, 29086, 373, 278, 7613, 29889, 2398, 29892, 408, 306, 5276, 1434, 29892, 16049, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29892, 577, 372, 29915, 29879, 451, 1554, 393, 508, 367, 10824, 297, 263, 2560, 1234, 21106, 29879, 29958, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 1094, 263, 2702, 1342, 29892, 7755, 591, 505, 263, 341, 1475, 705, 11356, 3748, 7613, 411, 278, 1494, 5285, 29901, 13, 28956, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 28956, 13, 797, 445, 1206, 29892, 278, 3748, 7613, 508, 367, 9875, 408, 263, 25002, 2913, 29892, 988, 1269, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29889, 450, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 5293, 21531, 20159, 29892, 591, 508, 27599, 278, 3829, 310, 445, 2913, 322, 8161, 278, 14354, 310, 278, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 13, 13, 2831, 2777, 29892, 697, 982, 304, 437, 445, 338, 304, 10272, 278, 3632, 327, 2270, 6471, 310, 278, 2913, 29889, 15089, 327, 2270, 6471, 526, 263, 982, 310, 20766, 278, 26532, 297, 263, 2913, 29892, 322, 896, 508, 367, 1304, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 341, 1475, 705, 11356, 7613, 29889, 512, 278, 1206, 310, 278, 7613, 2038, 29892, 591, 508, 10272, 278, 937, 3632, 327, 2270, 2318, 310, 278, 2913, 29892, 607, 16612, 278, 1353, 322, 376, 12181, 29908, 310, 278, 26532, 297, 278, 2913, 29889, 512, 445, 1206, 29892, 278, 937, 3632, 327, 2270, 2318, 723, 2649, 502, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29889, 13, 13, 29933, 1463, 373, 445, 2472, 29892, 591, 508, 10115, 393, 278, 29086, 297, 278, 3748, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29892, 408, 4318, 2400, 29901, 13, 28956, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 29871, 29896, 29871, 341, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 341, 259, 29896, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 28956, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1342, 310, 920, 21531, 20159, 1033, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 526, 1784, 916, 1950, 13501, 29892, 322, 278, 2702, 4902, 310, 920, 304, 2334, 963, 723, 8839, 373, 278, 2702, 3748, 7613, 322, 278, 25002, 15939, 1934, 1641, 1304, 29889, 1094, 306, 5276, 1434, 29892, 16049, 263, 1650, 763, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 21106, 29879, 29958, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 18585, 29889, 17090, 366, 505, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29892, 1244, 338, 920, 366, 1795, 671, 21531, 20159, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29901, 13, 13, 29896, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 7806, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 13, 29906, 29889, 4803, 25002, 15939, 1934, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1670, 526, 1784, 1950, 25002, 15939, 1934, 393, 1033, 367, 1304, 363, 445, 29892, 1316, 408, 3632, 327, 2270, 6471, 29892, 350, 9890, 3694, 29892, 470, 28152, 3632, 3002, 29889, 13, 29941, 29889, 4803, 278, 2472, 10115, 1127, 515, 278, 25002, 15939, 1934, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1152, 1342, 29892, 565, 278, 937, 3632, 327, 2270, 2318, 310, 278, 2913, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 591, 508, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29946, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 508, 671, 445, 2472, 304, 4772, 963, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 773, 21531, 20159, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 3748, 7613, 322, 278, 25002, 15939, 1934, 1641, 1304, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 21531, 20159, 508, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 21106, 29879, 29958, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 1763, 671, 28152, 3632, 3002, 470, 738, 310, 278, 916, 25002, 15939, 1934, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29892, 366, 723, 1101, 278, 1021, 6996, 6576, 408, 5439, 2038, 29892, 541, 411, 777, 12651, 297, 278, 2702, 4902, 29889, 13, 13, 2831, 2777, 29892, 1244, 338, 920, 366, 1795, 671, 28152, 3632, 3002, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29901, 13, 13, 29896, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 7806, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 13, 29906, 29889, 11796, 29872, 278, 28152, 3632, 3002, 310, 278, 2913, 29889, 9034, 9696, 3632, 3002, 338, 263, 982, 310, 20766, 278, 26532, 297, 263, 2913, 975, 931, 29892, 322, 372, 508, 367, 1304, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 341, 1475, 705, 11356, 7613, 29889, 13, 29941, 29889, 4803, 278, 2472, 515, 278, 28152, 3632, 3002, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1152, 1342, 29892, 565, 278, 28152, 3632, 3002, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 591, 508, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29946, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 508, 671, 445, 2472, 304, 4772, 963, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 773, 28152, 3632, 3002, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 3748, 7613, 322, 278, 2702, 4902, 310, 278, 28152, 3632, 3002, 16287, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 28152, 3632, 3002, 508, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 21106, 29879, 29958, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 1763, 2334, 263, 341, 1475, 705, 11356, 899, 369, 393, 3913, 21531, 20159, 297, 5132, 29892, 366, 723, 817, 304, 437, 278, 1494, 29901, 13, 13, 29896, 29889, 16052, 263, 5132, 3577, 363, 21531, 20159, 29889, 1670, 526, 3196, 3987, 3625, 29892, 1316, 408, 402, 566, 2918, 29892, 360, 291, 952, 375, 29892, 470, 4560, 7354, 29899, 29873, 1388, 29889, 4525, 9741, 3867, 14009, 322, 848, 12286, 363, 20602, 25002, 15939, 1934, 29892, 1316, 408, 3632, 327, 2270, 6471, 29892, 350, 9890, 3694, 29892, 470, 28152, 3632, 3002, 29889, 13, 29906, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 910, 1033, 367, 2309, 773, 263, 848, 3829, 4944, 491, 278, 21531, 20159, 3577, 29892, 1316, 408, 263, 3053, 506, 616, 4280, 470, 263, 478, 2035, 29367, 29899, 29934, 4512, 4280, 29889, 7806, 6862, 373, 278, 7613, 723, 3928, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 723, 367, 9875, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29889, 13, 29941, 29889, 11796, 29872, 278, 25002, 15939, 1934, 310, 278, 2913, 29889, 910, 1033, 367, 2309, 773, 14009, 4944, 491, 278, 21531, 20159, 3577, 29889, 1152, 1342, 29892, 366, 1033, 10272, 278, 28152, 3632, 3002, 310, 278, 2913, 773, 278, 28152, 3632, 3002, 5687, 29889, 13, 29946, 29889, 4803, 278, 2472, 515, 278, 25002, 15939, 1934, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 910, 1033, 367, 2309, 773, 263, 10296, 310, 19475, 24481, 322, 8720, 29889, 1152, 1342, 29892, 565, 278, 28152, 3632, 3002, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 366, 1033, 671, 263, 8424, 310, 775, 304, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29945, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 1033, 671, 263, 8424, 310, 775, 304, 6336, 2828, 373, 278, 9109, 25256, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 16049, 263, 341, 1475, 705, 11356, 899, 369, 393, 3913, 21531, 20159, 297, 5132, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 4902, 310, 278, 5314, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 304, 679, 4687, 411, 1316, 263, 2060, 21106, 29879, 29958, 2]}


================================================
FILE: tests/fixtures/conversation.tokenized_llama2chat.json
================================================
{"input_ids": [1, 518, 25580, 29962, 3532, 14816, 29903, 6778, 13, 3492, 526, 263, 8444, 29892, 3390, 1319, 322, 15993, 20255, 29889, 29849, 1234, 408, 1371, 3730, 408, 1950, 29892, 1550, 1641, 9109, 29889, 3575, 6089, 881, 451, 3160, 738, 10311, 1319, 29892, 443, 621, 936, 29892, 11021, 391, 29892, 7916, 391, 29892, 304, 27375, 29892, 18215, 29892, 470, 27302, 2793, 29889, 3529, 9801, 393, 596, 20890, 526, 5374, 635, 443, 5365, 1463, 322, 6374, 297, 5469, 29889, 13, 13, 3644, 263, 1139, 947, 451, 1207, 738, 4060, 29892, 470, 338, 451, 2114, 1474, 16165, 261, 296, 29892, 5649, 2020, 2012, 310, 22862, 1554, 451, 1959, 29889, 960, 366, 1016, 29915, 29873, 1073, 278, 1234, 304, 263, 1139, 29892, 3113, 1016, 29915, 29873, 6232, 2089, 2472, 29889, 13, 29966, 829, 14816, 29903, 6778, 13, 13, 3525, 1033, 474, 2334, 263, 29086, 705, 11356, 5687, 393, 3667, 4637, 21531, 20159, 304, 4505, 1045, 3163, 29973, 518, 29914, 25580, 29962, 739, 29915, 29879, 1950, 304, 671, 21531, 20159, 304, 4505, 341, 1475, 705, 11356, 29892, 541, 372, 723, 5517, 367, 3755, 4280, 29889, 838, 5869, 293, 20159, 338, 263, 5443, 310, 23964, 393, 11898, 278, 4426, 310, 8162, 393, 526, 21634, 1090, 9126, 316, 689, 800, 29892, 1316, 408, 16116, 292, 322, 289, 2548, 29889, 512, 278, 3030, 310, 341, 1475, 705, 11356, 29892, 445, 1033, 367, 1304, 304, 27599, 278, 3829, 310, 278, 3748, 7613, 322, 8161, 278, 14354, 310, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 2398, 29892, 16049, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29889, 739, 29915, 29879, 451, 1554, 393, 508, 367, 10824, 297, 263, 2560, 1234, 29889, 29871, 2, 1, 518, 25580, 29962, 1033, 366, 2367, 592, 278, 330, 391, 310, 920, 372, 1033, 2466, 29973, 518, 29914, 25580, 29962, 18585, 29889, 450, 6996, 2969, 723, 367, 304, 671, 21531, 20159, 304, 27599, 278, 3829, 310, 278, 341, 1475, 705, 11356, 3748, 7613, 322, 8161, 278, 14354, 310, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 910, 1033, 25135, 15783, 278, 3748, 7613, 408, 263, 25002, 2913, 322, 773, 25002, 15939, 1934, 1316, 408, 3632, 327, 2270, 6471, 322, 350, 9890, 3694, 304, 10115, 2472, 1048, 278, 4423, 310, 29086, 373, 278, 7613, 29889, 2398, 29892, 408, 306, 5276, 1434, 29892, 16049, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29892, 577, 372, 29915, 29879, 451, 1554, 393, 508, 367, 10824, 297, 263, 2560, 1234, 29889, 29871, 2, 1, 518, 25580, 29962, 2367, 592, 263, 2702, 1342, 518, 29914, 25580, 29962, 1094, 263, 2702, 1342, 29892, 7755, 591, 505, 263, 341, 1475, 705, 11356, 3748, 7613, 411, 278, 1494, 5285, 29901, 13, 28956, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 28956, 13, 797, 445, 1206, 29892, 278, 3748, 7613, 508, 367, 9875, 408, 263, 25002, 2913, 29892, 988, 1269, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29889, 450, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 5293, 21531, 20159, 29892, 591, 508, 27599, 278, 3829, 310, 445, 2913, 322, 8161, 278, 14354, 310, 278, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 13, 13, 2831, 2777, 29892, 697, 982, 304, 437, 445, 338, 304, 10272, 278, 3632, 327, 2270, 6471, 310, 278, 2913, 29889, 15089, 327, 2270, 6471, 526, 263, 982, 310, 20766, 278, 26532, 297, 263, 2913, 29892, 322, 896, 508, 367, 1304, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 341, 1475, 705, 11356, 7613, 29889, 512, 278, 1206, 310, 278, 7613, 2038, 29892, 591, 508, 10272, 278, 937, 3632, 327, 2270, 2318, 310, 278, 2913, 29892, 607, 16612, 278, 1353, 322, 376, 12181, 29908, 310, 278, 26532, 297, 278, 2913, 29889, 512, 445, 1206, 29892, 278, 937, 3632, 327, 2270, 2318, 723, 2649, 502, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29889, 13, 13, 29933, 1463, 373, 445, 2472, 29892, 591, 508, 10115, 393, 278, 29086, 297, 278, 3748, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29892, 408, 4318, 2400, 29901, 13, 28956, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 29871, 29896, 29871, 341, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 341, 259, 29896, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 28956, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1342, 310, 920, 21531, 20159, 1033, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 526, 1784, 916, 1950, 13501, 29892, 322, 278, 2702, 4902, 310, 920, 304, 2334, 963, 723, 8839, 373, 278, 2702, 3748, 7613, 322, 278, 25002, 15939, 1934, 1641, 1304, 29889, 1094, 306, 5276, 1434, 29892, 16049, 263, 1650, 763, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29889, 29871, 2, 1, 518, 25580, 29962, 10241, 474, 471, 4856, 411, 263, 6483, 8004, 310, 1716, 29892, 1033, 366, 5649, 278, 1021, 6964, 304, 592, 411, 393, 11833, 29973, 518, 29914, 25580, 29962, 18585, 29889, 17090, 366, 505, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29892, 1244, 338, 920, 366, 1795, 671, 21531, 20159, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29901, 13, 13, 29896, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 7806, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 13, 29906, 29889, 4803, 25002, 15939, 1934, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1670, 526, 1784, 1950, 25002, 15939, 1934, 393, 1033, 367, 1304, 363, 445, 29892, 1316, 408, 3632, 327, 2270, 6471, 29892, 350, 9890, 3694, 29892, 470, 28152, 3632, 3002, 29889, 13, 29941, 29889, 4803, 278, 2472, 10115, 1127, 515, 278, 25002, 15939, 1934, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1152, 1342, 29892, 565, 278, 937, 3632, 327, 2270, 2318, 310, 278, 2913, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 591, 508, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29946, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 508, 671, 445, 2472, 304, 4772, 963, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 773, 21531, 20159, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 3748, 7613, 322, 278, 25002, 15939, 1934, 1641, 1304, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 21531, 20159, 508, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 29889, 29871, 2, 1, 518, 25580, 29962, 920, 723, 474, 437, 372, 411, 28152, 3632, 3002, 470, 738, 310, 278, 916, 25002, 15939, 1934, 29973, 518, 29914, 25580, 29962, 1763, 671, 28152, 3632, 3002, 470, 738, 310, 278, 916, 25002, 15939, 1934, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29892, 366, 723, 1101, 278, 1021, 6996, 6576, 408, 5439, 2038, 29892, 541, 411, 777, 12651, 297, 278, 2702, 4902, 29889, 13, 13, 2831, 2777, 29892, 1244, 338, 920, 366, 1795, 671, 28152, 3632, 3002, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29901, 13, 13, 29896, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 7806, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 13, 29906, 29889, 11796, 29872, 278, 28152, 3632, 3002, 310, 278, 2913, 29889, 9034, 9696, 3632, 3002, 338, 263, 982, 310, 20766, 278, 26532, 297, 263, 2913, 975, 931, 29892, 322, 372, 508, 367, 1304, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 341, 1475, 705, 11356, 7613, 29889, 13, 29941, 29889, 4803, 278, 2472, 515, 278, 28152, 3632, 3002, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1152, 1342, 29892, 565, 278, 28152, 3632, 3002, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 591, 508, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29946, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 508, 671, 445, 2472, 304, 4772, 963, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 773, 28152, 3632, 3002, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 3748, 7613, 322, 278, 2702, 4902, 310, 278, 28152, 3632, 3002, 16287, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 28152, 3632, 3002, 508, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 29889, 29871, 2, 1, 518, 25580, 29962, 920, 1033, 474, 2334, 445, 297, 3017, 29973, 518, 29914, 25580, 29962, 1763, 2334, 263, 341, 1475, 705, 11356, 899, 369, 393, 3913, 21531, 20159, 297, 5132, 29892, 366, 723, 817, 304, 437, 278, 1494, 29901, 13, 13, 29896, 29889, 16052, 263, 5132, 3577, 363, 21531, 20159, 29889, 1670, 526, 3196, 3987, 3625, 29892, 1316, 408, 402, 566, 2918, 29892, 360, 291, 952, 375, 29892, 470, 4560, 7354, 29899, 29873, 1388, 29889, 4525, 9741, 3867, 14009, 322, 848, 12286, 363, 20602, 25002, 15939, 1934, 29892, 1316, 408, 3632, 327, 2270, 6471, 29892, 350, 9890, 3694, 29892, 470, 28152, 3632, 3002, 29889, 13, 29906, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 910, 1033, 367, 2309, 773, 263, 848, 3829, 4944, 491, 278, 21531, 20159, 3577, 29892, 1316, 408, 263, 3053, 506, 616, 4280, 470, 263, 478, 2035, 29367, 29899, 29934, 4512, 4280, 29889, 7806, 6862, 373, 278, 7613, 723, 3928, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 723, 367, 9875, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29889, 13, 29941, 29889, 11796, 29872, 278, 25002, 15939, 1934, 310, 278, 2913, 29889, 910, 1033, 367, 2309, 773, 14009, 4944, 491, 278, 21531, 20159, 3577, 29889, 1152, 1342, 29892, 366, 1033, 10272, 278, 28152, 3632, 3002, 310, 278, 2913, 773, 278, 28152, 3632, 3002, 5687, 29889, 13, 29946, 29889, 4803, 278, 2472, 515, 278, 25002, 15939, 1934, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 910, 1033, 367, 2309, 773, 263, 10296, 310, 19475, 24481, 322, 8720, 29889, 1152, 1342, 29892, 565, 278, 28152, 3632, 3002, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 366, 1033, 671, 263, 8424, 310, 775, 304, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29945, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 1033, 671, 263, 8424, 310, 775, 304, 6336, 2828, 373, 278, 9109, 25256, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 16049, 263, 341, 1475, 705, 11356, 899, 369, 393, 3913, 21531, 20159, 297, 5132, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 4902, 310, 278, 5314, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 304, 679, 4687, 411, 1316, 263, 2060, 29889, 29871, 2, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], "labels": [-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 739, 29915, 29879, 1950, 304, 671, 21531, 20159, 304, 4505, 341, 1475, 705, 11356, 29892, 541, 372, 723, 5517, 367, 3755, 4280, 29889, 838, 5869, 293, 20159, 338, 263, 5443, 310, 23964, 393, 11898, 278, 4426, 310, 8162, 393, 526, 21634, 1090, 9126, 316, 689, 800, 29892, 1316, 408, 16116, 292, 322, 289, 2548, 29889, 512, 278, 3030, 310, 341, 1475, 705, 11356, 29892, 445, 1033, 367, 1304, 304, 27599, 278, 3829, 310, 278, 3748, 7613, 322, 8161, 278, 14354, 310, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 2398, 29892, 16049, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29889, 739, 29915, 29879, 451, 1554, 393, 508, 367, 10824, 297, 263, 2560, 1234, 29889, 29871, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 18585, 29889, 450, 6996, 2969, 723, 367, 304, 671, 21531, 20159, 304, 27599, 278, 3829, 310, 278, 341, 1475, 705, 11356, 3748, 7613, 322, 8161, 278, 14354, 310, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 910, 1033, 25135, 15783, 278, 3748, 7613, 408, 263, 25002, 2913, 322, 773, 25002, 15939, 1934, 1316, 408, 3632, 327, 2270, 6471, 322, 350, 9890, 3694, 304, 10115, 2472, 1048, 278, 4423, 310, 29086, 373, 278, 7613, 29889, 2398, 29892, 408, 306, 5276, 1434, 29892, 16049, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29892, 577, 372, 29915, 29879, 451, 1554, 393, 508, 367, 10824, 297, 263, 2560, 1234, 29889, 29871, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 1094, 263, 2702, 1342, 29892, 7755, 591, 505, 263, 341, 1475, 705, 11356, 3748, 7613, 411, 278, 1494, 5285, 29901, 13, 28956, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 28956, 13, 797, 445, 1206, 29892, 278, 3748, 7613, 508, 367, 9875, 408, 263, 25002, 2913, 29892, 988, 1269, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29889, 450, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 5293, 21531, 20159, 29892, 591, 508, 27599, 278, 3829, 310, 445, 2913, 322, 8161, 278, 14354, 310, 278, 29086, 2729, 373, 3058, 25002, 15939, 1934, 29889, 13, 13, 2831, 2777, 29892, 697, 982, 304, 437, 445, 338, 304, 10272, 278, 3632, 327, 2270, 6471, 310, 278, 2913, 29889, 15089, 327, 2270, 6471, 526, 263, 982, 310, 20766, 278, 26532, 297, 263, 2913, 29892, 322, 896, 508, 367, 1304, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 341, 1475, 705, 11356, 7613, 29889, 512, 278, 1206, 310, 278, 7613, 2038, 29892, 591, 508, 10272, 278, 937, 3632, 327, 2270, 2318, 310, 278, 2913, 29892, 607, 16612, 278, 1353, 322, 376, 12181, 29908, 310, 278, 26532, 297, 278, 2913, 29889, 512, 445, 1206, 29892, 278, 937, 3632, 327, 2270, 2318, 723, 2649, 502, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29889, 13, 13, 29933, 1463, 373, 445, 2472, 29892, 591, 508, 10115, 393, 278, 29086, 297, 278, 3748, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29892, 408, 4318, 2400, 29901, 13, 28956, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 29871, 29896, 29871, 341, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 1577, 259, 29896, 13, 29871, 29896, 29871, 1577, 29871, 1577, 29871, 341, 259, 29896, 13, 29871, 29896, 259, 29896, 259, 29896, 259, 29896, 259, 29896, 13, 28956, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1342, 310, 920, 21531, 20159, 1033, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 526, 1784, 916, 1950, 13501, 29892, 322, 278, 2702, 4902, 310, 920, 304, 2334, 963, 723, 8839, 373, 278, 2702, 3748, 7613, 322, 278, 25002, 15939, 1934, 1641, 1304, 29889, 1094, 306, 5276, 1434, 29892, 16049, 263, 1650, 763, 445, 723, 1996, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29889, 29871, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 18585, 29889, 17090, 366, 505, 263, 6483, 8004, 310, 1716, 21531, 20159, 322, 278, 3748, 310, 341, 1475, 705, 11356, 29892, 1244, 338, 920, 366, 1795, 671, 21531, 20159, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29901, 13, 13, 29896, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 7806, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 13, 29906, 29889, 4803, 25002, 15939, 1934, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1670, 526, 1784, 1950, 25002, 15939, 1934, 393, 1033, 367, 1304, 363, 445, 29892, 1316, 408, 3632, 327, 2270, 6471, 29892, 350, 9890, 3694, 29892, 470, 28152, 3632, 3002, 29889, 13, 29941, 29889, 4803, 278, 2472, 10115, 1127, 515, 278, 25002, 15939, 1934, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1152, 1342, 29892, 565, 278, 937, 3632, 327, 2270, 2318, 310, 278, 2913, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 591, 508, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29946, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 508, 671, 445, 2472, 304, 4772, 963, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 773, 21531, 20159, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 3748, 7613, 322, 278, 25002, 15939, 1934, 1641, 1304, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 21531, 20159, 508, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 29889, 29871, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 1763, 671, 28152, 3632, 3002, 470, 738, 310, 278, 916, 25002, 15939, 1934, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29892, 366, 723, 1101, 278, 1021, 6996, 6576, 408, 5439, 2038, 29892, 541, 411, 777, 12651, 297, 278, 2702, 4902, 29889, 13, 13, 2831, 2777, 29892, 1244, 338, 920, 366, 1795, 671, 28152, 3632, 3002, 304, 4505, 263, 341, 1475, 705, 11356, 3748, 7613, 29901, 13, 13, 29896, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 7806, 6862, 373, 278, 7613, 16161, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 508, 367, 2714, 310, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29892, 607, 526, 3291, 393, 526, 451, 760, 310, 278, 2913, 29889, 13, 29906, 29889, 11796, 29872, 278, 28152, 3632, 3002, 310, 278, 2913, 29889, 9034, 9696, 3632, 3002, 338, 263, 982, 310, 20766, 278, 26532, 297, 263, 2913, 975, 931, 29892, 322, 372, 508, 367, 1304, 304, 10115, 2472, 1048, 278, 14354, 310, 278, 29086, 373, 278, 341, 1475, 705, 11356, 7613, 29889, 13, 29941, 29889, 4803, 278, 2472, 515, 278, 28152, 3632, 3002, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 1152, 1342, 29892, 565, 278, 28152, 3632, 3002, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 591, 508, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29946, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 508, 671, 445, 2472, 304, 4772, 963, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 773, 28152, 3632, 3002, 304, 4505, 341, 1475, 705, 11356, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 3748, 7613, 322, 278, 2702, 4902, 310, 278, 28152, 3632, 3002, 16287, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 28152, 3632, 3002, 508, 367, 1304, 304, 4505, 341, 1475, 705, 11356, 29889, 29871, 2, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, 1763, 2334, 263, 341, 1475, 705, 11356, 899, 369, 393, 3913, 21531, 20159, 297, 5132, 29892, 366, 723, 817, 304, 437, 278, 1494, 29901, 13, 13, 29896, 29889, 16052, 263, 5132, 3577, 363, 21531, 20159, 29889, 1670, 526, 3196, 3987, 3625, 29892, 1316, 408, 402, 566, 2918, 29892, 360, 291, 952, 375, 29892, 470, 4560, 7354, 29899, 29873, 1388, 29889, 4525, 9741, 3867, 14009, 322, 848, 12286, 363, 20602, 25002, 15939, 1934, 29892, 1316, 408, 3632, 327, 2270, 6471, 29892, 350, 9890, 3694, 29892, 470, 28152, 3632, 3002, 29889, 13, 29906, 29889, 16314, 278, 341, 1475, 705, 11356, 3748, 7613, 408, 263, 25002, 2913, 29889, 910, 1033, 367, 2309, 773, 263, 848, 3829, 4944, 491, 278, 21531, 20159, 3577, 29892, 1316, 408, 263, 3053, 506, 616, 4280, 470, 263, 478, 2035, 29367, 29899, 29934, 4512, 4280, 29889, 7806, 6862, 373, 278, 7613, 723, 3928, 304, 263, 1298, 297, 278, 2913, 29892, 322, 278, 29086, 373, 278, 7613, 723, 367, 9875, 408, 376, 5391, 267, 29908, 297, 278, 2913, 29889, 13, 29941, 29889, 11796, 29872, 278, 25002, 15939, 1934, 310, 278, 2913, 29889, 910, 1033, 367, 2309, 773, 14009, 4944, 491, 278, 21531, 20159, 3577, 29889, 1152, 1342, 29892, 366, 1033, 10272, 278, 28152, 3632, 3002, 310, 278, 2913, 773, 278, 28152, 3632, 3002, 5687, 29889, 13, 29946, 29889, 4803, 278, 2472, 515, 278, 25002, 15939, 1934, 304, 8161, 278, 14354, 310, 278, 29086, 373, 278, 7613, 29889, 910, 1033, 367, 2309, 773, 263, 10296, 310, 19475, 24481, 322, 8720, 29889, 1152, 1342, 29892, 565, 278, 28152, 3632, 3002, 14088, 393, 727, 526, 3023, 26532, 297, 278, 2913, 29892, 21050, 297, 263, 6862, 8267, 29892, 769, 366, 1033, 671, 263, 8424, 310, 775, 304, 10115, 393, 278, 29086, 526, 5982, 297, 278, 3023, 26995, 310, 278, 7613, 29889, 13, 29945, 29889, 4803, 445, 2472, 304, 4505, 278, 341, 1475, 705, 11356, 3748, 29889, 9038, 366, 1073, 278, 14354, 310, 278, 29086, 29892, 366, 1033, 671, 263, 8424, 310, 775, 304, 6336, 2828, 373, 278, 9109, 25256, 322, 2821, 278, 7613, 29889, 13, 13, 2776, 3236, 29892, 445, 338, 925, 697, 1950, 2948, 304, 16049, 263, 341, 1475, 705, 11356, 899, 369, 393, 3913, 21531, 20159, 297, 5132, 29889, 1670, 1122, 367, 916, 5837, 304, 437, 372, 29892, 8679, 373, 278, 2702, 4902, 310, 278, 5314, 29889, 2398, 29892, 445, 881, 2367, 366, 263, 2498, 2969, 310, 920, 304, 679, 4687, 411, 1316, 263, 2060, 29889, 29871, 2, 1, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100], "attention_mask": [true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, true, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false, false]}


================================================
FILE: tests/hf_offline_utils.py
================================================
"""
test utils for helpers and decorators
"""

import os
from contextlib import contextmanager
from functools import wraps


def reload_modules(hf_hub_offline):
    # Force reload of the modules that check this variable
    import importlib

    import datasets
    import huggingface_hub.constants
    # from huggingface_hub.utils import reset_sessions

    # Reload the constants module first, as others depend on it
    importlib.reload(huggingface_hub.constants)
    huggingface_hub.constants.HF_HUB_OFFLINE = hf_hub_offline
    importlib.reload(datasets.config)
    datasets.config.HF_HUB_OFFLINE = hf_hub_offline


def enable_hf_offline(test_func):
    """
    test decorator that sets HF_HUB_OFFLINE environment variable to True and restores it after the test even if the test fails.
    :param test_func:
    :return:
    """

    @wraps(test_func)
    def wrapper(*args, **kwargs):
        # Save the original value of HF_HUB_OFFLINE environment variable
        original_hf_offline = os.getenv("HF_HUB_OFFLINE")

        # Set HF_OFFLINE environment variable to True
        os.environ["HF_HUB_OFFLINE"] = "1"

        reload_modules(True)
        try:
            # Run the test function
            return test_func(*args, **kwargs)
        finally:
            # Restore the original value of HF_HUB_OFFLINE environment variable
            if original_hf_offline is not None:
                os.environ["HF_HUB_OFFLINE"] = original_hf_offline
                reload_modules(bool(original_hf_offline))
            else:
                del os.environ["HF_HUB_OFFLINE"]
                reload_modules(False)

    return wrapper


def disable_hf_offline(test_func):
    """
    test decorator that sets HF_HUB_OFFLINE environment variable to False and restores it after the wrapped func
    :param test_func:
    :return:
    """

    @wraps(test_func)
    def wrapper(*args, **kwargs):
        # Save the original value of HF_HUB_OFFLINE environment variable
        original_hf_offline = os.getenv("HF_HUB_OFFLINE")

        # Set HF_OFFLINE environment variable to True
        os.environ["HF_HUB_OFFLINE"] = "0"

        reload_modules(False)
        try:
            # Run the test function
            return test_func(*args, **kwargs)
        finally:
            # Restore the original value of HF_HUB_OFFLINE environment variable
            if original_hf_offline is not None:
                os.environ["HF_HUB_OFFLINE"] = original_hf_offline
                reload_modules(bool(original_hf_offline))
            else:
                del os.environ["HF_HUB_OFFLINE"]
                reload_modules(False)

    return wrapper


@contextmanager
def hf_offline_context(hf_hub_offline):
    """
    Context manager that sets HF_HUB_OFFLINE environment variable to the given value.
    :param hf_hub_offline: The new value for HF_HUB_OFFLINE.
    :return: A context manager.
    """
    original_hf_offline = os.getenv("HF_HUB_OFFLINE")
    os.environ["HF_HUB_OFFLINE"] = str(hf_hub_offline)
    reload_modules(bool(hf_hub_offline))
    yield
    # Restore the original value of HF_HUB_OFFLINE environment variable
    if original_hf_offline is not None:
        os.environ["HF_HUB_OFFLINE"] = original_hf_offline
        reload_modules(bool(original_hf_offline))
    else:
        del os.environ["HF_HUB_OFFLINE"]
        reload_modules(False)


================================================
FILE: tests/integrations/__init__.py
================================================


================================================
FILE: tests/integrations/test_diffusion.py
================================================
"""Tests for diffusion trainer integration."""

# pylint: disable=redefined-outer-name,protected-access

from unittest.mock import Mock

import pytest
import torch

from axolotl.integrations.diffusion import DiffusionTrainer
from axolotl.integrations.diffusion.utils import create_bidirectional_attention_mask
from axolotl.utils.dict import DictDefault


@pytest.fixture
def mock_tokenizer():
    """Create a mock tokenizer."""
    tokenizer = Mock()
    tokenizer.bos_token_id = 1
    tokenizer.eos_token_id = 2
    tokenizer.pad_token_id = 0
    return tokenizer


@pytest.fixture
def diffusion_config():
    """Create a diffusion config."""
    return DictDefault(
        {
            "diffusion": {
                "mask_token_id": 32000,
                "eps": 1e-3,
                "importance_weighting": False,
            },
            "sample_packing": False,
        }
    )


@pytest.fixture
def diffusion_trainer_instance(mock_tokenizer, diffusion_config):
    """Create a diffusion trainer instance for testing methods directly."""
    # Create a minimal trainer instance just for testing methods
    trainer = object.__new__(DiffusionTrainer)  # Bypass __init__
    trainer.cfg = diffusion_config
    trainer._special_token_ids = {0, 1, 2}  # pad, bos, eos
    trainer.processing_class = mock_tokenizer
    trainer.store_metrics = Mock()  # Mock metrics storage
    return trainer


class TestDiffusionTrainer:
    """Test the DiffusionTrainer class."""

    def test_forward_process_basic(self, diffusion_trainer_instance):
        """Test basic forward process without labels."""
        input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long)

        noisy_batch, masked_indices, p_mask = (
            diffusion_trainer_instance._forward_process(input_ids, eps=0.1)
        )

        # Check shapes
        assert noisy_batch.shape == input_ids.shape
        assert masked_indices.shape == input_ids.shape
        assert p_mask.shape == input_ids.shape

        # Check that special tokens are not masked
        special_token_positions = (input_ids == 1) | (input_ids == 2) | (input_ids == 0)
        assert not masked_indices[special_token_positions].any()

        # Check that mask token is applied
        mask_token_id = diffusion_trainer_instance.cfg.diffusion.mask_token_id
        masked_positions = masked_indices
        if masked_positions.any():
            assert (noisy_batch[masked_positions] == mask_token_id).all()

    def test_forward_process_with_labels(self, diffusion_trainer_instance):
        """Test forward process with SFT labels."""
        input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long)
        labels = torch.tensor([[-100, -100, 20, 30, 2]], dtype=torch.long)

        noisy_batch, masked_indices, p_mask = (
            diffusion_trainer_instance._forward_process(
                input_ids, labels=labels, eps=0.1
            )
        )

        # Check shapes
        assert noisy_batch.shape == input_ids.shape
        assert masked_indices.shape == input_ids.shape
        assert p_mask.shape == input_ids.shape

        # Check that only answer tokens can be masked (where labels != -100)
        non_answer_mask = labels == -100

        # No masking should occur on non-answer tokens
        assert not masked_indices[non_answer_mask].any()

        # p_mask should be the same for all positions (sampled timestep),
        # but masking is only applied to answer tokens
        assert p_mask.shape == input_ids.shape
        # Verify that masked_indices respects the answer mask
        assert not masked_indices[non_answer_mask].any()

    def test_forward_process_with_attention_mask(self, diffusion_trainer_instance):
        """Test forward process with attention mask."""
        input_ids = torch.tensor([[1, 10, 20, 0]], dtype=torch.long)
        attention_mask = torch.tensor([[1, 1, 1, 0]], dtype=torch.long)

        _, masked_indices, p_mask = diffusion_trainer_instance._forward_process(
            input_ids, attention_mask=attention_mask, eps=0.1
        )

        # Check that padding tokens are not masked
        padding_positions = attention_mask == 0
        assert not masked_indices[padding_positions].any()
        assert (p_mask[padding_positions] == 0).all()

    def test_bidirectional_attention_mask_no_packing(self, diffusion_trainer_instance):
        """Test bidirectional attention mask without sample packing."""
        input_ids = torch.tensor([[1, 10, 20, 2]], dtype=torch.long)

        mask = create_bidirectional_attention_mask(input_ids)

        # Should be all-to-all attention
        expected_shape = (1, 1, 4, 4)
        assert mask.shape == expected_shape
        assert mask.all()

    def test_bidirectional_attention_mask_with_packing(
        self, diffusion_trainer_instance
    ):
        """Test bidirectional attention mask with sample packing."""
        diffusion_trainer_instance.cfg.sample_packing = True
        input_ids = torch.tensor([[1, 10, 20, 30, 40, 2]], dtype=torch.long)
        # Sample IDs: first sample (1), second sample (2)
        attention_mask = torch.tensor([[1, 1, 1, 2, 2, 2]], dtype=torch.long)

        mask = create_bidirectional_attention_mask(
            input_ids, attention_mask, sample_packing=True
        )

        # Check that tokens within same sample can attend to each other
        # but not across samples
        assert mask[0, 0, 0, 1].item()  # First sample tokens can attend to each other
        assert mask[0, 0, 1, 2].item()
        assert not mask[0, 0, 0, 3].item()  # Can't attend across samples
        assert not mask[0, 0, 2, 4].item()
        assert mask[0, 0, 3, 4].item()  # Second sample tokens can attend to each other

    def test_compute_loss_basic(self, diffusion_trainer_instance):
        """Test basic loss computation."""
        # Mock model that returns logits
        mock_model = Mock()
        mock_outputs = Mock()
        vocab_size = 1000
        seq_len = 5
        mock_outputs.logits = torch.randn(1, seq_len, vocab_size, requires_grad=True)
        mock_model.return_value = mock_outputs
        mock_model.training = True

        input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long)

        loss, outputs = diffusion_trainer_instance._compute_diffusion_loss(
            mock_model, input_ids
        )

        # Check that loss is computed
        assert isinstance(loss, torch.Tensor)
        assert loss.requires_grad
        assert outputs == mock_outputs

        # Check that metrics were stored
        diffusion_trainer_instance.store_metrics.assert_called_once()

    def test_compute_loss_sft(self, diffusion_trainer_instance):
        """Test loss computation with SFT labels."""
        # Mock model
        mock_model = Mock()
        mock_outputs = Mock()
        vocab_size = 1000
        seq_len = 5
        mock_outputs.logits = torch.randn(1, seq_len, vocab_size, requires_grad=True)
        mock_model.return_value = mock_outputs
        mock_model.training = True
        diffusion_trainer_instance.cfg.datasets = Mock()

        input_ids = torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long)
        labels = torch.tensor([[-100, -100, 20, 30, 2]], dtype=torch.long)

        loss, _ = diffusion_trainer_instance._compute_diffusion_loss(
            mock_model, input_ids, labels=labels
        )

        # Check that loss is computed
        assert isinstance(loss, torch.Tensor)
        assert loss.requires_grad

        # Check that SFT metrics were added
        call_args = diffusion_trainer_instance.store_metrics.call_args[0][0]
        assert "answer_ratio" in call_args
        assert "avg_answer_length" in call_args

    def test_compute_loss_no_masked_tokens(self, diffusion_trainer_instance):
        """Test loss computation when no tokens are masked."""
        # Mock model
        mock_model = Mock()
        mock_outputs = Mock()
        vocab_size = 1000
        seq_len = 3
        mock_outputs.logits = torch.randn(1, seq_len, vocab_size)
        mock_model.return_value = mock_outputs
        mock_model.training = True

        # Only special tokens (which won't be masked)
        input_ids = torch.tensor([[1, 0, 2]], dtype=torch.long)

        loss, _ = diffusion_trainer_instance._compute_diffusion_loss(
            mock_model, input_ids
        )

        # Loss should be zero when no tokens are masked
        assert loss.item() == 0.0
        assert loss.requires_grad

    def test_cache_special_token_ids(self, mock_tokenizer):
        """Test caching of special token IDs."""
        trainer = object.__new__(DiffusionTrainer)
        trainer.processing_class = mock_tokenizer
        trainer._cache_special_token_ids()
        assert trainer._special_token_ids == {0, 1, 2}

    def test_cache_special_token_ids_no_tokenizer(self):
        """Test caching when no tokenizer is available."""
        trainer = object.__new__(DiffusionTrainer)
        trainer.processing_class = None
        trainer._cache_special_token_ids()

        assert trainer._special_token_ids == set()

    def test_main_compute_loss_interface(self, diffusion_trainer_instance):
        """Test the main compute_loss interface."""
        # Mock model
        mock_model = Mock()
        mock_outputs = Mock()
        mock_outputs.logits = torch.randn(1, 5, 1000)
        mock_model.return_value = mock_outputs
        mock_model.training = True

        inputs = {
            "input_ids": torch.tensor([[1, 10, 20, 30, 2]], dtype=torch.long),
            "attention_mask": torch.tensor([[1, 1, 1, 1, 1]], dtype=torch.long),
            "labels": torch.tensor([[-100, -100, 20, 30, 2]], dtype=torch.long),
        }

        # Test without return_outputs
        loss = diffusion_trainer_instance.compute_loss(mock_model, inputs)
        assert isinstance(loss, torch.Tensor)

        # Test with return_outputs
        loss, outputs = diffusion_trainer_instance.compute_loss(
            mock_model, inputs, return_outputs=True
        )
        assert isinstance(loss, torch.Tensor)
        assert outputs == mock_outputs

    def test_missing_input_ids_raises_error(self, diffusion_trainer_instance):
        """Test that missing input_ids raises ValueError."""
        mock_model = Mock()
        inputs = {"attention_mask": torch.tensor([[1, 1, 1]])}

        with pytest.raises(ValueError, match="input_ids is required"):
            diffusion_trainer_instance.compute_loss(mock_model, inputs)


================================================
FILE: tests/integrations/test_diffusion_callback.py
================================================
"""Tests for diffusion generation callback dataloader selection and triggering."""

from types import SimpleNamespace
from unittest.mock import Mock

import pytest

from axolotl.integrations.diffusion import DiffusionGenerationCallback


class DummyTrainer:
    """Minimal trainer double with required attributes/methods for the callback."""

    def __init__(self, use_eval: bool):
        # Config used by callback
        self.cfg = SimpleNamespace(
            diffusion=SimpleNamespace(
                generation_interval=1,
                num_generation_samples=1,
                generation_max_length=32,
                generation_steps=4,
                generation_temperature=0.0,
                mask_token_id=16,
            ),
            use_wandb=False,
        )

        # Model/tokenizer are passed through to generate_samples; not used here
        self.model = Mock()
        self.processing_class = Mock()

        # Datasets and loaders
        self.eval_dataset = object() if use_eval else None
        self._train_loader = object()
        self._eval_loader = object()

        # State for world process check
        self.state = SimpleNamespace(is_world_process_zero=True)

        # Track which loader was requested
        self.requested: list[str] = []

    def get_train_dataloader(self):
        self.requested.append("train")
        return self._train_loader

    def get_eval_dataloader(self):
        self.requested.append("eval")
        return self._eval_loader


@pytest.mark.parametrize("use_eval", [False, True])
def test_callback_uses_correct_dataloader(monkeypatch, use_eval):
    trainer = DummyTrainer(use_eval=use_eval)
    callback = DiffusionGenerationCallback(trainer)

    captured = {}

    # Patch generate_samples in the callback module's namespace
    def fake_generate_samples(**kwargs):
        captured["dataloader"] = kwargs.get("dataloader")
        # Return one dummy sample to exercise logging path
        return [
            {
                "original": "o",
                "masked": "m",
                "generated": "g",
                "mask_ratio": 0.5,
                "masked_tokens": 1,
                "total_tokens": 2,
            }
        ]

    monkeypatch.setattr(
        "axolotl.integrations.diffusion.callbacks.generate_samples",
        fake_generate_samples,
    )

    # Trigger at step 1 (interval=1)
    args = SimpleNamespace()
    state = SimpleNamespace(global_step=1)
    control = SimpleNamespace()

    callback.on_step_end(args=args, state=state, control=control)

    # Assert the expected dataloader path was used
    if use_eval:
        assert trainer.requested[0] == "eval"
        assert captured["dataloader"] is trainer._eval_loader
    else:
        assert trainer.requested[0] == "train"
        assert captured["dataloader"] is trainer._train_loader


================================================
FILE: tests/integrations/test_kd_chat_template.py
================================================
"""
Test for KD chat template strategies
"""

from unittest.mock import Mock

import pytest

from axolotl.integrations.kd.chat_template import ChatTemplateStrategyWithKDv2


class TestChatTemplateStrategyWithKDv2:
    """Test v2 strategy correctly handles target_token_ids"""

    @pytest.fixture
    def v2_strategy(self):
        """Create v2 strategy instance with mocked dependencies"""
        # Mock prompter
        mock_prompter = Mock()
        mock_prompter.roles = {"user": "user", "assistant": "assistant"}
        mock_prompter.chat_template_msg_variables = ["role", "content"]
        mock_prompter.chat_template = "{{ messages }}"

        # Mock tokenizer
        mock_tokenizer = Mock()
        mock_tokenizer.pad_token_id = 0
        mock_tokenizer.eos_token_id = 2
        mock_tokenizer.bos_token_id = 1
        mock_tokenizer.eos_token = "<|endoftext|>"
        mock_tokenizer.apply_chat_template = Mock(return_value=[1, 10, 20, 30, 2])
        mock_tokenizer.encode = Mock(return_value=[2])

        return ChatTemplateStrategyWithKDv2(
            prompter=mock_prompter,
            tokenizer=mock_tokenizer,
            train_on_inputs=False,
            sequence_len=512,
            logprobs_field="logprobs",
            gen_temperature=1.0,
            kd_temperature=1.0,
        )

    def test_v2_prepare_kd_fields_adds_target_token_ids(self, v2_strategy):
        """
        Test that v2's _prepare_kd_fields hook adds target_token_ids.

        Validates the Template Method pattern fix where v2 overrides
        the hook to add target_token_ids before transform.
        """
        tokenized = {"input_ids": [1, 10, 20, 30, 2], "labels": [1, 10, 20, 30, 2]}
        original = {"target_token_ids": [[10, 20], [30, 40]]}

        result = v2_strategy._prepare_kd_fields(tokenized, original)

        assert "target_token_ids" in result
        assert result["target_token_ids"] == [[10, 20], [30, 40]]

    def test_v2_prepare_kd_fields_handles_missing_field(self, v2_strategy):
        """Test hook handles missing target_token_ids gracefully"""
        tokenized = {"input_ids": [1, 10, 20, 30, 2], "labels": [1, 10, 20, 30, 2]}
        original = {}

        result = v2_strategy._prepare_kd_fields(tokenized, original)

        assert "target_token_ids" not in result

    def test_v2_transform_requires_target_token_ids(self, v2_strategy):
        """
        Test v2's transform fails without target_token_ids.

        Validates the bug fix - transform expects target_token_ids
        to be added by the hook.
        """
        sample = {
            "input_ids": [1, 10, 20, 30, 2],
            "labels": [1, 10, 20, 30, 2],
            "logprobs": [[-0.1, -0.2], [-0.3, -0.4]],
        }

        with pytest.raises(KeyError, match="target_token_ids"):
            v2_strategy.transform_logprobs(sample)


================================================
FILE: tests/integrations/test_liger.py
================================================
"""
config validation tests for swiglu args
"""

from typing import Optional

import pytest

from axolotl.utils.config import prepare_plugins, validate_config
from axolotl.utils.dict import DictDefault


@pytest.fixture(name="minimal_liger_cfg")
def fixture_cfg():
    return DictDefault(
        {
            "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
            "learning_rate": 0.000001,
            "datasets": [
                {
                    "path": "mhenrichsen/alpaca_2k_test",
                    "type": "alpaca",
                }
            ],
            "micro_batch_size": 1,
            "gradient_accumulation_steps": 1,
            "plugins": ["axolotl.integrations.liger.LigerPlugin"],
        }
    )


class TestValidation:
    """
    Test the validation module for liger
    """

    _caplog: Optional[pytest.LogCaptureFixture] = None

    @pytest.fixture(autouse=True)
    def inject_fixtures(self, caplog):
        caplog.set_level("WARNING")
        self._caplog = caplog

    def test_deprecated_swiglu(self, minimal_liger_cfg):
        test_cfg = DictDefault(
            {
                "liger_swiglu": False,
            }
            | minimal_liger_cfg
        )

        with self._caplog.at_level("WARNING", logger="axolotl.integrations.liger.args"):
            prepare_plugins(test_cfg)
            updated_cfg = validate_config(test_cfg)
            # TODO this test is brittle in CI
            # assert (
            #     "The 'liger_swiglu' argument is deprecated"
            #     in self._caplog.records[0].message
            # )
            assert updated_cfg.liger_swiglu is None
            assert updated_cfg.liger_glu_activation is False

    def test_conflict_swiglu_ligergluactivation(self, minimal_liger_cfg):
        test_cfg = DictDefault(
            {
                "liger_swiglu": False,
                "liger_glu_activation": True,
            }
            | minimal_liger_cfg
        )

        with pytest.raises(
            ValueError,
            match=r".*You cannot have both `liger_swiglu` and `liger_glu_activation` set.*",
        ):
            prepare_plugins(test_cfg)
            validate_config(test_cfg)

    def test_use_token_scaling_require_flce(self, minimal_liger_cfg):
        test_cfg = DictDefault(
            {
                "liger_fused_linear_cross_entropy": False,
                "liger_use_token_scaling": True,
            }
            | minimal_liger_cfg
        )

        with pytest.raises(
            ValueError,
            match=r"`liger_use_token_scaling: true` requires `liger_fused_linear_cross_entropy` enabled.",
        ):
            prepare_plugins(test_cfg)
            validate_config(test_cfg)


================================================
FILE: tests/integrations/test_routing_parity.py
================================================
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Axolotl AI
# Licensed under the Apache License, Version 2.0

"""
Parity tests between scattermoe-lora and sonicmoe routing implementations.

These tests verify that both implementations produce numerically identical
results for the same inputs, ensuring safe centralization of the routing code.

ScatterMoE returns 2D tensors [T, K]; SonicMoE returns flattened 1D [T*K].
The core algorithm should be identical — only the output format differs.
"""

from types import SimpleNamespace

import pytest
import torch


def _require_triton():
    pytest.importorskip("triton")


# ============================================================================
# Fixtures / helpers
# ============================================================================


def _make_softmax_block(T=8, H=16, E=4, K=2):
    """Qwen/OLMoE-style block usable by both implementations."""
    gate = SimpleNamespace(
        weight=torch.randn(E, H),
        top_k=K,
        num_experts=E,
        norm_topk_prob=True,
    )
    moe_block = SimpleNamespace(gate=gate)
    hidden = torch.randn(T, H)
    return moe_block, gate, hidden, T, H, E, K


def _make_sigmoid_block(
    T=8, H=16, E=16, K=4, n_group=2, topk_group=1, bias_on_gate=True
):
    """GLM/DeepSeek-style block usable by both implementations."""
    if bias_on_gate:
        gate = SimpleNamespace(
            weight=torch.randn(E, H),
            e_score_correction_bias=torch.zeros(E),
        )
        moe_block = SimpleNamespace(
            gate=gate,
            top_k=K,
            n_routed_experts=E,
            n_group=n_group,
            topk_group=topk_group,
            norm_topk_prob=True,
            routed_scaling_factor=1.0,
        )
    else:
        # minimax_m2 style: bias on block
        gate = SimpleNamespace(
            weight=torch.randn(E, H),
            top_k=K,
        )
        moe_block = SimpleNamespace(
            gate=gate,
            top_k=K,
            e_score_correction_bias=torch.zeros(E),
        )
    return moe_block, gate, hidden_states(T, H), T, H, E, K


def hidden_states(T, H):
    return torch.randn(T, H)


# ============================================================================
# 1. Softmax routing parity
# ============================================================================


class TestSoftmaxRoutingParity:
    """Verify scattermoe and sonicmoe softmax routing produce identical results."""

    @pytest.fixture(autouse=True)
    def _require(self):
        _require_triton()

    def test_weights_match(self):
        """2D weights from scattermoe == reshaped 1D weights from sonicmoe."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _softmax_topk_route,
        )
        from axolotl.integrations.kernels.sonicmoe.routing import softmax_topk_routing

        moe_block, gate, hidden, T, H, E, K = _make_softmax_block()

        # ScatterMoE path (no LoRA delta)
        sm_weights, sm_experts, sm_topk, sm_E = _softmax_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )

        # SonicMoE path
        sonic_scores, sonic_tok_idx, sonic_exp_idx, sonic_logits = softmax_topk_routing(
            hidden, moe_block
        )

        # ScatterMoE returns [T, K], SonicMoE returns [T*K] flattened
        sonic_weights_2d = sonic_scores.reshape(T, K)
        sonic_experts_2d = sonic_exp_idx.reshape(T, K)

        assert sm_topk == K
        assert sm_E == E

        # Both should select the same experts and produce the same weights
        assert torch.equal(sm_experts, sonic_experts_2d.to(sm_experts.dtype))
        assert torch.allclose(sm_weights, sonic_weights_2d, atol=1e-6)

    def test_logits_not_returned_by_scattermoe(self):
        """ScatterMoE doesn't return logits; SonicMoE does — verify SonicMoE logits shape."""
        from axolotl.integrations.kernels.sonicmoe.routing import softmax_topk_routing

        moe_block, gate, hidden, T, H, E, K = _make_softmax_block()
        _, _, _, logits = softmax_topk_routing(hidden, moe_block)
        assert logits.shape == (T, E)

    def test_no_renorm(self):
        """With norm_topk_prob=False, both should skip renormalization."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _softmax_topk_route,
        )
        from axolotl.integrations.kernels.sonicmoe.routing import softmax_topk_routing

        moe_block, gate, hidden, T, H, E, K = _make_softmax_block()
        gate.norm_topk_prob = False

        sm_weights, sm_experts, _, _ = _softmax_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )
        sonic_scores, _, sonic_exp_idx, _ = softmax_topk_routing(hidden, moe_block)

        sonic_weights_2d = sonic_scores.reshape(T, K)
        sonic_experts_2d = sonic_exp_idx.reshape(T, K)

        assert torch.equal(sm_experts, sonic_experts_2d.to(sm_experts.dtype))
        assert torch.allclose(sm_weights, sonic_weights_2d, atol=1e-6)

    def test_various_expert_counts(self):
        """Parity across different E and K values."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _softmax_topk_route,
        )
        from axolotl.integrations.kernels.sonicmoe.routing import softmax_topk_routing

        for E, K in [(2, 1), (8, 2), (16, 4), (32, 8)]:
            moe_block, gate, hidden, T, H, _, _ = _make_softmax_block(E=E, K=K)

            sm_weights, sm_experts, _, _ = _softmax_topk_route(
                moe_block, gate, hidden, gate.weight, None
            )
            sonic_scores, _, sonic_exp_idx, _ = softmax_topk_routing(hidden, moe_block)

            sonic_weights_2d = sonic_scores.reshape(T, K)
            sonic_experts_2d = sonic_exp_idx.reshape(T, K)

            assert torch.equal(sm_experts, sonic_experts_2d.to(sm_experts.dtype)), (
                f"Expert mismatch for E={E}, K={K}"
            )
            assert torch.allclose(sm_weights, sonic_weights_2d, atol=1e-6), (
                f"Weight mismatch for E={E}, K={K}"
            )


# ============================================================================
# 2. Sigmoid routing parity
# ============================================================================


class TestSigmoidRoutingParity:
    """Verify scattermoe and sonicmoe sigmoid routing produce identical results."""

    @pytest.fixture(autouse=True)
    def _require(self):
        _require_triton()

    def test_weights_match_with_groups(self):
        """Both implementations should produce identical weights with group selection."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _sigmoid_topk_route,
        )
        from axolotl.integrations.kernels.sonicmoe.routing import sigmoid_topk_routing

        moe_block, gate, hidden, T, H, E, K = _make_sigmoid_block(
            E=16, K=4, n_group=2, topk_group=1, bias_on_gate=True
        )

        sm_weights, sm_experts, sm_topk, sm_E = _sigmoid_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )

        sonic_scores, sonic_tok_idx, sonic_exp_idx, sonic_logits = sigmoid_topk_routing(
            hidden, moe_block
        )

        sonic_weights_2d = sonic_scores.reshape(T, K)
        sonic_experts_2d = sonic_exp_idx.reshape(T, K)

        assert sm_topk == K
        assert sm_E == E

        # Sort experts within each token to handle different topk orderings
        sm_sorted, sm_order = sm_experts.sort(dim=-1)
        sonic_sorted, sonic_order = sonic_experts_2d.to(sm_experts.dtype).sort(dim=-1)

        assert torch.equal(sm_sorted, sonic_sorted)

        # Gather weights in sorted order for comparison
        sm_weights_sorted = sm_weights.gather(1, sm_order)
        sonic_weights_sorted = sonic_weights_2d.gather(1, sonic_order)
        assert torch.allclose(sm_weights_sorted, sonic_weights_sorted, atol=1e-6)

    def test_weights_match_no_groups(self):
        """Both implementations match without group selection (n_group=1)."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _sigmoid_topk_route,
        )
        from axolotl.integrations.kernels.sonicmoe.routing import sigmoid_topk_routing

        moe_block, gate, hidden, T, H, E, K = _make_sigmoid_block(
            E=16, K=4, n_group=1, topk_group=1, bias_on_gate=True
        )

        sm_weights, sm_experts, _, _ = _sigmoid_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )
        sonic_scores, _, sonic_exp_idx, _ = sigmoid_topk_routing(hidden, moe_block)

        sonic_weights_2d = sonic_scores.reshape(T, K)
        sonic_experts_2d = sonic_exp_idx.reshape(T, K)

        # Sort for comparison (topk with sorted=False may differ in order)
        sm_sorted, sm_order = sm_experts.sort(dim=-1)
        sonic_sorted, sonic_order = sonic_experts_2d.to(sm_experts.dtype).sort(dim=-1)

        assert torch.equal(sm_sorted, sonic_sorted)
        sm_weights_sorted = sm_weights.gather(1, sm_order)
        sonic_weights_sorted = sonic_weights_2d.gather(1, sonic_order)
        assert torch.allclose(sm_weights_sorted, sonic_weights_sorted, atol=1e-6)

    def test_bias_on_block_parity(self):
        """minimax_m2 style: bias on block, not gate."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _sigmoid_topk_route,
        )
        from axolotl.integrations.kernels.sonicmoe.routing import sigmoid_topk_routing

        moe_block, gate, hidden, T, H, E, K = _make_sigmoid_block(
            E=16, K=4, n_group=1, bias_on_gate=False
        )

        sm_weights, sm_experts, _, _ = _sigmoid_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )
        sonic_scores, _, sonic_exp_idx, _ = sigmoid_topk_routing(hidden, moe_block)

        sonic_weights_2d = sonic_scores.reshape(T, K)
        sonic_experts_2d = sonic_exp_idx.reshape(T, K)

        sm_sorted, sm_order = sm_experts.sort(dim=-1)
        sonic_sorted, sonic_order = sonic_experts_2d.to(sm_experts.dtype).sort(dim=-1)

        assert torch.equal(sm_sorted, sonic_sorted)
        sm_weights_sorted = sm_weights.gather(1, sm_order)
        sonic_weights_sorted = sonic_weights_2d.gather(1, sonic_order)
        assert torch.allclose(sm_weights_sorted, sonic_weights_sorted, atol=1e-6)

    def test_scaling_factor_parity(self):
        """routed_scaling_factor applied identically by both."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _sigmoid_topk_route,
        )
        from axolotl.integrations.kernels.sonicmoe.routing import sigmoid_topk_routing

        moe_block, gate, hidden, T, H, E, K = _make_sigmoid_block(
            n_group=1, bias_on_gate=True
        )
        moe_block.routed_scaling_factor = 2.5

        sm_weights, sm_experts, _, _ = _sigmoid_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )
        sonic_scores, _, sonic_exp_idx, _ = sigmoid_topk_routing(hidden, moe_block)

        sonic_weights_2d = sonic_scores.reshape(T, K)
        sonic_experts_2d = sonic_exp_idx.reshape(T, K)

        sm_sorted, sm_order = sm_experts.sort(dim=-1)
        sonic_sorted, sonic_order = sonic_experts_2d.to(sm_experts.dtype).sort(dim=-1)

        assert torch.equal(sm_sorted, sonic_sorted)
        sm_weights_sorted = sm_weights.gather(1, sm_order)
        sonic_weights_sorted = sonic_weights_2d.gather(1, sonic_order)
        assert torch.allclose(sm_weights_sorted, sonic_weights_sorted, atol=1e-6)

    def test_no_renorm_parity(self):
        """norm_topk_prob=False produces same results in both."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _sigmoid_topk_route,
        )
        from axolotl.integrations.kernels.sonicmoe.routing import sigmoid_topk_routing

        moe_block, gate, hidden, T, H, E, K = _make_sigmoid_block(
            n_group=1, bias_on_gate=True
        )
        moe_block.norm_topk_prob = False

        sm_weights, sm_experts, _, _ = _sigmoid_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )
        sonic_scores, _, sonic_exp_idx, _ = sigmoid_topk_routing(hidden, moe_block)

        sonic_weights_2d = sonic_scores.reshape(T, K)
        sonic_experts_2d = sonic_exp_idx.reshape(T, K)

        sm_sorted, sm_order = sm_experts.sort(dim=-1)
        sonic_sorted, sonic_order = sonic_experts_2d.to(sm_experts.dtype).sort(dim=-1)

        assert torch.equal(sm_sorted, sonic_sorted)
        sm_weights_sorted = sm_weights.gather(1, sm_order)
        sonic_weights_sorted = sonic_weights_2d.gather(1, sonic_order)
        assert torch.allclose(sm_weights_sorted, sonic_weights_sorted, atol=1e-6)


# ============================================================================
# 3. Shared expert parity
# ============================================================================


class TestSharedExpertParity:
    """Verify both _compute_shared_expert implementations behave identically."""

    @pytest.fixture(autouse=True)
    def _require(self):
        _require_triton()

    def _get_both_fns(self):
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _compute_shared_expert as scatter_compute,
        )
        from axolotl.integrations.kernels.sonicmoe.patch import (
            _compute_shared_expert as sonic_compute,
        )

        return scatter_compute, sonic_compute

    def test_shared_expert_singular(self):
        scatter_fn, sonic_fn = self._get_both_fns()
        out = torch.randn(4, 8)
        block = SimpleNamespace(shared_expert=lambda x: out)
        hidden = torch.randn(4, 8)

        assert torch.equal(scatter_fn(block, hidden), sonic_fn(block, hidden))

    def test_shared_experts_plural(self):
        scatter_fn, sonic_fn = self._get_both_fns()
        out = torch.randn(4, 8)
        block = SimpleNamespace(shared_experts=lambda x: out)
        hidden = torch.randn(4, 8)

        assert torch.equal(scatter_fn(block, hidden), sonic_fn(block, hidden))

    def test_shared_mlp(self):
        scatter_fn, sonic_fn = self._get_both_fns()
        out = torch.randn(4, 8)
        block = SimpleNamespace(shared_mlp=lambda x: out)
        hidden = torch.randn(4, 8)

        assert torch.equal(scatter_fn(block, hidden), sonic_fn(block, hidden))

    def test_no_shared_expert(self):
        scatter_fn, sonic_fn = self._get_both_fns()
        block = SimpleNamespace()
        hidden = torch.randn(4, 8)

        assert scatter_fn(block, hidden) is None
        assert sonic_fn(block, hidden) is None

    def test_shared_expert_gate_only_in_scattermoe(self):
        """ScatterMoE's _compute_shared_expert handles shared_expert_gate;
        SonicMoE's patch.py handles it externally in the forward function.

        This documents the known divergence: the scattermoe version applies
        sigmoid gating inline, while sonicmoe applies it in the forward.
        """
        scatter_fn, sonic_fn = self._get_both_fns()

        H = 8
        expert_out = torch.ones(4, H)
        gate_fn = lambda x: torch.zeros(4, H)  # noqa: E731  # sigmoid(0) = 0.5

        block = SimpleNamespace(
            shared_expert=lambda x: expert_out,
            shared_expert_gate=gate_fn,
        )
        hidden = torch.randn(4, H)

        scatter_result = scatter_fn(block, hidden)
        sonic_result = sonic_fn(block, hidden)

        # ScatterMoE applies the gate: expert_out * sigmoid(0) = 0.5
        expected_gated = expert_out * 0.5
        assert torch.allclose(scatter_result, expected_gated, atol=1e-6)

        # SonicMoE does NOT apply the gate here (it does it in the forward)
        assert torch.equal(sonic_result, expert_out)


# ============================================================================
# 4. Route dispatcher parity
# ============================================================================


class TestRouteDispatcherParity:
    """Verify _route in scattermoe dispatches correctly and matches individual fns."""

    @pytest.fixture(autouse=True)
    def _require(self):
        _require_triton()

    def test_route_dispatches_softmax(self):
        """_route should use softmax when no e_score_correction_bias."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _route,
            _softmax_topk_route,
        )

        moe_block, gate, hidden, T, H, E, K = _make_softmax_block()

        route_w, route_e, route_k, route_E = _route(
            moe_block, gate, hidden, gate.weight, None
        )
        direct_w, direct_e, direct_k, direct_E = _softmax_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )

        assert torch.equal(route_w, direct_w)
        assert torch.equal(route_e, direct_e)
        assert route_k == direct_k
        assert route_E == direct_E

    def test_route_dispatches_sigmoid(self):
        """_route should use sigmoid when e_score_correction_bias is present."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _route,
            _sigmoid_topk_route,
        )

        moe_block, gate, hidden, T, H, E, K = _make_sigmoid_block(
            n_group=1, bias_on_gate=True
        )

        route_w, route_e, route_k, route_E = _route(
            moe_block, gate, hidden, gate.weight, None
        )
        direct_w, direct_e, direct_k, direct_E = _sigmoid_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )

        assert torch.equal(route_w, direct_w)
        assert torch.equal(route_e, direct_e)
        assert route_k == direct_k
        assert route_E == direct_E


================================================
FILE: tests/integrations/test_scattermoe_autotune_telemetry.py
================================================
"""Tests for scattermoe autotune telemetry integration.

These tests use mocking to verify the collection and reporting logic
without requiring Triton or CUDA.
"""

import sys
from types import SimpleNamespace
from unittest.mock import MagicMock, patch

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------

# Simulate the hash-suffixed module name that LocalLayerRepository creates.
_FAKE_MODULE_NAME = "scattermoe_lora_abc123.kernels.lora_ops"

# Patch target for _find_lora_ops_module inside the collector module.
_FIND_MODULE_PATH = (
    "axolotl.integrations.kernels.autotune_collector._find_lora_ops_module"
)


def _make_mock_config(kwargs, num_warps=4, num_stages=3):
    """Create a mock triton.Config-like object."""
    return SimpleNamespace(kwargs=kwargs, num_warps=num_warps, num_stages=num_stages)


def _make_mock_kernel(cache=None):
    """Create a mock autotuned kernel object with a ``.cache`` dict."""
    kernel = SimpleNamespace()
    kernel.cache = cache if cache is not None else {}
    return kernel


def _make_mock_lora_ops(
    fwd_cache=None, dx_cache=None, bwd_cache=None, fused_cache=None
):
    """Build a mock ``lora_ops`` module with the four kernel attributes."""
    mod = SimpleNamespace(
        _scatter2scatter_lora=_make_mock_kernel(fwd_cache),
        _scatter2scatter_lora_dX=_make_mock_kernel(dx_cache),
        _group_bwd_lora=_make_mock_kernel(bwd_cache),
        _group_bwd_lora_fused=_make_mock_kernel(fused_cache),
    )
    return mod


def _real_lora_ops_module_names():
    """Return sys.modules keys that match the lora_ops discovery pattern.

    Other tests in the same xdist worker may have loaded the *real*
    lora_ops module.  We need to temporarily hide those entries so the
    discovery test finds only the mock we inject.
    """
    return [
        name
        for name, mod in list(sys.modules.items())
        if mod is not None
        and "lora_ops" in name
        and hasattr(mod, "_scatter2scatter_lora")
    ]


# =========================================================================
# TestAutotuneCollector
# =========================================================================


class TestAutotuneCollector:
    """Test ``collect_autotune_configs`` with mocked kernel objects.

    Collection tests patch ``_find_lora_ops_module`` directly so they are
    not affected by real ``lora_ops`` modules that other tests in the same
    pytest-xdist worker may have loaded into ``sys.modules``.
    """

    def test_empty_cache_returns_empty_list(self):
        """When no kernel has been autotuned yet, return ``[]``."""
        mock_lora_ops = _make_mock_lora_ops()

        with patch(_FIND_MODULE_PATH, return_value=mock_lora_ops):
            from axolotl.integrations.kernels.autotune_collector import (
                collect_autotune_configs,
            )

            result = collect_autotune_configs()
            assert result == []

    def test_populated_cache_returns_configs(self):
        """When a cache entry exists, it appears in the output."""
        cfg = _make_mock_config(
            {"BLOCK_N": 128, "BLOCK_K": 64}, num_warps=8, num_stages=4
        )
        mock_lora_ops = _make_mock_lora_ops(fwd_cache={(2048, 4096, 1024): cfg})

        with patch(_FIND_MODULE_PATH, return_value=mock_lora_ops):
            from axolotl.integrations.kernels.autotune_collector import (
                collect_autotune_configs,
            )

            result = collect_autotune_configs()

        assert len(result) == 1
        entry = result[0]
        assert entry["kernel"] == "scatter2scatter_lora_fwd"
        assert entry["key"] == {"M": 2048, "N": 4096, "K": 1024}
        assert entry["config"]["BLOCK_N"] == 128
        assert entry["config"]["BLOCK_K"] == 64
        assert entry["config"]["num_warps"] == 8
        assert entry["config"]["num_stages"] == 4

    def test_multiple_kernels_and_keys(self):
        """Multiple cache entries across kernels are all returned."""
        cfg_fwd = _make_mock_config({"BLOCK_N": 128, "BLOCK_K": 32})
        cfg_dx = _make_mock_config({"BLOCK_K": 64, "BLOCK_N": 128}, num_warps=8)

        mock_lora_ops = _make_mock_lora_ops(
            fwd_cache={(16, 256, 128): cfg_fwd},
            dx_cache={(16, 256, 128): cfg_dx},
        )

        with patch(_FIND_MODULE_PATH, return_value=mock_lora_ops):
            from axolotl.integrations.kernels.autotune_collector import (
                collect_autotune_configs,
            )

            result = collect_autotune_configs()

        assert len(result) == 2
        names = {r["kernel"] for r in result}
        assert "scatter2scatter_lora_fwd" in names
        assert "scatter2scatter_lora_dX" in names

    def test_extra_key_elements_stored(self):
        """Dtype or other extra elements in the cache key are captured."""
        cfg = _make_mock_config({"BLOCK_N": 64, "BLOCK_K": 32})
        cache_key = (512, 1024, 256, "float16", "float16")

        mock_lora_ops = _make_mock_lora_ops(fwd_cache={cache_key: cfg})

        with patch(_FIND_MODULE_PATH, return_value=mock_lora_ops):
            from axolotl.integrations.kernels.autotune_collector import (
                collect_autotune_configs,
            )

            result = collect_autotune_configs()

        assert len(result) == 1
        key = result[0]["key"]
        assert key["M"] == 512
        assert key["N"] == 1024
        assert key["K"] == 256
        assert key["_extra"] == ["float16", "float16"]

    def test_no_module_in_sys_modules_returns_empty(self):
        """If no lora_ops module is loaded, return ``[]``."""
        from axolotl.integrations.kernels.autotune_collector import (
            collect_autotune_configs,
        )

        with patch(_FIND_MODULE_PATH, return_value=None):
            result = collect_autotune_configs()
        assert result == []

    def test_finds_module_under_hash_suffixed_name(self):
        """Collector finds lora_ops regardless of the hash suffix."""
        cfg = _make_mock_config({"BLOCK_N": 256, "BLOCK_K": 128})
        mock_lora_ops = _make_mock_lora_ops(fwd_cache={(8, 512, 64): cfg})

        # Use a different hash to prove it's not hardcoded.
        alt_name = "scattermoe_lora_deadbeef.kernels.lora_ops"

        # Temporarily hide any real lora_ops modules that other tests in
        # the same xdist worker may have loaded, so only our mock is found.
        real_names = _real_lora_ops_module_names()
        hide_patch = {name: None for name in real_names}

        with patch.dict(sys.modules, {alt_name: mock_lora_ops, **hide_patch}):
            from axolotl.integrations.kernels.autotune_collector import (
                collect_autotune_configs,
            )

            result = collect_autotune_configs()

        assert len(result) == 1
        assert result[0]["config"]["BLOCK_N"] == 256


# =========================================================================
# TestAutotuneReportCallback
# =========================================================================


class TestAutotuneReportCallback:
    """Test the callback fires once and sends the correct event."""

    def test_reports_once_on_first_step(self):
        """Callback should call ``send_event`` exactly once."""
        from axolotl.integrations.kernels.autotune_callback import (
            AutotuneReportCallback,
        )

        cb = AutotuneReportCallback()
        mock_state = MagicMock()
        mock_state.global_step = 1

        fake_configs = [{"kernel": "test_fwd", "key": {}, "config": {}}]

        with (
            patch(
                "axolotl.integrations.kernels.autotune_collector.collect_autotune_configs",
                return_value=fake_configs,
            ),
            patch("axolotl.telemetry.manager.TelemetryManager") as mock_tm_cls,
        ):
            mock_tm = MagicMock()
            mock_tm.enabled = True
            mock_tm_cls.get_instance.return_value = mock_tm

            cb.on_step_end(args=MagicMock(), state=mock_state, control=MagicMock())
            assert mock_tm.send_event.call_count == 1

            call_kwargs = mock_tm.send_event.call_args[1]
            assert call_kwargs["event_type"] == "scattermoe-autotune"
            assert call_kwargs["properties"]["kernel_count"] == 1

            # Second call should NOT send again.
            cb.on_step_end(args=MagicMock(), state=mock_state, control=MagicMock())
            assert mock_tm.send_event.call_count == 1

    def test_retries_until_step_5_then_gives_up(self):
        """If no configs found by step 5, stop retrying."""
        from axolotl.integrations.kernels.autotune_callback import (
            AutotuneReportCallback,
        )

        cb = AutotuneReportCallback()

        with patch(
            "axolotl.integrations.kernels.autotune_collector.collect_autotune_configs",
            return_value=[],
        ):
            for step in range(1, 7):
                mock_state = MagicMock()
                mock_state.global_step = step
                cb.on_step_end(args=MagicMock(), state=mock_state, control=MagicMock())

            assert cb._reported is True

    def test_reports_on_retry_when_data_arrives(self):
        """If step 1 has no data but step 2 does, report at step 2."""
        from axolotl.integrations.kernels.autotune_callback import (
            AutotuneReportCallback,
        )

        cb = AutotuneReportCallback()
        fake_configs = [{"kernel": "fwd", "key": {}, "config": {}}]

        call_count = 0

        def _collector():
            nonlocal call_count
            call_count += 1
            if call_count == 1:
                return []
            return fake_configs

        with (
            patch(
                "axolotl.integrations.kernels.autotune_collector.collect_autotune_configs",
                side_effect=_collector,
            ),
            patch("axolotl.telemetry.manager.TelemetryManager") as mock_tm_cls,
        ):
            mock_tm = MagicMock()
            mock_tm.enabled = True
            mock_tm_cls.get_instance.return_value = mock_tm

            # Step 1 — empty, no report
            s1 = MagicMock()
            s1.global_step = 1
            cb.on_step_end(args=MagicMock(), state=s1, control=MagicMock())
            assert mock_tm.send_event.call_count == 0

            # Step 2 — data arrives, report
            s2 = MagicMock()
            s2.global_step = 2
            cb.on_step_end(args=MagicMock(), state=s2, control=MagicMock())
            assert mock_tm.send_event.call_count == 1

    def test_includes_gpu_info(self):
        """Event properties should include GPU identification."""
        from axolotl.integrations.kernels.autotune_callback import (
            AutotuneReportCallback,
        )

        cb = AutotuneReportCallback()
        mock_state = MagicMock()
        mock_state.global_step = 1

        fake_configs = [{"kernel": "fwd", "key": {}, "config": {}}]
        fake_gpu = {
            "gpu_name": "NVIDIA H100",
            "gpu_compute_capability": "9.0",
            "gpu_memory_bytes": 85899345920,
        }

        fake_smem = {"smem_capacity_bytes": 233472}

        with (
            patch(
                "axolotl.integrations.kernels.autotune_collector.collect_autotune_configs",
                return_value=fake_configs,
            ),
            patch(
                "axolotl.integrations.kernels.autotune_callback._get_gpu_info",
                return_value=fake_gpu,
            ),
            patch(
                "axolotl.integrations.kernels.autotune_callback._get_smem_capacity",
                return_value=fake_smem,
            ),
            patch("axolotl.telemetry.manager.TelemetryManager") as mock_tm_cls,
        ):
            mock_tm = MagicMock()
            mock_tm.enabled = True
            mock_tm_cls.get_instance.return_value = mock_tm

            cb.on_step_end(args=MagicMock(), state=mock_state, control=MagicMock())
            props = mock_tm.send_event.call_args[1]["properties"]
            assert props["gpu_name"] == "NVIDIA H100"
            assert props["gpu_compute_capability"] == "9.0"
            assert props["gpu_memory_bytes"] == 85899345920
            assert props["smem_capacity_bytes"] == 233472

    def test_skips_send_when_telemetry_disabled(self):
        """If telemetry is disabled, no event is sent."""
        from axolotl.integrations.kernels.autotune_callback import (
            AutotuneReportCallback,
        )

        cb = AutotuneReportCallback()
        mock_state = MagicMock()
        mock_state.global_step = 1

        with (
            patch(
                "axolotl.integrations.kernels.autotune_collector.collect_autotune_configs",
                return_value=[{"kernel": "fwd", "key": {}, "config": {}}],
            ),
            patch("axolotl.telemetry.manager.TelemetryManager") as mock_tm_cls,
        ):
            mock_tm = MagicMock()
            mock_tm.enabled = False
            mock_tm_cls.get_instance.return_value = mock_tm

            cb.on_step_end(args=MagicMock(), state=mock_state, control=MagicMock())
            assert mock_tm.send_event.call_count == 0
            # Should still mark as reported so we don't retry.
            assert cb._reported is True


# =========================================================================
# TestKernelsPluginCallbackRegistration
# =========================================================================


class TestKernelsPluginCallbackRegistration:
    """Test that ``KernelsPlugin`` registers the callback correctly."""

    def test_scattermoe_registers_callback(self):
        """When ``use_scattermoe=True``, plugin returns the callback."""
        from axolotl.integrations.kernels.autotune_callback import (
            AutotuneReportCallback,
        )
        from axolotl.integrations.kernels.plugin import KernelsPlugin

        plugin = KernelsPlugin()
        cfg = MagicMock()
        cfg.use_scattermoe = True
        model = MagicMock()

        callbacks = plugin.add_callbacks_pre_trainer(cfg, model)
        assert len(callbacks) == 1
        assert isinstance(callbacks[0], AutotuneReportCallback)

    def test_no_scattermoe_no_callback(self):
        """When ``use_scattermoe=False``, plugin returns empty list."""
        from axolotl.integrations.kernels.plugin import KernelsPlugin

        plugin = KernelsPlugin()
        cfg = MagicMock()
        cfg.use_scattermoe = False
        model = MagicMock()

        callbacks = plugin.add_callbacks_pre_trainer(cfg, model)
        assert callbacks == []


================================================
FILE: tests/integrations/test_scattermoe_lora.py
================================================
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Axolotl AI
# Licensed under the Apache License, Version 2.0

"""
Unit tests for scattermoe-lora.

Tests cover:
- KernelsArgs validator: disable_mlp_kernel
- ParallelExperts: scaling=0.0 not treated as falsy
- single2scatter: non-aligned K/N dimensions
- group_compileable: coeff=None accepted
- HFScatterMoEGatedMLP / ScatterMoEGatedMLP: return value contract
- Routing strategy detection and sigmoid routing
- Generic shared expert handling
"""

from types import SimpleNamespace
from unittest.mock import patch

import pytest
import torch

# ============================================================================
# 1. KernelsArgs: disable_mlp_kernel validator
# ============================================================================


class TestKernelsArgsValidator:
    """Test that disable_mlp_kernel sets both flags correctly.

    These tests call the validator classmethod directly on raw dicts,
    since lora_mlp_kernel / mlp_kernel are not declared model fields.
    """

    def test_disables_lora_mlp_kernel_when_scattermoe(self):
        """lora_mlp_kernel=True gets set to False when use_scattermoe=True."""
        from axolotl.integrations.kernels.args import KernelsArgs

        data = {
            "use_kernels": True,
            "use_scattermoe": True,
            "lora_mlp_kernel": True,
        }
        result = KernelsArgs.disable_mlp_kernel(data)
        assert result["lora_mlp_kernel"] is False
        assert result["mlp_kernel"] is False

    def test_mlp_kernel_disabled_without_lora(self):
        """Even without lora_mlp_kernel, mlp_kernel should be disabled."""
        from axolotl.integrations.kernels.args import KernelsArgs

        data = {
            "use_kernels": True,
            "use_scattermoe": True,
        }
        result = KernelsArgs.disable_mlp_kernel(data)
        assert result["mlp_kernel"] is False
        # lora_mlp_kernel was not in data, should not be added
        assert "lora_mlp_kernel" not in result

    def test_lora_mlp_kernel_false_unchanged(self):
        """lora_mlp_kernel=False should stay False (no warning, no change)."""
        from axolotl.integrations.kernels.args import KernelsArgs

        data = {
            "use_kernels": True,
            "use_scattermoe": True,
            "lora_mlp_kernel": False,
        }
        result = KernelsArgs.disable_mlp_kernel(data)
        assert result["lora_mlp_kernel"] is False

    def test_no_change_when_scattermoe_disabled(self):
        """When use_scattermoe is not True, nothing should be changed."""
        from axolotl.integrations.kernels.args import KernelsArgs

        data = {
            "use_kernels": True,
            "use_scattermoe": False,
            "lora_mlp_kernel": True,
        }
        result = KernelsArgs.disable_mlp_kernel(data)
        assert result["lora_mlp_kernel"] is True


class TestParallelExpertsScaling:
    """Test that scaling=0.0 is preserved and not overridden to 1.0."""

    def test_scaling_zero_preserved(self):
        """scaling=0.0 should be passed as 0.0, not replaced with 1.0."""
        pytest.importorskip("triton")
        from axolotl.integrations.kernels.libs.scattermoe_lora.lora_ops import (
            ParallelExperts,
        )

        pe = ParallelExperts(num_experts=2, input_size=4, output_size=4)
        pe.set_lora(
            lora_A=torch.randn(4, 4),
            lora_B=torch.randn(4, 4),
            scaling=0.0,
        )
        assert pe._lora_scaling == 0.0

        # Patch parallel_linear_lora to capture the scaling arg
        with patch(
            "axolotl.integrations.kernels.libs.scattermoe_lora.lora_ops.parallel_linear_lora"
        ) as mock_pll:
            mock_pll.return_value = torch.randn(4, 4)
            # Create dummy routing tensors
            pe.forward(
                inputs=torch.randn(2, 4),
                k=1,
                sorted_expert_idxs=torch.tensor([0, 0, 1, 1]),
                sorted_scattered_idxs=torch.tensor([0, 1, 0, 1]),
                expert_offsets=torch.tensor([2, 4]),
            )
            # Check that scaling=0.0 was passed, not 1.0
            call_kwargs = mock_pll.call_args
            assert (
                call_kwargs.kwargs.get("scaling") == 0.0
                or call_kwargs[1].get("scaling") == 0.0
            ), f"Expected scaling=0.0 but got {call_kwargs}"

    def test_scaling_none_defaults_to_one(self):
        """scaling=None (no LoRA attached) should default to 1.0."""
        pytest.importorskip("triton")
        from axolotl.integrations.kernels.libs.scattermoe_lora.lora_ops import (
            ParallelExperts,
        )

        pe = ParallelExperts(num_experts=2, input_size=4, output_size=4)
        # No set_lora called, so _lora_scaling is None

        with patch(
            "axolotl.integrations.kernels.libs.scattermoe_lora.lora_ops.parallel_linear_lora"
        ) as mock_pll:
            mock_pll.return_value = torch.randn(4, 4)
            pe.forward(
                inputs=torch.randn(2, 4),
                k=1,
                sorted_expert_idxs=torch.tensor([0, 0, 1, 1]),
                sorted_scattered_idxs=torch.tensor([0, 1, 0, 1]),
                expert_offsets=torch.tensor([2, 4]),
            )
            call_kwargs = mock_pll.call_args
            scaling_val = call_kwargs.kwargs.get("scaling") or call_kwargs[1].get(
                "scaling"
            )
            assert scaling_val == 1.0, (
                f"Expected scaling=1.0 for None but got {scaling_val}"
            )

    def test_scaling_positive_preserved(self):
        """Normal positive scaling should be preserved."""
        pytest.importorskip("triton")
        from axolotl.integrations.kernels.libs.scattermoe_lora.lora_ops import (
            ParallelExperts,
        )

        pe = ParallelExperts(num_experts=2, input_size=4, output_size=4)
        pe.set_lora(
            lora_A=torch.randn(4, 4),
            lora_B=torch.randn(4, 4),
            scaling=0.5,
        )

        with patch(
            "axolotl.integrations.kernels.libs.scattermoe_lora.lora_ops.parallel_linear_lora"
        ) as mock_pll:
            mock_pll.return_value = torch.randn(4, 4)
            pe.forward(
                inputs=torch.randn(2, 4),
                k=1,
                sorted_expert_idxs=torch.tensor([0, 0, 1, 1]),
                sorted_scattered_idxs=torch.tensor([0, 1, 0, 1]),
                expert_offsets=torch.tensor([2, 4]),
            )
            call_kwargs = mock_pll.call_args
            scaling_val = call_kwargs.kwargs.get("scaling") or call_kwargs[1].get(
                "scaling"
            )
            assert scaling_val == 0.5


# ============================================================================
# 4. single2scatter: non-aligned K/N dimensions (GPU only)
# ============================================================================


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
class TestSingle2ScatterBounds:
    """Test single2scatter with non-aligned dimensions."""

    def test_non_aligned_k(self):
        """K not a multiple of BLOCK_K should produce correct results."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.kernels.single import (
            single2scatter,
        )

        E, K, N = 2, 100, 128  # K=100 not a multiple of 128
        W = torch.randn(E, K, N, device="cuda", dtype=torch.float32)
        X = torch.randn(1, K, device="cuda", dtype=torch.float32)
        expert_idxs = torch.tensor([[0, 1]], device="cuda", dtype=torch.long)

        Y = single2scatter(X, W, expert_idxs)
        assert Y.shape == (2, N)

        # Verify against manual computation
        Y_ref_0 = X[0] @ W[0]
        Y_ref_1 = X[0] @ W[1]
        torch.testing.assert_close(Y[0], Y_ref_0, atol=1e-2, rtol=1e-2)
        torch.testing.assert_close(Y[1], Y_ref_1, atol=1e-2, rtol=1e-2)

    def test_non_aligned_n(self):
        """N not a multiple of BLOCK_N should produce correct results."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.kernels.single import (
            single2scatter,
        )

        E, K, N = 2, 128, 100  # N=100 not a multiple of 128
        W = torch.randn(E, K, N, device="cuda", dtype=torch.float32)
        X = torch.randn(1, K, device="cuda", dtype=torch.float32)
        expert_idxs = torch.tensor([[0, 1]], device="cuda", dtype=torch.long)

        Y = single2scatter(X, W, expert_idxs)
        assert Y.shape == (2, N)

        Y_ref_0 = X[0] @ W[0]
        Y_ref_1 = X[0] @ W[1]
        torch.testing.assert_close(Y[0], Y_ref_0, atol=1e-2, rtol=1e-2)
        torch.testing.assert_close(Y[1], Y_ref_1, atol=1e-2, rtol=1e-2)

    def test_non_aligned_both(self):
        """Both K and N not aligned should produce correct results."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.kernels.single import (
            single2scatter,
        )

        E, K, N = 2, 100, 100  # Neither aligned to 128
        W = torch.randn(E, K, N, device="cuda", dtype=torch.float32)
        X = torch.randn(1, K, device="cuda", dtype=torch.float32)
        expert_idxs = torch.tensor([[0, 1]], device="cuda", dtype=torch.long)

        Y = single2scatter(X, W, expert_idxs)
        assert Y.shape == (2, N)

        Y_ref_0 = X[0] @ W[0]
        Y_ref_1 = X[0] @ W[1]
        torch.testing.assert_close(Y[0], Y_ref_0, atol=1e-2, rtol=1e-2)
        torch.testing.assert_close(Y[1], Y_ref_1, atol=1e-2, rtol=1e-2)


# ============================================================================
# 5. group_compileable: coeff=None accepted
# ============================================================================


@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
class TestGroupCoeffNone:
    """Test that group() works with coeff=None."""

    def test_group_with_none_coeff(self):
        """group() should accept coeff=None without errors."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.kernels.ops import group

        M, K = 4, 32
        A = torch.randn(M, K, device="cuda", dtype=torch.float32)
        sorted_expert_idxs = torch.tensor([0, 1, 2, 3], device="cuda", dtype=torch.long)

        # This should not raise a TypeError
        Y = group(A, sorted_expert_idxs, coeff=None, fan_out=1)
        assert Y.shape == (M, K)

    def test_group_with_coeff(self):
        """group() should also work with actual coeff values."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.kernels.ops import group

        M, K = 4, 32
        A = torch.randn(M, K, device="cuda", dtype=torch.float32)
        sorted_expert_idxs = torch.tensor([0, 1, 2, 3], device="cuda", dtype=torch.long)
        coeff = torch.ones(M, device="cuda", dtype=torch.float32) * 0.5

        Y = group(A, sorted_expert_idxs, coeff=coeff, fan_out=1)
        assert Y.shape == (M, K)


# ============================================================================
# 6. Layer return value contracts
# ============================================================================


class TestLayerReturnValues:
    """Test that layer forward methods return the correct types."""

    def test_hf_scatter_moe_returns_single_tensor(self):
        """HFScatterMoEGatedMLP.forward should return a single tensor, not a tuple."""
        pytest.importorskip("triton")
        # Verify the forward method signature and return annotation
        import inspect

        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            HFScatterMoEGatedMLP,
        )

        sig = inspect.signature(HFScatterMoEGatedMLP.forward)
        # It's a staticmethod taking (self, layer_input)
        params = list(sig.parameters.keys())
        assert "self" in params
        assert "layer_input" in params

    def test_scatter_moe_gated_mlp_docstring_no_router_logits(self):
        """ScatterMoEGatedMLP.forward docstring should not mention router logits as return."""
        pytest.importorskip("triton")
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            ScatterMoEGatedMLP,
        )

        docstring = ScatterMoEGatedMLP.forward.__doc__
        assert docstring is not None
        # The docstring should mention output tensor but NOT router logits
        assert "Output tensor" in docstring or "output tensor" in docstring.lower()
        assert "Router logits" not in docstring, (
            "Docstring should not mention 'Router logits' in Returns section"
        )


# ============================================================================
# 7. Routing strategy detection and sigmoid routing
# ============================================================================


def _make_softmax_gate(E=4, H=16, K=2):
    """Create a mock softmax-style gate (Qwen/OLMoE)."""
    return SimpleNamespace(
        weight=torch.randn(E, H),
        top_k=K,
        num_experts=E,
        norm_topk_prob=True,
    )


def _make_sigmoid_gate_with_bias(E=16, H=16):
    """Create a mock sigmoid-style gate with e_score_correction_bias on gate."""
    return SimpleNamespace(
        weight=torch.randn(E, H),
        e_score_correction_bias=torch.zeros(E),
    )


def _make_sigmoid_moe_block(
    T=8, H=16, E=16, K=4, n_group=2, topk_group=1, bias_on_gate=True
):
    """Create a mock GLM/DeepSeek-style MoE block for sigmoid routing tests."""
    if bias_on_gate:
        gate = SimpleNamespace(
            weight=torch.randn(E, H),
            e_score_correction_bias=torch.zeros(E),
        )
        moe_block = SimpleNamespace(
            gate=gate,
            top_k=K,
            n_routed_experts=E,
            n_group=n_group,
            topk_group=topk_group,
            norm_topk_prob=True,
            routed_scaling_factor=1.0,
        )
    else:
        # minimax_m2 style: bias on block, not gate
        gate = SimpleNamespace(
            weight=torch.randn(E, H),
            top_k=K,
        )
        moe_block = SimpleNamespace(
            gate=gate,
            top_k=K,
            e_score_correction_bias=torch.zeros(E),
        )
    return moe_block, T, H, E, K


def _skip_without_triton():
    pytest.importorskip("triton")


class TestSigmoidRoutingInScatterMoE:
    """Test _sigmoid_topk_route from layers.py."""

    @pytest.fixture(autouse=True)
    def _require_triton(self):
        _skip_without_triton()

    def test_output_shapes(self):
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _sigmoid_topk_route,
        )

        moe_block, T, H, E, K = _make_sigmoid_moe_block()
        gate = moe_block.gate
        hidden = torch.randn(T, H)

        weights, experts, top_k, num_experts = _sigmoid_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )

        assert weights.shape == (T, K)
        assert experts.shape == (T, K)
        assert top_k == K
        assert num_experts == E

    def test_weights_nonnegative(self):
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _sigmoid_topk_route,
        )

        moe_block, T, H, E, K = _make_sigmoid_moe_block()
        gate = moe_block.gate
        hidden = torch.randn(T, H)

        weights, _, _, _ = _sigmoid_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )
        assert (weights >= 0).all()

    def test_group_selection_restricts_experts(self):
        """With n_group=4, topk_group=1, experts should be from selected groups."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _sigmoid_topk_route,
        )

        moe_block, T, H, E, K = _make_sigmoid_moe_block(
            E=16, K=2, n_group=4, topk_group=1
        )
        gate = moe_block.gate
        hidden = torch.randn(T, H)

        _, expert_idx, _, _ = _sigmoid_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )

        # Each token's experts should fall within a single group (size E//n_group=4)
        for t in range(T):
            experts_t = expert_idx[t]
            groups = experts_t // (E // moe_block.n_group)
            assert (groups == groups[0]).all()

    def test_scaling_factor_applied(self):
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _sigmoid_topk_route,
        )

        moe_block, T, H, E, K = _make_sigmoid_moe_block(n_group=1)
        gate = moe_block.gate
        hidden = torch.randn(T, H)

        weights_1x, _, _, _ = _sigmoid_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )

        moe_block.routed_scaling_factor = 2.0
        weights_2x, _, _, _ = _sigmoid_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )

        assert torch.allclose(weights_2x, weights_1x * 2.0, atol=1e-5)

    def test_bias_on_gate(self):
        """e_score_correction_bias on gate is found."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _sigmoid_topk_route,
        )

        moe_block, T, H, E, K = _make_sigmoid_moe_block(bias_on_gate=True)
        gate = moe_block.gate
        hidden = torch.randn(T, H)

        weights, experts, _, _ = _sigmoid_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )
        assert weights.shape == (T, K)

    def test_bias_on_block(self):
        """e_score_correction_bias on moe_block (not gate) is found."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _sigmoid_topk_route,
        )

        moe_block, T, H, E, K = _make_sigmoid_moe_block(bias_on_gate=False)
        gate = moe_block.gate
        hidden = torch.randn(T, H)

        weights, experts, _, _ = _sigmoid_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )
        assert weights.shape == (T, K)

    def test_gate_lora_delta_applied(self):
        """Gate LoRA delta should affect routing logits."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _sigmoid_topk_route,
        )

        moe_block, T, H, E, K = _make_sigmoid_moe_block(n_group=1)
        gate = moe_block.gate
        hidden = torch.randn(T, H)

        weights_no_lora, _, _, _ = _sigmoid_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )

        # Large delta should change the results
        delta = torch.randn(E, H) * 10.0
        weights_with_lora, _, _, _ = _sigmoid_topk_route(
            moe_block, gate, hidden, gate.weight, delta
        )

        assert not torch.equal(weights_no_lora, weights_with_lora)

    def test_no_bias_does_not_crash(self):
        """Calling _sigmoid_topk_route with no e_score_correction_bias should not crash."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _sigmoid_topk_route,
        )

        T, H, E, K = 8, 16, 8, 2
        gate = SimpleNamespace(weight=torch.randn(E, H))
        moe_block = SimpleNamespace(
            gate=gate,
            top_k=K,
            n_routed_experts=E,
            n_group=1,
            norm_topk_prob=True,
            routed_scaling_factor=1.0,
        )
        hidden = torch.randn(T, H)

        weights, experts, top_k, num_experts = _sigmoid_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )
        assert weights.shape == (T, K)
        assert experts.shape == (T, K)
        # Without bias, scores_for_choice == sigmoid(logits) — all positive
        assert (weights >= 0).all()

    def test_missing_topk_group_defaults_to_n_group(self):
        """When topk_group is absent but n_group > 1, should default to n_group (no-op masking)."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _sigmoid_topk_route,
        )

        T, H, E, K, n_group = 8, 16, 16, 2, 4
        gate = SimpleNamespace(
            weight=torch.randn(E, H),
            e_score_correction_bias=torch.zeros(E),
        )
        # Intentionally omit topk_group
        moe_block = SimpleNamespace(
            gate=gate,
            top_k=K,
            n_routed_experts=E,
            n_group=n_group,
            norm_topk_prob=True,
            routed_scaling_factor=1.0,
        )
        hidden = torch.randn(T, H)

        # Should not raise AttributeError; defaults topk_group to n_group
        weights, experts, top_k_out, num_experts = _sigmoid_topk_route(
            moe_block, gate, hidden, gate.weight, None
        )
        assert weights.shape == (T, K)
        assert experts.shape == (T, K)


class TestRoutingStrategyDetection:
    """Test that _route dispatches to the correct strategy."""

    @pytest.fixture(autouse=True)
    def _require_triton(self):
        _skip_without_triton()

    def test_softmax_for_qwen_style(self):
        """Block without e_score_correction_bias should use softmax."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import _route

        gate = _make_softmax_gate(E=4, H=16, K=2)
        moe_block = SimpleNamespace(gate=gate)
        hidden = torch.randn(8, 16)

        weights, experts, top_k, num_experts = _route(
            moe_block, gate, hidden, gate.weight, None
        )

        assert weights.shape == (8, 2)
        assert experts.shape == (8, 2)
        assert top_k == 2
        assert num_experts == 4
        per_token_sums = weights.sum(dim=-1)
        assert torch.allclose(per_token_sums, torch.ones(8), atol=1e-5)

    def test_sigmoid_for_glm_style(self):
        """Block with e_score_correction_bias on gate should use sigmoid."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import _route

        moe_block, T, H, E, K = _make_sigmoid_moe_block(bias_on_gate=True, n_group=1)
        gate = moe_block.gate
        hidden = torch.randn(T, H)

        weights, experts, top_k, num_experts = _route(
            moe_block, gate, hidden, gate.weight, None
        )

        assert weights.shape == (T, K)
        assert experts.shape == (T, K)
        assert (weights >= 0).all()

    def test_sigmoid_for_minimax_m2_style(self):
        """Block with e_score_correction_bias on block (not gate) should use sigmoid."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import _route

        moe_block, T, H, E, K = _make_sigmoid_moe_block(bias_on_gate=False)
        gate = moe_block.gate
        hidden = torch.randn(T, H)

        weights, experts, top_k, num_experts = _route(
            moe_block, gate, hidden, gate.weight, None
        )

        assert weights.shape == (T, K)
        assert (weights >= 0).all()


# ============================================================================
# 8. Generic shared expert handling
# ============================================================================


class TestGenericSharedExpert:
    """Test _compute_shared_expert from layers.py."""

    @pytest.fixture(autouse=True)
    def _require_triton(self):
        _skip_without_triton()

    def test_shared_expert_singular(self):
        """shared_expert attribute (Qwen2MoE style)."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _compute_shared_expert,
        )

        called = torch.randn(4, 8)
        moe_block = SimpleNamespace(
            shared_expert=lambda x: called,
        )
        result = _compute_shared_expert(moe_block, torch.randn(4, 8))
        assert torch.equal(result, called)

    def test_shared_experts_plural(self):
        """shared_experts attribute (DeepSeek V3 style)."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _compute_shared_expert,
        )

        called = torch.randn(4, 8)
        moe_block = SimpleNamespace(
            shared_experts=lambda x: called,
        )
        result = _compute_shared_expert(moe_block, torch.randn(4, 8))
        assert torch.equal(result, called)

    def test_shared_mlp(self):
        """shared_mlp attribute (Hunyuan style)."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _compute_shared_expert,
        )

        called = torch.randn(4, 8)
        moe_block = SimpleNamespace(
            shared_mlp=lambda x: called,
        )
        result = _compute_shared_expert(moe_block, torch.randn(4, 8))
        assert torch.equal(result, called)

    def test_shared_expert_with_gate(self):
        """shared_expert + shared_expert_gate applies sigmoid gating."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _compute_shared_expert,
        )

        H = 8
        expert_out = torch.ones(4, H)
        gate_fn = lambda x: torch.zeros(4, H)  # noqa: E731

        moe_block = SimpleNamespace(
            shared_expert=lambda x: expert_out,
            shared_expert_gate=gate_fn,
        )
        result = _compute_shared_expert(moe_block, torch.randn(4, H))
        expected = expert_out * 0.5  # sigmoid(0) = 0.5
        assert torch.allclose(result, expected, atol=1e-6)

    def test_no_shared_expert(self):
        """No shared expert attributes returns None."""
        from axolotl.integrations.kernels.libs.scattermoe_lora.layers import (
            _compute_shared_expert,
        )

        moe_block = SimpleNamespace()
        result = _compute_shared_expert(moe_block, torch.randn(4, 8))
        assert result is None


================================================
FILE: tests/integrations/test_scattermoe_lora_kernels.py
================================================
# SPDX-License-Identifier: Apache-2.0
# Copyright (c) Axolotl AI
# Licensed under the Apache License, Version 2.0

"""
Unit tests for ScatterMoE LoRA Triton kernels.

Tests correctness of:
  - scatter2scatter_lora (forward)
  - scatter2scatter_lora_dX (backward input gradient)
  - group_bwd_lora (backward LoRA weight gradients via split dA/dB)
  - ScatterMoELoRA autograd function (full forward + backward)

Each kernel is tested against a pure PyTorch per-expert-loop reference
implementation at multiple model shapes and LoRA ranks.
"""

import pytest
import torch

from axolotl.integrations.kernels.libs.scattermoe_lora.kernels import (
    lora_ops,
    ops as base_ops,
)
from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_experts import (
    flatten_sort_count,
)
from axolotl.integrations.kernels.libs.scattermoe_lora.parallel_linear_lora import (
    ScatterMoELoRA,
)

DEVICE = "cuda"
DTYPE = torch.bfloat16


def _requires_cuda():
    return pytest.mark.skipif(
        not torch.cuda.is_available(), reason="CUDA not available"
    )


pytestmark = _requires_cuda()


# ─── Helpers ─────────────────────────────────────────────────────────────────


def _setup(E, K, N, T, top_k, R, seed=42):
    """Create synthetic expert weights, LoRA, routing, and grouped inputs."""
    torch.manual_seed(seed)
    x = torch.randn(T, K, device=DEVICE, dtype=DTYPE)
    W = torch.randn(E, K, N, device=DEVICE, dtype=DTYPE) * 0.02
    lora_A = torch.randn(R * E, K, device=DEVICE, dtype=DTYPE) * 0.01
    lora_B = torch.randn(N, R * E, device=DEVICE, dtype=DTYPE) * 0.01
    logits = torch.randn(T, E, device=DEVICE)
    _, top_idx = torch.topk(torch.softmax(logits, dim=-1), top_k, dim=-1)
    sei, ssi, eo = flatten_sort_count(top_idx, E)
    return x, W, lora_A, lora_B, sei, ssi, eo


def _reference_fwd(x, W, sei, ssi, eo, k, lora_A, lora_B, scaling, E):
    """Per-expert loop reference: Y = X@W + scaling*(X@A^T)@B^T."""
    grouped_x = base_ops.group(x, ssi, fan_out=k)
    M, N = grouped_x.size(0), W.size(2)
    R = lora_A.size(0) // E
    out = torch.zeros(M, N, device=DEVICE, dtype=DTYPE)
    for e in range(E):
        s = eo[e - 1].item() if e > 0 else 0
        end = eo[e].item()
        if s == end:
            continue
        xe = grouped_x[s:end].float()
        we = W[e].float()
        ae = lora_A[e * R : (e + 1) * R].float()
        be = lora_B[:, e * R : (e + 1) * R].float()
        out[s:end] = (xe @ we + scaling * (xe @ ae.T) @ be.T).to(DTYPE)
    result = torch.zeros(M, N, device=DEVICE, dtype=DTYPE)
    result[ssi] = out
    return result


def _reference_dX(dy_grouped, W, sei, ssi, eo, lora_A, lora_B, scaling, E):
    """Per-expert loop reference: dX = dY@W^T + scaling*(dY@B)@A."""
    M, K = dy_grouped.size(0), W.size(1)
    R = lora_A.size(0) // E
    out = torch.zeros(M, K, device=DEVICE, dtype=DTYPE)
    for e in range(E):
        s = eo[e - 1].item() if e > 0 else 0
        end = eo[e].item()
        if s == end:
            continue
        dye = dy_grouped[s:end].float()
        we = W[e].float()
        ae = lora_A[e * R : (e + 1) * R].float()
        be = lora_B[:, e * R : (e + 1) * R].float()
        out[s:end] = (dye @ we.T + scaling * (dye @ be) @ ae).to(DTYPE)
    result = torch.zeros(M, K, device=DEVICE, dtype=DTYPE)
    result[ssi] = out
    return result


def _reference_bwd_lora(dy, grouped_x, lora_A, lora_B, eo, E, scaling):
    """Per-expert loop reference: dA, dB for LoRA weight gradients."""
    R = lora_A.size(0) // E
    dA = torch.zeros_like(lora_A)
    dB = torch.zeros_like(lora_B)
    for e in range(E):
        s = eo[e - 1].item() if e > 0 else 0
        end = eo[e].item()
        if s == end:
            continue
        xe = grouped_x[s:end].float()
        dye = dy[s:end].float()
        ae = lora_A[e * R : (e + 1) * R].float()
        be = lora_B[:, e * R : (e + 1) * R].float()
        dA[e * R : (e + 1) * R] = (scaling * (dye @ be).T @ xe).to(DTYPE)
        dB[:, e * R : (e + 1) * R] = (scaling * dye.T @ (xe @ ae.T)).to(DTYPE)
    return dA, dB


# ─── Model shape configs ────────────────────────────────────────────────────

# (E, K, N, T, top_k, R, description)
CONFIGS_SMALL = [
    (32, 128, 64, 64, 2, 4, "tiny"),
    (64, 256, 128, 128, 4, 8, "small"),
]

CONFIGS_REAL = [
    (256, 2048, 1024, 2048, 8, 16, "qwen35_gate_up"),
    (256, 512, 2048, 2048, 8, 16, "qwen35_down"),
    (64, 2048, 2048, 2048, 8, 16, "olmoe_gate_up"),
    (128, 2048, 1536, 2048, 8, 16, "qwen3_gate_up"),
]

SCALING = 2.0


# ─── Forward tests ──────────────────────────────────────────────────────────


class TestScatter2ScatterLoRAForward:
    """Test scatter2scatter_lora forward kernel vs reference."""

    @pytest.fixture(params=CONFIGS_SMALL + CONFIGS_REAL)
    def config(self, request):
        return request.param

    def test_matches_reference(self, config):
        E, K, N, T, k, R, desc = config
        x, W, lA, lB, sei, ssi, eo = _setup(E, K, N, T, k, R)

        kernel_out = lora_ops.scatter2scatter_lora(
            X=x,
            W=W,
            sorted_expert_idxs=sei,
            sorted_scattered_idxs=ssi,
            k=k,
            lora_A=lA,
            lora_B=lB,
            scaling=SCALING,
        )
        ref_out = _reference_fwd(x, W, sei, ssi, eo, k, lA, lB, SCALING, E)

        err = (kernel_out.float() - ref_out.float()).abs().max().item()
        assert err < 1.0, f"[{desc}] fwd max_err={err}"

    def test_output_shape(self, config):
        E, K, N, T, k, R, desc = config
        x, W, lA, lB, sei, ssi, eo = _setup(E, K, N, T, k, R)

        out = lora_ops.scatter2scatter_lora(
            X=x,
            W=W,
            sorted_expert_idxs=sei,
            sorted_scattered_idxs=ssi,
            k=k,
            lora_A=lA,
            lora_B=lB,
            scaling=SCALING,
        )
        assert out.shape == (T * k, N)
        assert out.dtype == DTYPE


# ─── Backward dX tests ──────────────────────────────────────────────────────


class TestScatter2ScatterLoRADX:
    """Test scatter2scatter_lora_dX backward kernel vs reference."""

    @pytest.fixture(params=CONFIGS_SMALL + CONFIGS_REAL)
    def config(self, request):
        return request.param

    def test_matches_reference(self, config):
        E, K, N, T, k, R, desc = config
        x, W, lA, lB, sei, ssi, eo = _setup(E, K, N, T, k, R)
        gx = base_ops.group(x, ssi, fan_out=k)
        dy = torch.randn(gx.size(0), N, device=DEVICE, dtype=DTYPE)

        kernel_dx = lora_ops.scatter2scatter_lora_dX(
            DY=dy,
            W=W,
            sorted_expert_idxs=sei,
            sorted_scattered_idxs=ssi,
            k=1,
            lora_A=lA,
            lora_B=lB,
            scaling=SCALING,
            dy_grouped=True,
            dx_grouped=False,
        )
        ref_dx = _reference_dX(dy, W, sei, ssi, eo, lA, lB, SCALING, E)

        err = (kernel_dx.float() - ref_dx.float()).abs().max().item()
        assert err < 1.0, f"[{desc}] dX max_err={err}"


# ─── Backward LoRA gradient tests ───────────────────────────────────────────


class TestGroupBwdLoRA:
    """Test group_bwd_lora (split dA/dB kernel) vs reference."""

    @pytest.fixture(params=CONFIGS_SMALL + CONFIGS_REAL)
    def config(self, request):
        return request.param

    def test_matches_reference(self, config):
        E, K, N, T, k, R, desc = config
        x, W, lA, lB, sei, ssi, eo = _setup(E, K, N, T, k, R)
        gx = base_ops.group(x, ssi, fan_out=k)
        dy = torch.randn(gx.size(0), N, device=DEVICE, dtype=DTYPE)

        kern_dA, kern_dB = lora_ops.group_bwd_lora(
            DY=dy,
            X=gx,
            lora_A=lA,
            lora_B=lB,
            expert_offsets=eo,
            E=E,
            scaling=SCALING,
        )
        ref_dA, ref_dB = _reference_bwd_lora(dy, gx, lA, lB, eo, E, SCALING)

        # Use norm-relative error: bf16 accumulation order differs between
        # kernel (tiled + different reduction order) and reference (per-expert
        # fp32 loop), so max absolute error can be large on individual elements
        # while the overall tensor is correct.
        dA_norm_err = (
            (kern_dA.float() - ref_dA.float()).norm() / (ref_dA.float().norm() + 1e-6)
        ).item()
        dB_norm_err = (
            (kern_dB.float() - ref_dB.float()).norm() / (ref_dB.float().norm() + 1e-6)
        ).item()
        assert dA_norm_err < 0.01, f"[{desc}] dA norm_rel_err={dA_norm_err}"
        assert dB_norm_err < 0.01, f"[{desc}] dB norm_rel_err={dB_norm_err}"

    def test_zero_expert_tokens(self):
        """Experts with zero routed tokens produce zero gradients."""
        E, K, N, R = 8, 64, 32, 4
        torch.manual_seed(42)
        # Route all tokens to expert 0 only
        T, k = 16, 1
        top_idx = torch.zeros(T, k, dtype=torch.long, device=DEVICE)
        sei, ssi, eo = flatten_sort_count(top_idx, E)
        gx = torch.randn(T, K, device=DEVICE, dtype=DTYPE)
        dy = torch.randn(T, N, device=DEVICE, dtype=DTYPE)
        lA = torch.randn(R * E, K, device=DEVICE, dtype=DTYPE)
        lB = torch.randn(N, R * E, device=DEVICE, dtype=DTYPE)

        dA, dB = lora_ops.group_bwd_lora(
            DY=dy,
            X=gx,
            lora_A=lA,
            lora_B=lB,
            expert_offsets=eo,
            E=E,
            scaling=2.0,
        )

        # Experts 1..7 should have zero gradients
        for e in range(1, E):
            assert dA[e * R : (e + 1) * R].abs().max() == 0, f"Expert {e} dA not zero"
            assert dB[:, e * R : (e + 1) * R].abs().max() == 0, (
                f"Expert {e} dB not zero"
            )


# ─── Full autograd tests ────────────────────────────────────────────────────


class TestScatterMoELoRAAutograd:
    """Test full forward + backward through ScatterMoELoRA autograd function."""

    @pytest.fixture(params=CONFIGS_SMALL + CONFIGS_REAL[:2])
    def config(self, request):
        return request.param

    def test_gradients_exist_and_finite(self, config):
        E, K, N, T, k, R, desc = config
        x, W, lA, lB, sei, ssi, eo = _setup(E, K, N, T, k, R)

        x = x.requires_grad_(True)
        lA = lA.requires_grad_(True)
        lB = lB.requires_grad_(True)

        out = ScatterMoELoRA.apply(
            x,
            W,
            k,
            sei,
            ssi,
            eo,
            lA,
            lB,
            SCALING,
            None,
            None,
            False,
            False,
            True,
            False,
        )
        out.sum().backward()

        assert x.grad is not None, f"[{desc}] x.grad is None"
        assert lA.grad is not None, f"[{desc}] lA.grad is None"
        assert lB.grad is not None, f"[{desc}] lB.grad is None"
        assert torch.isfinite(x.grad).all(), f"[{desc}] x.grad has non-finite"
        assert torch.isfinite(lA.grad).all(), f"[{desc}] lA.grad has non-finite"
        assert torch.isfinite(lB.grad).all(), f"[{desc}] lB.grad has non-finite"
        assert x.grad.abs().sum() > 0, f"[{desc}] x.grad all zero"
        assert lA.grad.abs().sum() > 0, f"[{desc}] lA.grad all zero"

    def test_split_matches_fused(self):
        """Split dispatch (for few large experts) matches fused kernel."""
        # Use a shape where split would be dispatched (large K*N, few E)
        E, K, N, T, k, R = 8, 512, 1024, 128, 2, 16
        x, W, lA, lB, sei, ssi, eo = _setup(E, K, N, T, k, R)

        # Force fused path
        orig = lora_ops._SPLIT_LORA_FWD_THRESHOLD
        lora_ops._SPLIT_LORA_FWD_THRESHOLD = 10**18
        out_fused = lora_ops.scatter2scatter_lora(
            X=x,
            W=W,
            sorted_expert_idxs=sei,
            sorted_scattered_idxs=ssi,
            k=k,
            lora_A=lA,
            lora_B=lB,
            scaling=SCALING,
        )

        # Force split path
        lora_ops._SPLIT_LORA_FWD_THRESHOLD = 0
        out_split = lora_ops.scatter2scatter_lora(
            X=x,
            W=W,
            sorted_expert_idxs=sei,
            sorted_scattered_idxs=ssi,
            k=k,
            lora_A=lA,
            lora_B=lB,
            scaling=SCALING,
        )
        lora_ops._SPLIT_LORA_FWD_THRESHOLD = orig

        norm_err = (
            (out_fused.float() - out_split.float()).norm()
            / (out_fused.float().norm() + 1e-6)
        ).item()
        assert norm_err < 0.01, f"split vs fused norm_err={norm_err}"

    def test_scaling_zero_gives_base_only(self):
        """With scaling=0.0, LoRA contribution vanishes. Output = X@W."""
        E, K, N, T, k, R = 16, 64, 32, 32, 2, 4
        x, W, lA, lB, sei, ssi, eo = _setup(E, K, N, T, k, R)

        out_lora = ScatterMoELoRA.apply(
            x,
            W,
            k,
            sei,
            ssi,
            eo,
            lA,
            lB,
            0.0,
            None,
            None,
            False,
            False,
            True,
            False,
        )
        out_base = base_ops.scatter2scatter(
            X=x,
            W=W,
            sorted_expert_idxs=sei,
            sorted_scattered_idxs=ssi,
            k=k,
        )
        err = (out_lora.float() - out_base.float()).abs().max().item()
        assert err < 0.01, f"scaling=0 should match base: err={err}"


================================================
FILE: tests/integrations/test_sonicmoe.py
================================================
"""Unit tests for the SonicMoE integration."""

from types import SimpleNamespace

import pytest
import torch

from axolotl.integrations.kernels.args import KernelsArgs
from axolotl.integrations.kernels.sonicmoe.routing import (
    sigmoid_topk_routing,
    softmax_topk_routing,
)
from axolotl.integrations.kernels.sonicmoe.weight_converter import (
    ConcatenatedToInterleaved,
    InterleavedToConcatenated,
    register_sonicmoe_weight_converter,
)


class TestKernelsArgs:
    def test_mutual_exclusivity_raises(self):
        with pytest.raises(ValueError, match="Cannot use both"):
            KernelsArgs.model_validate({"use_scattermoe": True, "use_sonicmoe": True})

    def test_sonicmoe_only(self):
        result = KernelsArgs.model_validate({"use_sonicmoe": True})
        assert result.use_sonicmoe is True
        assert result.use_scattermoe is None

    def test_scattermoe_only(self):
        result = KernelsArgs.model_validate({"use_scattermoe": True})
        assert result.use_scattermoe is True
        assert result.use_sonicmoe is None

    def test_neither_set(self):
        result = KernelsArgs.model_validate({})
        assert result.use_scattermoe is None
        assert result.use_sonicmoe is None

    def test_disables_mlp_kernel_when_sonicmoe(self):
        data = {"use_sonicmoe": True, "lora_mlp_kernel": True}
        result = KernelsArgs.disable_mlp_kernel(data)
        assert result["lora_mlp_kernel"] is False
        assert result["mlp_kernel"] is False


class TestConcatenatedToInterleaved:
    @pytest.fixture
    def sample_tensor(self):
        """Create a test tensor [E=2, 2*I=4, H=3] with distinct gate/up values."""
        E, I, H = 2, 2, 3  # noqa: E741
        gate = torch.arange(1, E * I * H + 1, dtype=torch.float32).reshape(E, I, H)
        up = torch.arange(100, 100 + E * I * H, dtype=torch.float32).reshape(E, I, H)
        return torch.cat([gate, up], dim=1)

    def test_interleave_rows_alternate(self, sample_tensor):
        op = ConcatenatedToInterleaved(dim=1)
        result = op.convert(
            {"test": sample_tensor},
            source_patterns=["test"],
            target_patterns=["test"],
        )
        interleaved = result["test"]

        # For expert 0: even rows should be gate, odd rows should be up
        E, two_I, H = sample_tensor.shape
        I = two_I // 2  # noqa: E741
        gate_orig = sample_tensor[:, :I, :]
        up_orig = sample_tensor[:, I:, :]

        assert torch.equal(interleaved[:, 0::2, :], gate_orig)
        assert torch.equal(interleaved[:, 1::2, :], up_orig)

    def test_interleave_handles_list_input(self, sample_tensor):
        op = ConcatenatedToInterleaved(dim=1)
        result = op.convert(
            {"test": [sample_tensor]},
            source_patterns=["test"],
            target_patterns=["test"],
        )
        assert result["test"].shape == sample_tensor.shape

    def test_reverse_op_type(self):
        op = ConcatenatedToInterleaved(dim=1)
        assert isinstance(op.reverse_op, InterleavedToConcatenated)
        assert op.reverse_op.dim == 1


class TestInterleavedToConcatenated:
    @pytest.fixture
    def interleaved_tensor(self):
        """Create an interleaved tensor [E=2, 2*I=4, H=3]."""
        E, I, H = 2, 2, 3  # noqa: E741
        gate = torch.arange(1, E * I * H + 1, dtype=torch.float32).reshape(E, I, H)
        up = torch.arange(100, 100 + E * I * H, dtype=torch.float32).reshape(E, I, H)
        interleaved = torch.empty(E, 2 * I, H)
        interleaved[:, 0::2, :] = gate
        interleaved[:, 1::2, :] = up
        return interleaved

    def test_deinterleave_gate_up_separated(self, interleaved_tensor):
        op = InterleavedToConcatenated(dim=1)
        result = op.convert(
            {"test": interleaved_tensor},
            source_patterns=["test"],
            target_patterns=["test"],
        )
        concatenated = result["test"]

        E, two_I, H = concatenated.shape
        I = two_I // 2  # noqa: E741

        # First half should be gate (even rows from interleaved)
        assert torch.equal(concatenated[:, :I, :], interleaved_tensor[:, 0::2, :])
        # Second half should be up (odd rows from interleaved)
        assert torch.equal(concatenated[:, I:, :], interleaved_tensor[:, 1::2, :])

    def test_reverse_op_type(self):
        op = InterleavedToConcatenated(dim=1)
        assert isinstance(op.reverse_op, ConcatenatedToInterleaved)
        assert op.reverse_op.dim == 1


class TestRoundTrip:
    @pytest.fixture
    def concat_tensor(self):
        E, I, H = 4, 8, 16  # noqa: E741
        gate = torch.randn(E, I, H)
        up = torch.randn(E, I, H)
        return torch.cat([gate, up], dim=1)

    def test_interleave_then_deinterleave_is_identity(self, concat_tensor):
        fwd = ConcatenatedToInterleaved(dim=1)
        rev = InterleavedToConcatenated(dim=1)

        interleaved = fwd.convert(
            {"k": concat_tensor}, source_patterns=["k"], target_patterns=["k"]
        )["k"]
        recovered = rev.convert(
            {"k": interleaved}, source_patterns=["k"], target_patterns=["k"]
        )["k"]

        assert torch.equal(concat_tensor, recovered)

    def test_reverse_op_chain_is_identity(self, concat_tensor):
        """Verify that op.reverse_op produces an exact inverse."""
        op = ConcatenatedToInterleaved(dim=1)
        rev = op.reverse_op

        interleaved = op.convert(
            {"k": concat_tensor}, source_patterns=["k"], target_patterns=["k"]
        )["k"]
        recovered = rev.convert(
            {"k": interleaved}, source_patterns=["k"], target_patterns=["k"]
        )["k"]

        assert torch.equal(concat_tensor, recovered)

    def test_various_shapes(self):
        """Test with different expert counts and dimensions."""
        fwd = ConcatenatedToInterleaved(dim=1)
        rev = InterleavedToConcatenated(dim=1)

        for E, I, H in [(1, 4, 8), (8, 16, 32), (16, 128, 256)]:  # noqa: E741
            concat = torch.randn(E, 2 * I, H)
            interleaved = fwd.convert(
                {"k": concat}, source_patterns=["k"], target_patterns=["k"]
            )["k"]
            recovered = rev.convert(
                {"k": interleaved}, source_patterns=["k"], target_patterns=["k"]
            )["k"]
            assert torch.equal(concat, recovered), (
                f"Failed for shape ({E}, {2 * I}, {H})"
            )


class TestWeightConverterRegistration:
    def test_register_appends_interleave_op(self):
        from transformers.conversion_mapping import get_checkpoint_conversion_mapping

        register_sonicmoe_weight_converter("qwen3_moe")

        modified = get_checkpoint_conversion_mapping("qwen3_moe")
        # Find the gate_up_proj converter
        gate_up_converter = None
        for conv in modified:
            if hasattr(conv, "operations") and any(
                "gate_up_proj" in pat for pat in conv.target_patterns
            ):
                gate_up_converter = conv
                break

        assert gate_up_converter is not None
        assert isinstance(gate_up_converter.operations[-1], ConcatenatedToInterleaved)

    def test_double_registration_is_idempotent(self):
        from transformers.conversion_mapping import get_checkpoint_conversion_mapping

        register_sonicmoe_weight_converter("qwen3_moe")
        register_sonicmoe_weight_converter("qwen3_moe")

        modified = get_checkpoint_conversion_mapping("qwen3_moe")
        for conv in modified:
            if hasattr(conv, "operations") and any(
                "gate_up_proj" in pat for pat in conv.target_patterns
            ):
                interleave_count = sum(
                    isinstance(op, ConcatenatedToInterleaved) for op in conv.operations
                )
                assert interleave_count == 1, (
                    f"Expected 1 ConcatenatedToInterleaved op, got {interleave_count}"
                )
                break

    def test_register_unsupported_model_type_warns(self):
        # A model type with no conversion mapping should warn but not raise
        register_sonicmoe_weight_converter("nonexistent_model_type_xyz")


def _make_qwen_moe_block(T=8, H=16, E=4, K=2):
    """Create a mock qwen-style MoE block for routing tests."""
    gate = SimpleNamespace(
        weight=torch.randn(E, H),
        top_k=K,
        num_experts=E,
        norm_topk_prob=True,
    )
    return SimpleNamespace(gate=gate), T, H, E, K


def _make_glm_moe_block(T=8, H=16, E=16, K=4, n_group=2, topk_group=1):
    """Create a mock GLM5-style MoE block for routing tests."""
    gate = SimpleNamespace(
        weight=torch.randn(E, H),
        e_score_correction_bias=torch.zeros(E),
    )
    moe_block = SimpleNamespace(
        gate=gate,
        top_k=K,
        n_routed_experts=E,
        n_group=n_group,
        topk_group=topk_group,
        norm_topk_prob=True,
        routed_scaling_factor=1.0,
    )
    return moe_block, T, H, E, K


def _make_minimax_m2_moe_block(T=8, H=16, E=16, K=4):
    """Create a mock minimax_m2-style MoE block for routing tests.

    minimax_m2 uses sigmoid->topk WITHOUT group selection:
    - e_score_correction_bias is on the moe_block (not on gate)
    - No n_group / topk_group attributes
    - Always normalizes (norm_topk_prob defaults to True)
    - No routed_scaling_factor (defaults to 1.0)
    """
    gate = SimpleNamespace(
        weight=torch.randn(E, H),
        top_k=K,
    )
    moe_block = SimpleNamespace(
        gate=gate,
        top_k=K,
        e_score_correction_bias=torch.zeros(E),
    )
    return moe_block, T, H, E, K


class TestSoftmaxTopkRouting:
    def test_output_shapes(self):
        moe_block, T, H, E, K = _make_qwen_moe_block()
        hidden = torch.randn(T, H)

        scores, token_idx, expert_idx, logits = softmax_topk_routing(hidden, moe_block)

        assert scores.shape == (T * K,)
        assert token_idx.shape == (T * K,)
        assert expert_idx.shape == (T * K,)
        assert logits.shape == (T, E)

    def test_scores_are_float32(self):
        moe_block, T, H, E, K = _make_qwen_moe_block()
        hidden = torch.randn(T, H)

        scores, _, _, _ = softmax_topk_routing(hidden, moe_block)
        assert scores.dtype == torch.float32

    def test_token_indices_sorted_ascending(self):
        moe_block, T, H, E, K = _make_qwen_moe_block()
        hidden = torch.randn(T, H)

        _, token_idx, _, _ = softmax_topk_routing(hidden, moe_block)

        # Token indices must be sorted ascending (SonicMoE requirement)
        diffs = token_idx[1:] - token_idx[:-1]
        assert (diffs >= 0).all()

    def test_expert_indices_in_range(self):
        moe_block, T, H, E, K = _make_qwen_moe_block()
        hidden = torch.randn(T, H)

        _, _, expert_idx, _ = softmax_topk_routing(hidden, moe_block)

        assert (expert_idx >= 0).all()
        assert (expert_idx < E).all()

    def test_renormalized_scores_sum_to_one(self):
        moe_block, T, H, E, K = _make_qwen_moe_block()
        hidden = torch.randn(T, H)

        scores, _, _, _ = softmax_topk_routing(hidden, moe_block)
        per_token_sums = scores.reshape(T, K).sum(dim=-1)
        assert torch.allclose(per_token_sums, torch.ones(T), atol=1e-5)


class TestSigmoidTopkRouting:
    def test_output_shapes(self):
        moe_block, T, H, E, K = _make_glm_moe_block()
        hidden = torch.randn(T, H)

        scores, token_idx, expert_idx, logits = sigmoid_topk_routing(hidden, moe_block)

        assert scores.shape == (T * K,)
        assert token_idx.shape == (T * K,)
        assert expert_idx.shape == (T * K,)
        assert logits.shape == (T, E)

    def test_scores_are_float32(self):
        moe_block, T, H, E, K = _make_glm_moe_block()
        hidden = torch.randn(T, H)

        scores, _, _, _ = sigmoid_topk_routing(hidden, moe_block)
        assert scores.dtype == torch.float32

    def test_token_indices_sorted_ascending(self):
        moe_block, T, H, E, K = _make_glm_moe_block()
        hidden = torch.randn(T, H)

        _, token_idx, _, _ = sigmoid_topk_routing(hidden, moe_block)

        diffs = token_idx[1:] - token_idx[:-1]
        assert (diffs >= 0).all()

    def test_expert_indices_in_range(self):
        moe_block, T, H, E, K = _make_glm_moe_block()
        hidden = torch.randn(T, H)

        _, _, expert_idx, _ = sigmoid_topk_routing(hidden, moe_block)

        assert (expert_idx >= 0).all()
        assert (expert_idx < E).all()

    def test_scores_are_nonnegative(self):
        """Sigmoid outputs are in [0, 1], so scores should be non-negative."""
        moe_block, T, H, E, K = _make_glm_moe_block()
        hidden = torch.randn(T, H)

        scores, _, _, _ = sigmoid_topk_routing(hidden, moe_block)
        assert (scores >= 0).all()

    def test_scaling_factor_applied(self):
        moe_block, T, H, E, K = _make_glm_moe_block()
        hidden = torch.randn(T, H)

        # Get scores with scaling_factor=1.0
        scores_1x, _, _, _ = sigmoid_topk_routing(hidden, moe_block)

        # Get scores with scaling_factor=2.0
        moe_block.routed_scaling_factor = 2.0
        scores_2x, _, _, _ = sigmoid_topk_routing(hidden, moe_block)

        assert torch.allclose(scores_2x, scores_1x * 2.0, atol=1e-5)

    def test_group_selection_restricts_experts(self):
        """With n_group=4 and topk_group=1, only 1/4 of experts should be selectable."""
        moe_block, T, H, E, K = _make_glm_moe_block(E=16, K=2, n_group=4, topk_group=1)
        hidden = torch.randn(T, H)

        _, _, expert_idx, _ = sigmoid_topk_routing(hidden, moe_block)

        # Each token's experts should all fall within a single group (size E//n_group=4)
        expert_idx_2d = expert_idx.reshape(T, K)
        for t in range(T):
            experts = expert_idx_2d[t]
            groups = experts // (E // moe_block.n_group)
            # All selected experts should be from the same group
            assert (groups == groups[0]).all()


class TestMiniMaxM2SigmoidRouting:
    """Tests for minimax_m2 routing: sigmoid->topk without group selection."""

    def test_output_shapes(self):
        """Validates getattr defaults work: n_group=1, E from gate.weight.shape[0]."""
        moe_block, T, H, E, K = _make_minimax_m2_moe_block()
        hidden = torch.randn(T, H)

        scores, token_idx, expert_idx, logits = sigmoid_topk_routing(hidden, moe_block)

        assert scores.shape == (T * K,)
        assert token_idx.shape == (T * K,)
        assert expert_idx.shape == (T * K,)
        assert logits.shape == (T, E)

    def test_bias_on_block_not_gate(self):
        """Verify that e_score_correction_bias on the block (not gate) is used."""
        T, H, E, K = 8, 16, 8, 2
        gate = SimpleNamespace(
            weight=torch.randn(E, H),
            top_k=K,
        )
        # Large positive bias on expert 0 should make it selected more often
        bias = torch.zeros(E)
        bias[0] = 100.0
        moe_block = SimpleNamespace(
            gate=gate,
            top_k=K,
            e_score_correction_bias=bias,
        )
        hidden = torch.randn(T, H)

        _, _, expert_idx, _ = sigmoid_topk_routing(hidden, moe_block)

        # Expert 0 should appear for every token due to the large bias
        expert_idx_2d = expert_idx.reshape(T, K)
        for t in range(T):
            assert 0 in expert_idx_2d[t]


================================================
FILE: tests/integrations/test_sonicmoe_gradients.py
================================================
"""
Gradient correctness tests for SonicMoE routing functions (CPU-only).

Uses torch.autograd.gradcheck with float32 inputs to match the production
code path where routing happens in float32.
"""

import torch

from axolotl.integrations.kernels.sonicmoe.routing import (
    sigmoid_topk_routing,
    softmax_topk_routing,
)

_GC_EPS = 1e-3
_GC_ATOL = 1e-3
_GC_RTOL = 1e-3


def _make_softmax_moe_block(weight):
    gate = torch.nn.Module()
    gate.weight = weight
    gate.top_k = 2
    gate.norm_topk_prob = True

    moe_block = torch.nn.Module()
    moe_block.gate = gate
    return moe_block


def _make_sigmoid_moe_block(weight, bias):
    gate = torch.nn.Module()
    gate.weight = weight
    gate.e_score_correction_bias = bias

    moe_block = torch.nn.Module()
    moe_block.gate = gate
    moe_block.top_k = 2
    moe_block.n_routed_experts = weight.shape[0]
    moe_block.n_group = 1
    moe_block.norm_topk_prob = True
    moe_block.routed_scaling_factor = 1.0
    return moe_block


class TestSoftmaxTopkRoutingGradcheck:
    """Numerical gradient verification for softmax_topk_routing."""

    def test_gradcheck_wrt_gate_weight(self):
        T, H, E = 4, 8, 4

        hidden = torch.randn(T, H, dtype=torch.float32)

        def fn(weight):
            moe_block = _make_softmax_moe_block(weight)
            scores, _, _, _ = softmax_topk_routing(hidden, moe_block)
            return scores

        weight = torch.randn(E, H, dtype=torch.float32, requires_grad=True)
        torch.autograd.gradcheck(
            fn, (weight,), eps=_GC_EPS, atol=_GC_ATOL, rtol=_GC_RTOL
        )

    def test_gradcheck_wrt_hidden_states(self):
        T, H, E = 4, 8, 4

        weight = torch.randn(E, H, dtype=torch.float32)
        moe_block = _make_softmax_moe_block(weight)

        def fn(hidden):
            scores, _, _, _ = softmax_topk_routing(hidden, moe_block)
            return scores

        hidden = torch.randn(T, H, dtype=torch.float32, requires_grad=True)
        torch.autograd.gradcheck(
            fn, (hidden,), eps=_GC_EPS, atol=_GC_ATOL, rtol=_GC_RTOL
        )

    def test_gradcheck_wrt_router_logits(self):
        T, H, E = 4, 8, 4

        hidden = torch.randn(T, H, dtype=torch.float32)

        def fn(weight):
            moe_block = _make_softmax_moe_block(weight)
            _, _, _, router_logits = softmax_topk_routing(hidden, moe_block)
            return router_logits

        weight = torch.randn(E, H, dtype=torch.float32, requires_grad=True)
        torch.autograd.gradcheck(
            fn, (weight,), eps=_GC_EPS, atol=_GC_ATOL, rtol=_GC_RTOL
        )

    def test_no_norm_variant(self):
        T, H, E = 4, 8, 4

        hidden = torch.randn(T, H, dtype=torch.float32)

        def fn(weight):
            moe_block = _make_softmax_moe_block(weight)
            moe_block.gate.norm_topk_prob = False
            scores, _, _, _ = softmax_topk_routing(hidden, moe_block)
            return scores

        weight = torch.randn(E, H, dtype=torch.float32, requires_grad=True)
        torch.autograd.gradcheck(
            fn, (weight,), eps=_GC_EPS, atol=_GC_ATOL, rtol=_GC_RTOL
        )


class TestSigmoidTopkRoutingGradcheck:
    """Numerical gradient verification for sigmoid_topk_routing."""

    def test_gradcheck_wrt_gate_weight(self):
        T, H, E = 4, 8, 4

        hidden = torch.randn(T, H, dtype=torch.float32)
        bias = torch.zeros(E, dtype=torch.float32)

        def fn(weight):
            moe_block = _make_sigmoid_moe_block(weight, bias)
            scores, _, _, _ = sigmoid_topk_routing(hidden, moe_block)
            return scores

        weight = torch.randn(E, H, dtype=torch.float32, requires_grad=True)
        torch.autograd.gradcheck(
            fn, (weight,), eps=_GC_EPS, atol=_GC_ATOL, rtol=_GC_RTOL
        )

    def test_gradcheck_wrt_hidden_states(self):
        T, H, E = 4, 8, 4

        weight = torch.randn(E, H, dtype=torch.float32)
        bias = torch.zeros(E, dtype=torch.float32)
        moe_block = _make_sigmoid_moe_block(weight, bias)

        def fn(hidden):
            scores, _, _, _ = sigmoid_topk_routing(hidden, moe_block)
            return scores

        hidden = torch.randn(T, H, dtype=torch.float32, requires_grad=True)
        torch.autograd.gradcheck(
            fn, (hidden,), eps=_GC_EPS, atol=_GC_ATOL, rtol=_GC_RTOL
        )

    def test_gradcheck_wrt_bias(self):
        T, H, E = 4, 8, 4

        hidden = torch.randn(T, H, dtype=torch.float32)
        weight = torch.randn(E, H, dtype=torch.float32)

        def fn(bias):
            moe_block = _make_sigmoid_moe_block(weight, bias)
            scores, _, _, _ = sigmoid_topk_routing(hidden, moe_block)
            return scores

        bias = torch.zeros(E, dtype=torch.float32, requires_grad=True)
        torch.autograd.gradcheck(fn, (bias,), eps=_GC_EPS, atol=_GC_ATOL, rtol=_GC_RTOL)


================================================
FILE: tests/integrations/test_swanlab.py
================================================
# Copyright 2024 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""
Unit tests for SwanLab Integration Plugin.

Tests conflict detection, configuration validation, and multi-logger warnings.
"""

import importlib.util
import logging
import os
import time
from unittest.mock import MagicMock, patch

import pytest
from pydantic import ValidationError

from axolotl.integrations.swanlab.args import SwanLabConfig
from axolotl.integrations.swanlab.plugins import SwanLabPlugin

SWANLAB_INSTALLED = importlib.util.find_spec("swanlab") is not None


@pytest.mark.skipif(not SWANLAB_INSTALLED, reason="swanlab package not installed")
class TestSwanLabConfigValidators:
    """Tests for Pydantic field validators in SwanLabConfig."""

    def test_valid_swanlab_mode_cloud(self):
        """Test that 'cloud' mode is valid."""
        config = SwanLabConfig(swanlab_mode="cloud")
        assert config.swanlab_mode == "cloud"

    def test_valid_swanlab_mode_local(self):
        """Test that 'local' mode is valid."""
        config = SwanLabConfig(swanlab_mode="local")
        assert config.swanlab_mode == "local"

    def test_valid_swanlab_mode_offline(self):
        """Test that 'offline' mode is valid."""
        config = SwanLabConfig(swanlab_mode="offline")
        assert config.swanlab_mode == "offline"

    def test_valid_swanlab_mode_disabled(self):
        """Test that 'disabled' mode is valid."""
        config = SwanLabConfig(swanlab_mode="disabled")
        assert config.swanlab_mode == "disabled"

    def test_invalid_swanlab_mode(self):
        """Test that invalid mode raises ValueError."""
        with pytest.raises(ValidationError) as exc_info:
            SwanLabConfig(swanlab_mode="invalid")

        error_msg = str(exc_info.value)
        assert "Invalid swanlab_mode" in error_msg
        assert "cloud" in error_msg
        assert "local" in error_msg
        assert "offline" in error_msg
        assert "disabled" in error_msg

    def test_swanlab_mode_none_allowed(self):
        """Test that None mode is allowed (will use default)."""
        config = SwanLabConfig(swanlab_mode=None)
        assert config.swanlab_mode is None

    def test_valid_swanlab_project(self):
        """Test that valid project name is accepted."""
        config = SwanLabConfig(swanlab_project="my-project")
        assert config.swanlab_project == "my-project"

    def test_swanlab_project_none_allowed(self):
        """Test that None project is allowed."""
        config = SwanLabConfig(swanlab_project=None)
        assert config.swanlab_project is None

    def test_empty_swanlab_project_rejected(self):
        """Test that empty string project name is rejected."""
        with pytest.raises(ValidationError) as exc_info:
            SwanLabConfig(swanlab_project="")

        error_msg = str(exc_info.value)
        assert "cannot be an empty string" in error_msg

    def test_whitespace_only_project_rejected(self):
        """Test that whitespace-only project name is rejected."""
        with pytest.raises(ValidationError) as exc_info:
            SwanLabConfig(swanlab_project="   ")

        error_msg = str(exc_info.value)
        assert "cannot be an empty string" in error_msg

    def test_use_swanlab_true_requires_project(self):
        """Test that use_swanlab=True requires swanlab_project."""
        with pytest.raises(ValidationError) as exc_info:
            SwanLabConfig(use_swanlab=True, swanlab_project=None)

        error_msg = str(exc_info.value)
        assert "swanlab_project" in error_msg.lower()
        assert "not set" in error_msg.lower()

    def test_use_swanlab_true_with_project_valid(self):
        """Test that use_swanlab=True with project is valid."""
        config = SwanLabConfig(use_swanlab=True, swanlab_project="my-project")
        assert config.use_swanlab is True
        assert config.swanlab_project == "my-project"

    def test_use_swanlab_false_no_project_valid(self):
        """Test that use_swanlab=False without project is valid."""
        config = SwanLabConfig(use_swanlab=False, swanlab_project=None)
        assert config.use_swanlab is False
        assert config.swanlab_project is None

    def test_use_swanlab_none_no_project_valid(self):
        """Test that use_swanlab=None without project is valid."""
        config = SwanLabConfig(use_swanlab=None, swanlab_project=None)
        assert config.use_swanlab is None
        assert config.swanlab_project is None


@pytest.mark.skipif(not SWANLAB_INSTALLED, reason="swanlab package not installed")
class TestSwanLabPluginRegister:
    """Tests for SwanLabPlugin.register() conflict detection."""

    def test_register_without_use_swanlab(self):
        """Test that register works when SwanLab is not enabled."""
        plugin = SwanLabPlugin()
        cfg = {"use_swanlab": False}
        # Should not raise
        plugin.register(cfg)

    def test_register_use_swanlab_missing_project(self):
        """Test that use_swanlab=True without project raises ValueError."""
        plugin = SwanLabPlugin()
        cfg = {"use_swanlab": True}

        with pytest.raises(ValueError) as exc_info:
            plugin.register(cfg)

        error_msg = str(exc_info.value)
        assert "swanlab_project" in error_msg
        assert "not set" in error_msg
        assert "Solutions" in error_msg

    def test_register_use_swanlab_with_project_valid(self):
        """Test that use_swanlab=True with project is valid."""
        plugin = SwanLabPlugin()
        cfg = {"use_swanlab": True, "swanlab_project": "my-project"}
        # Should not raise
        plugin.register(cfg)

    def test_register_invalid_mode(self):
        """Test that invalid swanlab_mode raises ValueError."""
        plugin = SwanLabPlugin()
        cfg = {
            "use_swanlab": True,
            "swanlab_project": "my-project",
            "swanlab_mode": "invalid-mode",
        }

        with pytest.raises(ValueError) as exc_info:
            plugin.register(cfg)

        error_msg = str(exc_info.value)
        assert "Invalid swanlab_mode" in error_msg
        assert "cloud" in error_msg
        assert "local" in error_msg

    def test_register_valid_modes(self):
        """Test that all valid modes are accepted."""
        plugin = SwanLabPlugin()
        valid_modes = ["cloud", "local", "offline", "disabled"]

        for mode in valid_modes:
            cfg = {
                "use_swanlab": True,
                "swanlab_project": "my-project",
                "swanlab_mode": mode,
            }
            # Should not raise
            plugin.register(cfg)

    def test_register_auto_enable_swanlab(self):
        """Test that providing swanlab_project auto-enables use_swanlab."""
        plugin = SwanLabPlugin()
        cfg = {"swanlab_project": "my-project"}

        plugin.register(cfg)

        assert cfg["use_swanlab"] is True

    def test_register_cloud_mode_without_api_key_warns(self, caplog):
        """Test that cloud mode without API key logs warning."""
        plugin = SwanLabPlugin()
        cfg = {
            "use_swanlab": True,
            "swanlab_project": "my-project",
            "swanlab_mode": "cloud",
        }

        # Clear environment variable to ensure it's not set
        with patch.dict(os.environ, {}, clear=True):
            with caplog.at_level(logging.WARNING):
                plugin.register(cfg)

            # Should log warning about missing API key
            warning_messages = [record.message for record in caplog.records]
            assert any("API key" in msg for msg in warning_messages)


@pytest.mark.skipif(not SWANLAB_INSTALLED, reason="swanlab package not installed")
class TestMultiLoggerDetection:
    """Tests for multi-logger conflict detection."""

    def test_single_logger_no_warning(self, caplog):
        """Test that single logger doesn't trigger warning."""
        plugin = SwanLabPlugin()
        cfg = {"use_swanlab": True, "swanlab_project": "my-project"}

        with caplog.at_level(logging.WARNING):
            plugin.register(cfg)

        # Should not log multi-logger warning
        warning_messages = [record.message for record in caplog.records]
        assert not any("Multiple logging tools" in msg for msg in warning_messages)

    def test_two_loggers_warning(self, caplog):
        """Test that two loggers trigger warning."""
        plugin = SwanLabPlugin()
        cfg = {
            "use_swanlab": True,
            "swanlab_project": "my-project",
            "use_wandb": True,
        }

        with caplog.at_level(logging.WARNING):
            plugin.register(cfg)

        # Should log multi-logger warning
        warning_messages = [record.message for record in caplog.records]
        assert any("Multiple logging tools" in msg for msg in warning_messages)
        assert any("SwanLab" in msg and "WandB" in msg for msg in warning_messages)

    def test_three_loggers_error(self, caplog):
        """Test that three loggers trigger error-level warning."""
        plugin = SwanLabPlugin()
        cfg = {
            "use_swanlab": True,
            "swanlab_project": "my-project",
            "use_wandb": True,
            "use_mlflow": True,
        }

        with caplog.at_level(logging.ERROR):
            plugin.register(cfg)

        # Should log error-level warning
        error_messages = [
            record.message
            for record in caplog.records
            if record.levelno >= logging.ERROR
        ]
        assert any("logging tools enabled" in msg for msg in error_messages)

    def test_multi_logger_with_comet(self, caplog):
        """Test that Comet is detected in multi-logger scenario."""
        plugin = SwanLabPlugin()
        cfg = {
            "use_swanlab": True,
            "swanlab_project": "my-project",
            "comet_api_key": "test-key",
        }

        with caplog.at_level(logging.WARNING):
            plugin.register(cfg)

        # Should detect Comet
        warning_messages = [record.message for record in caplog.records]
        assert any("Comet" in msg for msg in warning_messages)

    def test_multi_logger_with_comet_project(self, caplog):
        """Test that Comet is detected via comet_project_name."""
        plugin = SwanLabPlugin()
        cfg = {
            "use_swanlab": True,
            "swanlab_project": "my-project",
            "comet_project_name": "test-project",
        }

        with caplog.at_level(logging.WARNING):
            plugin.register(cfg)

        # Should detect Comet
        warning_messages = [record.message for record in caplog.records]
        assert any("Comet" in msg for msg in warning_messages)


@pytest.mark.skipif(not SWANLAB_INSTALLED, reason="swanlab package not installed")
class TestSwanLabPluginPreModelLoad:
    """Tests for SwanLabPlugin.pre_model_load() runtime checks."""

    def test_pre_model_load_disabled(self):
        """Test that pre_model_load does nothing when SwanLab is disabled."""
        plugin = SwanLabPlugin()
        cfg = MagicMock()
        cfg.use_swanlab = False

        # Should not raise
        plugin.pre_model_load(cfg)

    def test_pre_model_load_import_error(self):
        """Test that missing swanlab package raises clear ImportError."""
        plugin = SwanLabPlugin()
        cfg = MagicMock()
        cfg.use_swanlab = True

        with patch(
            "builtins.__import__", side_effect=ImportError("No module named 'swanlab'")
        ):
            with pytest.raises(ImportError) as exc_info:
                plugin.pre_model_load(cfg)

            error_msg = str(exc_info.value)
            assert "SwanLab is not installed" in error_msg
            assert "pip install swanlab" in error_msg

    @patch("axolotl.utils.distributed.is_main_process")
    @patch("axolotl.utils.distributed.get_world_size")
    def test_pre_model_load_non_main_process_skips(
        self, mock_get_world_size, mock_is_main_process
    ):
        """Test that non-main process skips SwanLab initialization."""
        mock_get_world_size.return_value = 2
        mock_is_main_process.return_value = False

        plugin = SwanLabPlugin()
        cfg = MagicMock()
        cfg.use_swanlab = True

        with patch("swanlab.init") as mock_init:
            plugin.pre_model_load(cfg)
            # Should NOT call swanlab.init
            mock_init.assert_not_called()

    @patch("axolotl.utils.distributed.is_main_process")
    @patch("axolotl.utils.distributed.get_world_size")
    def test_pre_model_load_distributed_logging(
        self, mock_get_world_size, mock_is_main_process, caplog
    ):
        """Test that distributed training logs world size info."""
        mock_get_world_size.return_value = 4
        mock_is_main_process.return_value = True

        plugin = SwanLabPlugin()
        cfg = MagicMock()
        cfg.use_swanlab = True
        cfg.swanlab_project = "test-project"
        cfg.swanlab_mode = "cloud"

        with patch("swanlab.init"), patch("swanlab.__version__", "0.3.0"):
            with caplog.at_level(logging.INFO):
                plugin.pre_model_load(cfg)

            # Should log distributed training info
            info_messages = [record.message for record in caplog.records]
            assert any("world_size=4" in msg for msg in info_messages)
            assert any("Only rank 0" in msg for msg in info_messages)


@pytest.mark.skipif(not SWANLAB_INSTALLED, reason="swanlab package not installed")
class TestSwanLabInitKwargs:
    """Tests for SwanLab initialization with direct parameter passing."""

    def test_custom_branding_added_to_config(self):
        """Test that Axolotl custom branding is added to SwanLab config."""
        from axolotl.integrations.swanlab.plugins import SwanLabPlugin
        from axolotl.utils.dict import DictDefault

        plugin = SwanLabPlugin()
        cfg = DictDefault(
            {
                "use_swanlab": True,
                "swanlab_project": "test-project",
            }
        )

        init_kwargs = plugin._get_swanlab_init_kwargs(cfg)

        # Verify custom branding is present
        assert "config" in init_kwargs
        assert init_kwargs["config"]["UPPERFRAME"] == "🦎 Axolotl"

    def test_api_key_passed_directly(self):
        """Test that API key is passed directly to swanlab.init() instead of via env var."""
        from axolotl.integrations.swanlab.plugins import SwanLabPlugin
        from axolotl.utils.dict import DictDefault

        plugin = SwanLabPlugin()
        cfg = DictDefault(
            {
                "use_swanlab": True,
                "swanlab_project": "test-project",
                "swanlab_api_key": "test-api-key-12345",
            }
        )

        init_kwargs = plugin._get_swanlab_init_kwargs(cfg)

        # Verify API key is in init_kwargs (not set as env var)
        assert "api_key" in init_kwargs
        assert init_kwargs["api_key"] == "test-api-key-12345"

    def test_private_deployment_hosts_passed_directly(self):
        """Test that private deployment hosts are passed directly to swanlab.init()."""
        from axolotl.integrations.swanlab.plugins import SwanLabPlugin
        from axolotl.utils.dict import DictDefault

        plugin = SwanLabPlugin()
        cfg = DictDefault(
            {
                "use_swanlab": True,
                "swanlab_project": "internal-project",
                "swanlab_web_host": "https://swanlab.company.com",
                "swanlab_api_host": "https://api-swanlab.company.com",
            }
        )

        init_kwargs = plugin._get_swanlab_init_kwargs(cfg)

        # Verify private deployment hosts are in init_kwargs
        assert "web_host" in init_kwargs
        assert init_kwargs["web_host"] == "https://swanlab.company.com"
        assert "api_host" in init_kwargs
        assert init_kwargs["api_host"] == "https://api-swanlab.company.com"

    @patch("axolotl.utils.distributed.is_main_process")
    def test_full_private_deployment_init(self, mock_is_main_process):
        """Test complete initialization with private deployment configuration."""
        mock_is_main_process.return_value = True

        from axolotl.integrations.swanlab.plugins import SwanLabPlugin
        from axolotl.utils.dict import DictDefault

        plugin = SwanLabPlugin()
        cfg = DictDefault(
            {
                "use_swanlab": True,
                "swanlab_project": "secure-project",
                "swanlab_experiment_name": "experiment-001",
                "swanlab_mode": "cloud",
                "swanlab_api_key": "private-key-xyz",
                "swanlab_web_host": "https://swanlab.internal.net",
                "swanlab_api_host": "https://api.swanlab.internal.net",
                "swanlab_workspace": "research-team",
            }
        )

        with patch("swanlab.init") as mock_init:
            plugin.pre_model_load(cfg)

            # Verify swanlab.init was called with all parameters
            mock_init.assert_called_once()
            call_kwargs = mock_init.call_args[1]

            assert call_kwargs["project"] == "secure-project"
            assert call_kwargs["experiment_name"] == "experiment-001"
            assert call_kwargs["mode"] == "cloud"
            assert call_kwargs["api_key"] == "private-key-xyz"
            assert call_kwargs["web_host"] == "https://swanlab.internal.net"
            assert call_kwargs["api_host"] == "https://api.swanlab.internal.net"
            assert call_kwargs["workspace"] == "research-team"
            assert call_kwargs["config"]["UPPERFRAME"] == "🦎 Axolotl"

    def test_env_vars_not_set_for_api_params(self):
        """Test that environment variables are NOT set for API parameters."""
        import os

        from axolotl.integrations.swanlab.plugins import SwanLabPlugin
        from axolotl.utils.dict import DictDefault

        # Clear any existing env vars
        for key in [
            "SWANLAB_API_KEY",
            "SWANLAB_WEB_HOST",
            "SWANLAB_API_HOST",
            "SWANLAB_MODE",
        ]:
            os.environ.pop(key, None)

        plugin = SwanLabPlugin()
        cfg = DictDefault(
            {
                "use_swanlab": True,
                "swanlab_project": "test-project",
                "swanlab_api_key": "test-key",
                "swanlab_web_host": "https://test.com",
                "swanlab_api_host": "https://api-test.com",
                "swanlab_mode": "cloud",
            }
        )

        with (
            patch("axolotl.utils.distributed.is_main_process", return_value=True),
            patch("swanlab.init"),
        ):
            plugin.pre_model_load(cfg)

        # Verify env vars were NOT set (simplified approach)
        # The old _setup_swanlab_env() method is removed, so these shouldn't be set
        # Note: SwanLab itself might set these, but our plugin shouldn't
        # We're just testing that our plugin doesn't call _setup_swanlab_env()


@pytest.mark.skipif(not SWANLAB_INSTALLED, reason="swanlab package not installed")
class TestLarkNotificationIntegration:
    """Tests for Lark (Feishu) notification integration."""

    def test_lark_callback_registration_with_webhook_only(self):
        """Test Lark callback registration with webhook URL only (no secret)."""
        plugin = SwanLabPlugin()

        cfg = MagicMock()
        cfg.use_swanlab = True
        cfg.swanlab_project = "test-project"
        cfg.swanlab_mode = "local"
        cfg.swanlab_lark_webhook_url = (
            "https://open.feishu.cn/open-apis/bot/v2/hook/test-webhook"
        )
        cfg.swanlab_lark_secret = None

        with (
            patch("swanlab.init"),
            patch("swanlab.__version__", "0.3.0"),
            patch("swanlab.register_callbacks") as mock_register,
            patch("axolotl.utils.distributed.is_main_process", return_value=True),
            patch("axolotl.utils.distributed.get_world_size", return_value=1),
        ):
            # Mock LarkCallback import
            with patch("swanlab.plugin.notification.LarkCallback") as MockLarkCallback:
                mock_lark_instance = MagicMock()
                MockLarkCallback.return_value = mock_lark_instance

                plugin.pre_model_load(cfg)

                # Verify LarkCallback was instantiated with correct params
                MockLarkCallback.assert_called_once_with(
                    webhook_url="https://open.feishu.cn/open-apis/bot/v2/hook/test-webhook",
                    secret=None,
                )

                # Verify callback was registered
                mock_register.assert_called_once_with([mock_lark_instance])

    def test_lark_callback_registration_with_secret(self):
        """Test Lark callback registration with webhook URL and HMAC secret."""
        plugin = SwanLabPlugin()

        cfg = MagicMock()
        cfg.use_swanlab = True
        cfg.swanlab_project = "test-project"
        cfg.swanlab_mode = "local"
        cfg.swanlab_lark_webhook_url = (
            "https://open.feishu.cn/open-apis/bot/v2/hook/test-webhook"
        )
        cfg.swanlab_lark_secret = "test-hmac-secret"

        with (
            patch("swanlab.init"),
            patch("swanlab.__version__", "0.3.0"),
            patch("swanlab.register_callbacks") as mock_register,
            patch("axolotl.utils.distributed.is_main_process", return_value=True),
            patch("axolotl.utils.distributed.get_world_size", return_value=1),
        ):
            with patch("swanlab.plugin.notification.LarkCallback") as MockLarkCallback:
                mock_lark_instance = MagicMock()
                MockLarkCallback.return_value = mock_lark_instance

                plugin.pre_model_load(cfg)

                # Verify LarkCallback was instantiated with secret
                MockLarkCallback.assert_called_once_with(
                    webhook_url="https://open.feishu.cn/open-apis/bot/v2/hook/test-webhook",
                    secret="test-hmac-secret",
                )

                mock_register.assert_called_once_with([mock_lark_instance])

    def test_lark_callback_not_registered_without_webhook(self):
        """Test that Lark callback is NOT registered when webhook URL not provided."""
        plugin = SwanLabPlugin()

        cfg = MagicMock()
        cfg.use_swanlab = True
        cfg.swanlab_project = "test-project"
        cfg.swanlab_mode = "local"
        cfg.swanlab_lark_webhook_url = None  # No webhook
        cfg.swanlab_lark_secret = None

        with (
            patch("swanlab.init"),
            patch("swanlab.__version__", "0.3.0"),
            patch("swanlab.register_callbacks") as mock_register,
            patch("axolotl.utils.distributed.is_main_process", return_value=True),
            patch("axolotl.utils.distributed.get_world_size", return_value=1),
        ):
            plugin.pre_model_load(cfg)

            # Verify register_callbacks was NOT called
            mock_register.assert_not_called()

    def test_lark_import_error_handled_gracefully(self, caplog):
        """Test that ImportError for Lark plugin is handled gracefully."""
        plugin = SwanLabPlugin()

        cfg = MagicMock()
        cfg.use_swanlab = True
        cfg.swanlab_project = "test-project"
        cfg.swanlab_mode = "local"
        cfg.swanlab_lark_webhook_url = (
            "https://open.feishu.cn/open-apis/bot/v2/hook/test-webhook"
        )
        cfg.swanlab_lark_secret = None

        with (
            patch("swanlab.init"),
            patch("swanlab.__version__", "0.3.0"),
            patch("axolotl.utils.distributed.is_main_process", return_value=True),
            patch("axolotl.utils.distributed.get_world_size", return_value=1),
        ):
            # Mock ImportError for LarkCallback
            with patch(
                "swanlab.plugin.notification.LarkCallback",
                side_effect=ImportError(
                    "No module named 'swanlab.plugin.notification'"
                ),
            ):
                with caplog.at_level(logging.WARNING):
                    plugin.pre_model_load(cfg)

                    # Should log warning about missing Lark plugin
                    warning_messages = [record.message for record in caplog.records]
                    assert any(
                        "Failed to import SwanLab Lark plugin" in msg
                        for msg in warning_messages
                    )
                    assert any("SwanLab >= 0.3.0" in msg for msg in warning_messages)

    def test_lark_warning_for_missing_secret(self, caplog):
        """Test that warning is logged when Lark webhook has no HMAC secret."""
        plugin = SwanLabPlugin()

        cfg = MagicMock()
        cfg.use_swanlab = True
        cfg.swanlab_project = "test-project"
        cfg.swanlab_mode = "local"
        cfg.swanlab_lark_webhook_url = (
            "https://open.feishu.cn/open-apis/bot/v2/hook/test-webhook"
        )
        cfg.swanlab_lark_secret = None  # No secret

        with (
            patch("swanlab.init"),
            patch("swanlab.__version__", "0.3.0"),
            patch("swanlab.register_callbacks"),
            patch("axolotl.utils.distributed.is_main_process", return_value=True),
            patch("axolotl.utils.distributed.get_world_size", return_value=1),
        ):
            with patch("swanlab.plugin.notification.LarkCallback"):
                with caplog.at_level(logging.WARNING):
                    plugin.pre_model_load(cfg)

                    # Should log warning about missing secret
                    warning_messages = [record.message for record in caplog.records]
                    assert any(
                        "no secret configured" in msg.lower()
                        for msg in warning_messages
                    )
                    assert any("swanlab_lark_secret" in msg for msg in warning_messages)


@pytest.mark.skipif(not SWANLAB_INSTALLED, reason="swanlab package not installed")
class TestSwanLabPluginIntegration:
    """Integration tests for SwanLab plugin lifecycle."""

    def test_full_lifecycle_valid_config(self):
        """Test full plugin lifecycle with valid configuration."""
        plugin = SwanLabPlugin()

        # Register
        cfg_dict = {
            "use_swanlab": True,
            "swanlab_project": "test-project",
            "swanlab_mode": "local",
        }
        plugin.register(cfg_dict)

        # Pre-model load (mock SwanLab)
        cfg_obj = MagicMock()
        cfg_obj.use_swanlab = True
        cfg_obj.swanlab_project = "test-project"
        cfg_obj.swanlab_mode = "local"
        cfg_obj.swanlab_lark_webhook_url = None  # No Lark

        with (
            patch("swanlab.init") as mock_init,
            patch("swanlab.__version__", "0.3.0"),
            patch("axolotl.utils.distributed.is_main_process", return_value=True),
            patch("axolotl.utils.distributed.get_world_size", return_value=1),
        ):
            plugin.pre_model_load(cfg_obj)
            # Should call swanlab.init
            mock_init.assert_called_once()

    def test_lifecycle_with_multi_logger_warning(self, caplog):
        """Test lifecycle with multi-logger warning."""
        plugin = SwanLabPlugin()

        cfg_dict = {
            "use_swanlab": True,
            "swanlab_project": "test-project",
            "use_wandb": True,
        }

        with caplog.at_level(logging.WARNING):
            plugin.register(cfg_dict)

        # Should have multi-logger warning
        warning_messages = [record.message for record in caplog.records]
        assert any("Multiple logging tools" in msg for msg in warning_messages)

    def test_lifecycle_invalid_config_fails_early(self):
        """Test that invalid config fails at register stage."""
        plugin = SwanLabPlugin()

        cfg_dict = {
            "use_swanlab": True,
            # Missing swanlab_project
        }

        # Should fail at register, not pre_model_load
        with pytest.raises(ValueError):
            plugin.register(cfg_dict)

    def test_full_lifecycle_with_lark_notifications(self):
        """Test full lifecycle including Lark notification registration."""
        plugin = SwanLabPlugin()

        # Register
        cfg_dict = {
            "use_swanlab": True,
            "swanlab_project": "test-project",
            "swanlab_mode": "cloud",
        }
        plugin.register(cfg_dict)

        # Pre-model load with Lark config
        cfg_obj = MagicMock()
        cfg_obj.use_swanlab = True
        cfg_obj.swanlab_project = "test-project"
        cfg_obj.swanlab_mode = "cloud"
        cfg_obj.swanlab_lark_webhook_url = (
            "https://open.feishu.cn/open-apis/bot/v2/hook/test"
        )
        cfg_obj.swanlab_lark_secret = "secret123"

        with (
            patch("swanlab.init"),
            patch("swanlab.__version__", "0.3.0"),
            patch("swanlab.register_callbacks") as mock_register,
            patch("axolotl.utils.distributed.is_main_process", return_value=True),
            patch("axolotl.utils.distributed.get_world_size", return_value=1),
        ):
            with patch("swanlab.plugin.notification.LarkCallback") as MockLarkCallback:
                mock_lark_instance = MagicMock()
                MockLarkCallback.return_value = mock_lark_instance

                plugin.pre_model_load(cfg_obj)

                # Verify both SwanLab init AND Lark callback registration
                MockLarkCallback.assert_called_once()
                mock_register.assert_called_once_with([mock_lark_instance])


@pytest.mark.skipif(not SWANLAB_INSTALLED, reason="swanlab package not installed")
class TestCompletionLogger:
    """Tests for CompletionLogger utility class."""

    def test_completion_logger_initialization(self):
        """Test CompletionLogger initializes with correct maxlen."""
        from axolotl.integrations.swanlab.completion_logger import CompletionLogger

        logger = CompletionLogger(maxlen=64)
        assert logger.maxlen == 64
        assert len(logger) == 0

    def test_add_dpo_completion(self):
        """Test adding DPO completions to buffer."""
        from axolotl.integrations.swanlab.completion_logger import CompletionLogger

        logger = CompletionLogger(maxlen=10)

        logger.add_dpo_completion(
            step=0,
            prompt="What is AI?",
            chosen="Artificial Intelligence is...",
            rejected="AI means...",
            reward_diff=0.5,
        )

        assert len(logger) == 1
        entry = logger.data[0]
        assert entry["step"] == 0
        assert entry["prompt"] == "What is AI?"
        assert entry["chosen"] == "Artificial Intelligence is..."
        assert entry["rejected"] == "AI means..."
        assert entry["reward_diff"] == 0.5

    def test_add_kto_completion(self):
        """Test adding KTO completions to buffer."""
        from axolotl.integrations.swanlab.completion_logger import CompletionLogger

        logger = CompletionLogger(maxlen=10)

        logger.add_kto_completion(
            step=1,
            prompt="Explain quantum physics",
            completion="Quantum physics is...",
            label=True,
            reward=0.8,
        )

        assert len(logger) == 1
        entry = logger.data[0]
        assert entry["step"] == 1
        assert entry["prompt"] == "Explain quantum physics"
        assert entry["completion"] == "Quantum physics is..."
        assert entry["label"] == "desirable"
        assert entry["reward"] == 0.8

    def test_add_orpo_completion(self):
        """Test adding ORPO completions to buffer."""
        from axolotl.integrations.swanlab.completion_logger import CompletionLogger

        logger = CompletionLogger(maxlen=10)

        logger.add_orpo_completion(
            step=2,
            prompt="Write a poem",
            chosen="Roses are red...",
            rejected="Violets are blue...",
            log_odds_ratio=1.2,
        )

        assert len(logger) == 1
        entry = logger.data[0]
        assert entry["step"] == 2
        assert entry["chosen"] == "Roses are red..."
        assert entry["rejected"] == "Violets are blue..."
        assert entry["log_odds_ratio"] == 1.2

    def test_add_grpo_completion(self):
        """Test adding GRPO completions to buffer."""
        from axolotl.integrations.swanlab.completion_logger import CompletionLogger

        logger = CompletionLogger(maxlen=10)

        logger.add_grpo_completion(
            step=3,
            prompt="Solve this problem",
            completion="The answer is 42",
            reward=0.9,
            advantage=0.3,
        )

        assert len(logger) == 1
        entry = logger.data[0]
        assert entry["step"] == 3
        assert entry["completion"] == "The answer is 42"
        assert entry["reward"] == 0.9
        assert entry["advantage"] == 0.3

    def test_memory_bounded_buffer(self):
        """Test that buffer respects maxlen and drops oldest entries."""
        from axolotl.integrations.swanlab.completion_logger import CompletionLogger

        logger = CompletionLogger(maxlen=3)

        # Add 5 completions
        for i in range(5):
            logger.add_dpo_completion(
                step=i,
                prompt=f"Prompt {i}",
                chosen=f"Chosen {i}",
                rejected=f"Rejected {i}",
            )

        # Should only keep last 3
        assert len(logger) == 3
        assert logger.data[0]["step"] == 2  # Oldest kept
        assert logger.data[1]["step"] == 3
        assert logger.data[2]["step"] == 4  # Newest

    def test_log_to_swanlab_when_not_initialized(self):
        """Test logging gracefully fails when SwanLab not initialized."""
        from axolotl.integrations.swanlab.completion_logger import CompletionLogger

        logger = CompletionLogger(maxlen=10)
        logger.add_dpo_completion(
            step=0,
            prompt="Test",
            chosen="A",
            rejected="B",
        )

        with patch("swanlab.get_run", return_value=None):
            result = logger.log_to_swanlab()
            assert result is False  # Should fail gracefully

    def test_log_to_swanlab_success(self):
        """Test successful logging to SwanLab."""
        from axolotl.integrations.swanlab.completion_logger import CompletionLogger

        logger = CompletionLogger(maxlen=10)
        logger.add_dpo_completion(
            step=0,
            prompt="Test prompt",
            chosen="Chosen response",
            rejected="Rejected response",
            reward_diff=0.5,
        )

        with (
            patch("swanlab.get_run") as mock_get_run,
            patch("swanlab.log") as mock_log,
            patch("swanlab.echarts.Table") as MockTable,
        ):
            mock_get_run.return_value = MagicMock()  # SwanLab initialized
            mock_table_instance = MagicMock()
            MockTable.return_value = mock_table_instance

            result = logger.log_to_swanlab(table_name="test_table")

            assert result is True
            mock_log.assert_called_once()
            mock_table_instance.add.assert_called_once()

    def test_clear_buffer(self):
        """Test clearing the completion buffer."""
        from axolotl.integrations.swanlab.completion_logger import CompletionLogger

        logger = CompletionLogger(maxlen=10)
        logger.add_dpo_completion(
            step=0,
            prompt="Test",
            chosen="A",
            rejected="B",
        )

        assert len(logger) == 1
        logger.clear()
        assert len(logger) == 0

    def test_repr(self):
        """Test string representation."""
        from axolotl.integrations.swanlab.completion_logger import CompletionLogger

        logger = CompletionLogger(maxlen=128)
        logger.add_dpo_completion(
            step=0,
            prompt="Test",
            chosen="A",
            rejected="B",
        )

        repr_str = repr(logger)
        assert "CompletionLogger" in repr_str
        assert "maxlen=128" in repr_str
        assert "buffered=1/128" in repr_str


@pytest.mark.skipif(not SWANLAB_INSTALLED, reason="swanlab package not installed")
class TestSwanLabRLHFCompletionCallback:
    """Tests for SwanLabRLHFCompletionCallback."""

    def test_callback_initialization(self):
        """Test callback initializes with correct parameters."""
        from axolotl.integrations.swanlab.callbacks import SwanLabRLHFCompletionCallback

        callback = SwanLabRLHFCompletionCallback(
            log_interval=50,
            max_completions=64,
            table_name="custom_table",
        )

        assert callback.log_interval == 50
        assert callback.logger.maxlen == 64
        assert callback.table_name == "custom_table"
        assert callback.trainer_type is None

    def test_trainer_type_detection_dpo(self):
        """Test DPO trainer type is detected correctly."""
        from axolotl.integrations.swanlab.callbacks import SwanLabRLHFCompletionCallback

        callback = SwanLabRLHFCompletionCallback()

        # Mock trainer with DPO in name
        mock_trainer = MagicMock()
        mock_trainer.__class__.__name__ = "AxolotlDPOTrainer"

        callback.on_init_end(
            args=MagicMock(),
            state=MagicMock(),
            control=MagicMock(),
            trainer=mock_trainer,
        )

        assert callback.trainer_type == "dpo"

    def test_trainer_type_detection_kto(self):
        """Test KTO trainer type is detected correctly."""
        from axolotl.integrations.swanlab.callbacks import SwanLabRLHFCompletionCallback

        callback = SwanLabRLHFCompletionCallback()

        mock_trainer = MagicMock()
        mock_trainer.__class__.__name__ = "AxolotlKTOTrainer"

        callback.on_init_end(
            args=MagicMock(),
            state=MagicMock(),
            control=MagicMock(),
            trainer=mock_trainer,
        )

        assert callback.trainer_type == "kto"

    def test_on_train_end_logs_completions(self):
        """Test that completions are logged at end of training."""
        from axolotl.integrations.swanlab.callbacks import SwanLabRLHFCompletionCallback

        callback = SwanLabRLHFCompletionCallback()
        callback.trainer_type = "dpo"

        # Add some completions to buffer
        callback.logger.add_dpo_completion(
            step=0,
            prompt="Test",
            chosen="A",
            rejected="B",
        )

        with patch.object(callback.logger, "log_to_swanlab") as mock_log:
            callback.on_train_end(
                args=MagicMock(),
                state=MagicMock(global_step=100),
                control=MagicMock(),
            )

            # Should log remaining completions
            mock_log.assert_called_once()


@pytest.mark.skipif(not SWANLAB_INSTALLED, reason="swanlab package not installed")
class TestSwanLabPluginCompletionIntegration:
    """Integration tests for completion logging in SwanLabPlugin."""

    def test_completion_callback_registered_for_dpo_trainer(self):
        """Test that completion callback is registered for DPO trainer."""
        from axolotl.integrations.swanlab.plugins import SwanLabPlugin
        from axolotl.utils.dict import DictDefault

        plugin = SwanLabPlugin()
        plugin.swanlab_initialized = True  # Simulate SwanLab initialized

        cfg = {
            "use_swanlab": True,
            "swanlab_project": "test-project",
            "swanlab_log_completions": True,
            "swanlab_completion_log_interval": 50,
            "swanlab_completion_max_buffer": 64,
        }
        cfg_obj = DictDefault(cfg)

        # Mock DPO trainer
        mock_trainer = MagicMock()
        mock_trainer.__class__.__name__ = "AxolotlDPOTrainer"
        mock_trainer.state = MagicMock(max_steps=1000)
        mock_trainer.args = MagicMock(
            num_train_epochs=3,
            train_batch_size=4,
            gradient_accumulation_steps=2,
        )

        with patch("swanlab.config.update"):
            plugin.post_trainer_create(cfg_obj, mock_trainer)

        # Verify callback was added
        mock_trainer.add_callback.assert_called_once()
        callback = mock_trainer.add_callback.call_args[0][0]
        assert callback.__class__.__name__ == "SwanLabRLHFCompletionCallback"
        assert callback.log_interval == 50
        assert callback.logger.maxlen == 64

    def test_completion_callback_not_registered_for_non_rlhf_trainer(self):
        """Test that completion callback is NOT registered for non-RLHF trainers."""
        from axolotl.integrations.swanlab.plugins import SwanLabPlugin
        from axolotl.utils.dict import DictDefault

        plugin = SwanLabPlugin()
        plugin.swanlab_initialized = True

        cfg = {
            "use_swanlab": True,
            "swanlab_project": "test-project",
            "swanlab_log_completions": True,
        }
        cfg_obj = DictDefault(cfg)

        # Mock regular SFT trainer (not RLHF)
        mock_trainer = MagicMock()
        mock_trainer.__class__.__name__ = "AxolotlTrainer"  # Not RLHF
        mock_trainer.state = MagicMock(max_steps=1000)
        mock_trainer.args = MagicMock()

        with patch("swanlab.config.update"):
            plugin.post_trainer_create(cfg_obj, mock_trainer)

        # Callback should NOT be added for non-RLHF trainer
        mock_trainer.add_callback.assert_not_called()

    def test_completion_callback_not_registered_when_disabled(self):
        """Test that completion callback is not registered when disabled in config."""
        from axolotl.integrations.swanlab.plugins import SwanLabPlugin
        from axolotl.utils.dict import DictDefault

        plugin = SwanLabPlugin()
        plugin.swanlab_initialized = True

        cfg = {
            "use_swanlab": True,
            "swanlab_project": "test-project",
            "swanlab_log_completions": False,  # Disabled
        }
        cfg_obj = DictDefault(cfg)

        # Mock DPO trainer
        mock_trainer = MagicMock()
        mock_trainer.__class__.__name__ = "AxolotlDPOTrainer"
        mock_trainer.state = MagicMock(max_steps=1000)
        mock_trainer.args = MagicMock()

        with patch("swanlab.config.update"):
            plugin.post_trainer_create(cfg_obj, mock_trainer)

        # Callback should NOT be added when disabled
        mock_trainer.add_callback.assert_not_called()


@pytest.mark.skipif(not SWANLAB_INSTALLED, reason="swanlab package not installed")
class TestSwanLabProfiling:
    """Tests for SwanLab profiling utilities."""

    def test_profiling_context_logs_duration(self):
        """Test that profiling context logs execution duration."""
        from axolotl.integrations.swanlab.profiling import swanlab_profiling_context

        # Mock trainer with SwanLab enabled
        mock_trainer = MagicMock()
        mock_trainer.cfg = MagicMock(use_swanlab=True)
        mock_trainer.__class__.__name__ = "TestTrainer"

        with patch("swanlab.get_run") as mock_get_run, patch("swanlab.log") as mock_log:
            mock_get_run.return_value = MagicMock()  # SwanLab initialized

            with swanlab_profiling_context(mock_trainer, "test_function"):
                time.sleep(0.01)  # Simulate work

            # Verify log was called with correct metric name
            mock_log.assert_called_once()
            logged_data = mock_log.call_args[0][0]
            assert "profiling/Time taken: TestTrainer.test_function" in logged_data
            # Duration should be > 0.01 seconds
            assert (
                logged_data["profiling/Time taken: TestTrainer.test_function"] >= 0.01
            )

    def test_profiling_context_skips_when_swanlab_disabled(self):
        """Test that profiling is skipped when SwanLab is disabled."""
        from axolotl.integrations.swanlab.profiling import swanlab_profiling_context

        mock_trainer = MagicMock()
        mock_trainer.cfg = MagicMock(use_swanlab=False)  # Disabled

        with patch("swanlab.log") as mock_log:
            with swanlab_profiling_context(mock_trainer, "test_function"):
                time.sleep(0.01)

            # Should NOT log when disabled
            mock_log.assert_not_called()

    def test_profiling_context_skips_when_swanlab_not_initialized(self):
        """Test that profiling is skipped when SwanLab not initialized."""
        from axolotl.integrations.swanlab.profiling import swanlab_profiling_context

        mock_trainer = MagicMock()
        mock_trainer.cfg = MagicMock(use_swanlab=True)

        with (
            patch("swanlab.get_run", return_value=None),
            patch("swanlab.log") as mock_log,
        ):
            with swanlab_profiling_context(mock_trainer, "test_function"):
                time.sleep(0.01)

            # Should NOT log when not initialized
            mock_log.assert_not_called()

    def test_profiling_decorator(self):
        """Test swanlab_profile decorator."""
        from axolotl.integrations.swanlab.profiling import swanlab_profile

        class MockTrainer:
            def __init__(self):
                self.cfg = MagicMock(use_swanlab=True)

            @swanlab_profile
            def expensive_method(self, x):
                time.sleep(0.01)
                return x * 2

        trainer = MockTrainer()

        with patch("swanlab.get_run") as mock_get_run, patch("swanlab.log") as mock_log:
            mock_get_run.return_value = MagicMock()

            result = trainer.expensive_method(5)

            # Verify method still works correctly
            assert result == 10

            # Verify profiling was logged
            mock_log.assert_called_once()
            logged_data = mock_log.call_args[0][0]
            assert "profiling/Time taken: MockTrainer.expensive_method" in logged_data

    def test_profiling_config(self):
        """Test ProfilingConfig class."""
        from axolotl.integrations.swanlab.profiling import ProfilingConfig

        config = ProfilingConfig(
            enabled=True,
            min_duration_ms=1.0,
            log_interval=5,
        )

        # Test enabled check
        assert config.enabled is True

        # Test minimum duration filtering
        assert config.should_log("func1", 0.0001) is False  # 0.1ms < 1.0ms threshold
        assert config.should_log("func2", 0.002) is True  # 2.0ms > 1.0ms threshold

        # Test log interval
        assert config.should_log("func3", 0.002) is True  # 1st call
        assert config.should_log("func3", 0.002) is False  # 2nd call
        assert config.should_log("func3", 0.002) is False  # 3rd call
        assert config.should_log("func3", 0.002) is False  # 4th call
        assert config.should_log("func3", 0.002) is True  # 5th call (interval=5)

    def test_profiling_config_when_disabled(self):
        """Test ProfilingConfig when disabled."""
        from axolotl.integrations.swanlab.profiling import ProfilingConfig

        config = ProfilingConfig(enabled=False)

        # Should never log when disabled
        assert config.should_log("func1", 100.0) is False

    def test_profiling_context_advanced(self):
        """Test advanced profiling context with custom config."""
        from axolotl.integrations.swanlab.profiling import (
            ProfilingConfig,
            swanlab_profiling_context_advanced,
        )

        mock_trainer = MagicMock()
        mock_trainer.cfg = MagicMock(use_swanlab=True)
        mock_trainer.__class__.__name__ = "TestTrainer"

        # Config that filters out very fast operations
        config = ProfilingConfig(min_duration_ms=10.0)  # 10ms minimum

        with patch("swanlab.get_run") as mock_get_run, patch("swanlab.log") as mock_log:
            mock_get_run.return_value = MagicMock()

            # Fast operation (< 10ms) - should NOT log
            with swanlab_profiling_context_advanced(mock_trainer, "fast_op", config):
                time.sleep(0.001)  # 1ms

            mock_log.assert_not_called()

            # Slow operation (> 10ms) - should log
            with swanlab_profiling_context_advanced(mock_trainer, "slow_op", config):
                time.sleep(0.015)  # 15ms

            mock_log.assert_called_once()

    def test_profiling_with_exception(self):
        """Test that profiling still logs even when exception occurs."""
        from axolotl.integrations.swanlab.profiling import swanlab_profiling_context

        mock_trainer = MagicMock()
        mock_trainer.cfg = MagicMock(use_swanlab=True)
        mock_trainer.__class__.__name__ = "TestTrainer"

        with patch("swanlab.get_run") as mock_get_run, patch("swanlab.log") as mock_log:
            mock_get_run.return_value = MagicMock()

            try:
                with swanlab_profiling_context(mock_trainer, "error_function"):
                    time.sleep(0.01)
                    raise ValueError("Test error")
            except ValueError:
                pass  # Expected

            # Should still log duration even with exception
            mock_log.assert_called_once()


================================================
FILE: tests/monkeypatch/test_llama_attn_hijack_flash.py
================================================
"""
Unit tests for the monkeypatch utils
"""

import unittest

import torch

from axolotl.monkeypatch.utils import (
    get_cu_seqlens,
    get_cu_seqlens_from_pos_ids,
    get_max_seqlen_in_batch,
    get_unpad_data,
)


class TestMonkeyPatchUtils(unittest.TestCase):
    """
    Unit test class for monkeypatch utils
    """

    def test_get_cu_seqlens_1d(self):
        attn_mask = torch.tensor([[1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 0, 0]])
        target_res = torch.tensor([0, 4, 7, 12, 14, 16], dtype=torch.int32)
        self.assertTrue(torch.allclose(get_cu_seqlens(attn_mask)[0], target_res))

    def test_get_cu_seqlens_from_pos_ids_1d(self):
        position_ids = torch.tensor([[0, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 4, 0, 1, 0, 0]])
        target_res = torch.tensor([0, 4, 7, 12, 14, 16], dtype=torch.int32)
        self.assertTrue(
            torch.allclose(get_cu_seqlens_from_pos_ids(position_ids)[0], target_res)
        )

    def test_get_cu_seqlens_from_pos_ids_2d(self):
        position_ids = torch.tensor(
            [
                [0, 1, 2, 3, 0, 1, 2, 0, 1, 2, 3, 4, 0, 1, 0, 0],
                [0, 1, 2, 3, 4, 0, 1, 2, 0, 1, 2, 3, 4, 5, 6, 0],
            ]
        )
        target_res = torch.tensor(
            [[0, 4, 7, 12, 14, 16], [0, 5, 8, 15, 16, 16]], dtype=torch.int32
        )
        self.assertTrue(
            torch.allclose(get_cu_seqlens_from_pos_ids(position_ids)[0], target_res)
        )

    def test_get_max_seqlen_in_batch(self):
        attn_mask = torch.tensor([[1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 0, 0]])
        target_res = torch.tensor([4, 3, 5, 2], dtype=torch.int32)
        self.assertTrue(torch.allclose(get_max_seqlen_in_batch(attn_mask), target_res))

    def test_get_unpad_data(self):
        attn_mask = torch.tensor([[1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 0, 0]])
        target_indices = torch.tensor([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13])
        target_cu_seqlen = torch.tensor([0, 4, 7, 12, 14], dtype=torch.int32)
        target_max_seqlen_in_batch = 5
        indices, cu_seqlen, max_seqlen_in_batch = get_unpad_data(attn_mask)
        self.assertTrue(torch.allclose(target_indices, indices))
        self.assertTrue(torch.allclose(target_cu_seqlen, cu_seqlen))
        self.assertEqual(target_max_seqlen_in_batch, max_seqlen_in_batch)

        attn_mask = torch.tensor(
            [
                [1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 0, 0],
                [1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 4, 4, 4, 5, 5, 5],
            ]
        )
        target_indices = torch.tensor(
            [
                0,
                1,
                2,
                3,
                4,
                5,
                6,
                7,
                8,
                9,
                10,
                11,
                12,
                13,
                16,
                17,
                18,
                19,
                20,
                21,
                22,
                23,
                24,
                25,
                26,
                27,
                28,
                29,
                30,
                31,
            ]
        )
        target_cu_seqlen = torch.tensor(
            [0, 4, 7, 12, 14, 17, 22, 24, 27, 30], dtype=torch.int32
        )
        target_max_seqlen_in_batch = 5
        indices, cu_seqlen, max_seqlen_in_batch = get_unpad_data(attn_mask)
        self.assertTrue(torch.allclose(target_indices, indices))
        self.assertTrue(torch.allclose(target_cu_seqlen, cu_seqlen))
        self.assertEqual(target_max_seqlen_in_batch, max_seqlen_in_batch)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/monkeypatch/test_pixtral_flash_attention_patch.py
================================================
"""Integration tests for Pixtral Flash Attention patches."""

import pytest
import torch


class TestPixtralFlashAttentionPatchIntegration:
    """Test Pixtral Flash Attention patch integration."""

    @pytest.mark.integration
    def test_pixtral_flash_attention_patch(self):
        """Test that Pixtral Flash Attention patch can be applied and works correctly."""
        try:
            from transformers import modeling_flash_attention_utils
        except ImportError:
            pytest.skip("Flash Attention utils not available")

        from axolotl.monkeypatch.models.pixtral.modeling_flash_attention_utils import (
            apply_patch_is_packed_sequence,
        )

        # Store original method
        original_is_packed_sequence = modeling_flash_attention_utils._is_packed_sequence

        # Apply patch and get unpatch function
        unpatch_fn = apply_patch_is_packed_sequence()

        # Verify patch was applied
        assert (
            modeling_flash_attention_utils._is_packed_sequence
            != original_is_packed_sequence
        ), "_is_packed_sequence was not patched"

        # Test the patched function with 1D position_ids
        patched_fn = modeling_flash_attention_utils._is_packed_sequence

        # Test 1D position_ids 1 sequence
        position_ids_1d = torch.tensor([0, 1, 2, 3])
        result = patched_fn(position_ids_1d, batch_size=1)
        assert isinstance(result, bool), "Function should return a boolean"
        assert result is False, "1D sequential position_ids should not be packed"

        # Test 1D packed 2 sequences
        position_ids_1d_packed = torch.tensor([0, 1, 2, 0, 1, 2])
        result = patched_fn(position_ids_1d_packed, batch_size=1)
        assert isinstance(result, bool), "Function should return a boolean"
        assert result is True, "1D packed position_ids should be detected as packed"

        # Test 2D packed 2 sequences
        position_ids_2d_packed = torch.tensor([[0, 1, 2, 3, 0, 1]])
        result = patched_fn(position_ids_2d_packed, batch_size=1)
        assert isinstance(result, bool), "Function should return a boolean"
        assert result is True, "2D packed position_ids should be detected as packed"

        # Test 2D 1 sequence
        position_ids_2d_normal = torch.tensor([[0, 1, 2, 3, 4, 5]])
        result = patched_fn(position_ids_2d_normal, batch_size=1)
        assert isinstance(result, bool), "Function should return a boolean"
        assert result is False, "2D sequential position_ids should not be packed"

        # Test 2D batch size 2
        position_ids_2d_normal = torch.tensor([[0, 1, 2, 3, 4, 5, 6, 7, 8]])
        result = patched_fn(position_ids_2d_normal, batch_size=2)
        assert isinstance(result, bool), "Function should return a boolean"
        assert result is False, "2D position_ids batch 2 should not be packed"

        # Test None case
        result = patched_fn(None, batch_size=1)
        assert isinstance(result, bool), "Function should return a boolean"
        assert result is False, "None position_ids should return False"

        # Test unpatch function
        unpatch_fn()
        assert (
            modeling_flash_attention_utils._is_packed_sequence
            == original_is_packed_sequence
        ), "unpatch function did not restore original method"


================================================
FILE: tests/monkeypatch/test_qwen3_next_modeling_patch.py
================================================
"""Integration tests for Qwen3 Next modeling patches."""

import pytest
import torch

# Skip entire module if qwen3_next not available
qwen3_next = pytest.importorskip("transformers.models.qwen3_next.modeling_qwen3_next")


class TestQwen3NextModelingPatchIntegration:
    """Test Qwen3 Next modeling patch integration."""

    @pytest.mark.integration
    def test_qwen3_next_decoder_layer_patch(self):
        """Test that Qwen3Next decoder layer patch can be applied."""
        from axolotl.monkeypatch.models.qwen3_next.modeling import (
            patch_qwen3_next_decoder_layer,
        )

        # Store original method
        original_forward = qwen3_next.Qwen3NextDecoderLayer.forward

        # Apply patch and get unpatch function
        unpatch_fn = patch_qwen3_next_decoder_layer()

        # Verify patch was applied
        assert qwen3_next.Qwen3NextDecoderLayer.forward != original_forward, (
            "decoder layer forward method was not patched"
        )

        # Verify the method is still callable
        assert callable(qwen3_next.Qwen3NextDecoderLayer.forward), (
            "Patched method is not callable"
        )

        # Test unpatch function
        if unpatch_fn:
            unpatch_fn()
            assert qwen3_next.Qwen3NextDecoderLayer.forward == original_forward, (
                "unpatch function did not restore original method"
            )

    @pytest.mark.integration
    def test_qwen3_next_gateddelta_layer_patch(self):
        """Test that Qwen3Next GatedDeltaNet patch can be applied."""
        from axolotl.monkeypatch.models.qwen3_next.modeling import (
            patch_qwen3_next_gateddelta_layer,
        )

        # Store original method
        original_forward = qwen3_next.Qwen3NextGatedDeltaNet.forward

        # Apply patch and get unpatch function
        unpatch_fn = patch_qwen3_next_gateddelta_layer()

        # Verify patch was applied
        assert qwen3_next.Qwen3NextGatedDeltaNet.forward != original_forward, (
            "GatedDeltaNet forward method was not patched"
        )

        # Verify the method is still callable
        assert callable(qwen3_next.Qwen3NextGatedDeltaNet.forward), (
            "Patched method is not callable"
        )

        # Test unpatch function
        if unpatch_fn:
            unpatch_fn()
            assert qwen3_next.Qwen3NextGatedDeltaNet.forward == original_forward, (
                "unpatch function did not restore original method"
            )

    @pytest.mark.integration
    def test_qwen3_next_imports_patch(self):
        """Test that Qwen3Next imports patch can be applied without errors."""
        from axolotl.monkeypatch.models.qwen3_next.modeling import (
            patch_qwen3_next_imports,
        )

        # Apply patch - should not raise any exceptions even if modules unavailable
        unpatch_fn = patch_qwen3_next_imports()

        # Test that unpatch function is returned (or None if skipped)
        assert unpatch_fn is None or callable(unpatch_fn), (
            "patch_qwen3_next_imports should return None or callable unpatch function"
        )

    @pytest.mark.integration
    def test_qwen3_next_modeling_packing_patch(self):
        """Test that all Qwen3Next modeling patches can be applied together."""
        from axolotl.monkeypatch.models.qwen3_next.modeling import (
            patch_qwen3_next_modeling_packing,
        )

        # This should not raise any exceptions
        patch_qwen3_next_modeling_packing()


@pytest.mark.integration
def test_get_cu_seqlens_utility():
    """Test the get_cu_seqlens utility function."""
    from axolotl.monkeypatch.models.qwen3_next.modeling import get_cu_seqlens

    # Test with simple position_ids
    position_ids = torch.tensor([[0, 1, 2, 0, 1]])
    cu_seqlens = get_cu_seqlens(position_ids)
    assert cu_seqlens.dtype == torch.int32, "Should be int32 dtype"

    # Should return tensor with start positions and total length
    expected = torch.tensor([0, 3, 5], dtype=torch.int32)
    assert torch.equal(cu_seqlens, expected), f"Expected {expected}, got {cu_seqlens}"


================================================
FILE: tests/monkeypatch/test_trainer_accelerator_args.py
================================================
"""
Unit tests for trainer accelerator args monkeypatch
"""

import unittest

from axolotl.monkeypatch.trainer_accelerator_args import (
    check_create_accelerate_code_is_patchable,
)


class TestTrainerAcceleratorArgs(unittest.TestCase):
    """
    Unit test class for trainer accelerator args monkeypatch
    """

    def test_check_create_accelerate_code_is_patchable(self):
        """
        Test that the upstream transformers code is still patchable.
        This will fail if the patched code changes upstream.
        """
        assert check_create_accelerate_code_is_patchable()


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/monkeypatch/test_trainer_context_parallel_patch.py
================================================
"""Tests for the HF Trainer context parallel patch."""

import pytest
from transformers import Trainer

from axolotl.monkeypatch.transformers.trainer_context_parallel import (
    GUARD_PATTERN,
    PATCHED_GUARD,
    patch_prepare_context_parallel_inputs,
)


@pytest.fixture
def restore_trainer_prepare_method():
    """Ensure Trainer._prepare_context_parallel_inputs is restored after a test."""
    original_method = getattr(
        Trainer,
        "_original_prepare_context_parallel_inputs",
        Trainer._prepare_context_parallel_inputs,
    )
    patched_attr_present = hasattr(
        Trainer, "_axolotl_prepare_context_parallel_inputs_patched"
    )

    yield

    Trainer._prepare_context_parallel_inputs = original_method
    if patched_attr_present:
        delattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched")
    if hasattr(Trainer, "_original_prepare_context_parallel_inputs"):
        delattr(Trainer, "_original_prepare_context_parallel_inputs")
    if hasattr(Trainer, "_axolotl_prepare_context_parallel_inputs_source"):
        delattr(Trainer, "_axolotl_prepare_context_parallel_inputs_source")


def test_patch_attention_guard(restore_trainer_prepare_method):
    """Patch should swap the guard to allow sdpa or flash attention."""
    # Ensure we start from the unpatched method
    if hasattr(Trainer, "_original_prepare_context_parallel_inputs"):
        Trainer._prepare_context_parallel_inputs = (
            Trainer._original_prepare_context_parallel_inputs
        )
        delattr(Trainer, "_original_prepare_context_parallel_inputs")
    if hasattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched"):
        delattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched")

    patch_prepare_context_parallel_inputs()

    patched_method = Trainer._prepare_context_parallel_inputs
    assert patched_method is not None
    assert getattr(Trainer, "_axolotl_prepare_context_parallel_inputs_patched", False)

    source = Trainer._axolotl_prepare_context_parallel_inputs_source
    assert GUARD_PATTERN not in source
    assert PATCHED_GUARD in source


def test_patch_is_idempotent(restore_trainer_prepare_method):
    """Calling the patch twice should leave the same patched function in place."""
    patch_prepare_context_parallel_inputs()
    first_patched = Trainer._prepare_context_parallel_inputs

    patch_prepare_context_parallel_inputs()
    second_patched = Trainer._prepare_context_parallel_inputs

    assert first_patched is second_patched


================================================
FILE: tests/monkeypatch/test_trainer_loss_calc.py
================================================
"""Unit tests for trainer loss calc monkeypatch."""

import unittest

from axolotl.monkeypatch.transformers.trainer_loss_calc import (
    check_evaluation_loop_is_patchable,
    check_maybe_log_save_evaluate_is_patchable,
)


class TestTrainerLossCalc(unittest.TestCase):
    """
    Unit test class for trainer loss calc monkeypatch
    """

    def test_trainer_loss_calc_is_patchable(self):
        """
        Test that the upstream transformers code is still patchable. This will fail if
        the patched code changes upstream.
        """
        assert check_evaluation_loop_is_patchable()
        assert check_maybe_log_save_evaluate_is_patchable()


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/monkeypatch/test_trl_vllm.py
================================================
"""Unit tests for TRL vLLM monkeypatches.

Tests:
- split_tensor_dict: scalar type preservation (int/float/bool)
- shuffle_sequence_dict: scalar type preservation
- extract_logprobs: NaN → 0.0 replacement
- VLLMClient.batch_update_named_params: method exists after patch
- VLLMGeneration: weight_sync_chunk_size attribute after patch
- Patch idempotency: applying patch twice doesn't break anything
"""

import unittest
from dataclasses import dataclass
from unittest.mock import MagicMock

import torch


class TestSplitTensorDict(unittest.TestCase):
    """Tests for patched split_tensor_dict."""

    def setUp(self):
        from axolotl.monkeypatch.trainer.trl_vllm import _patched_split_tensor_dict

        self.split = _patched_split_tensor_dict

    def test_scalar_int_preserved(self):
        d = {"a": torch.randn(4, 3), "count": 42}
        chunks = self.split(d, 2)
        self.assertEqual(len(chunks), 2)
        self.assertEqual(chunks[0]["count"], 42)
        self.assertEqual(chunks[1]["count"], 42)

    def test_scalar_float_preserved(self):
        d = {"a": torch.randn(6, 2), "lr": 1e-5}
        chunks = self.split(d, 3)
        for c in chunks:
            self.assertEqual(c["lr"], 1e-5)

    def test_scalar_bool_preserved(self):
        d = {"a": torch.randn(4, 2), "flag": True}
        chunks = self.split(d, 2)
        for c in chunks:
            self.assertTrue(c["flag"])

    def test_none_preserved(self):
        d = {"a": torch.randn(4, 2), "b": None}
        chunks = self.split(d, 2)
        for c in chunks:
            self.assertIsNone(c["b"])

    def test_tensor_split(self):
        t = torch.arange(8).reshape(4, 2)
        d = {"a": t, "n": 10}
        chunks = self.split(d, 2)
        self.assertEqual(chunks[0]["a"].shape, (2, 2))
        self.assertEqual(chunks[1]["a"].shape, (2, 2))
        torch.testing.assert_close(chunks[0]["a"], t[:2])
        torch.testing.assert_close(chunks[1]["a"], t[2:])

    def test_0d_tensor_preserved(self):
        d = {"a": torch.randn(4, 2), "scalar_t": torch.tensor(3.14)}
        chunks = self.split(d, 2)
        for c in chunks:
            self.assertAlmostEqual(c["scalar_t"].item(), 3.14, places=5)

    def test_list_split(self):
        d = {"a": torch.randn(4, 2), "names": ["a", "b", "c", "d"]}
        chunks = self.split(d, 2)
        self.assertEqual(chunks[0]["names"], ["a", "b"])
        self.assertEqual(chunks[1]["names"], ["c", "d"])


class TestShuffleSequenceDict(unittest.TestCase):
    """Tests for patched shuffle_sequence_dict."""

    def setUp(self):
        from axolotl.monkeypatch.trainer.trl_vllm import _patched_shuffle_sequence_dict

        self.shuffle = _patched_shuffle_sequence_dict

    def test_scalar_int_preserved(self):
        d = {"a": torch.randn(4, 3), "count": 42}
        result = self.shuffle(d)
        self.assertEqual(result["count"], 42)

    def test_scalar_float_preserved(self):
        d = {"a": torch.randn(4, 3), "lr": 1e-5}
        result = self.shuffle(d)
        self.assertEqual(result["lr"], 1e-5)

    def test_scalar_bool_preserved(self):
        d = {"a": torch.randn(4, 3), "flag": False}
        result = self.shuffle(d)
        self.assertFalse(result["flag"])

    def test_none_preserved(self):
        d = {"a": torch.randn(4, 3), "b": None}
        result = self.shuffle(d)
        self.assertIsNone(result["b"])

    def test_tensor_permuted(self):
        torch.manual_seed(42)
        t = torch.arange(4).float()
        d = {"a": t}
        result = self.shuffle(d)
        # Same elements, possibly different order
        self.assertEqual(sorted(result["a"].tolist()), sorted(t.tolist()))
        self.assertEqual(result["a"].shape, t.shape)

    def test_list_permuted(self):
        torch.manual_seed(42)
        d = {"a": torch.randn(3, 2), "names": ["x", "y", "z"]}
        result = self.shuffle(d)
        self.assertEqual(sorted(result["names"]), ["x", "y", "z"])
        self.assertEqual(len(result["names"]), 3)

    def test_0d_tensor_preserved(self):
        d = {"a": torch.randn(4, 2), "scalar_t": torch.tensor(3.14)}
        result = self.shuffle(d)
        self.assertAlmostEqual(result["scalar_t"].item(), 3.14, places=5)


class TestExtractLogprobs(unittest.TestCase):
    """Tests for patched extract_logprobs (NaN → 0.0)."""

    def setUp(self):
        from axolotl.monkeypatch.trainer.trl_vllm import _patched_extract_logprobs

        self.extract = _patched_extract_logprobs

    def _make_output(self, logprob_values):
        """Create a mock vLLM RequestOutput with given logprob values."""

        @dataclass
        class LogprobItem:
            logprob: float
            rank: int

        @dataclass
        class SeqOutput:
            logprobs: list[dict[int, LogprobItem]] | None

        @dataclass
        class RequestOutput:
            outputs: list[SeqOutput]

        logprobs_list = []
        for vals in logprob_values:
            lp_dict = {i: LogprobItem(logprob=v, rank=i) for i, v in enumerate(vals)}
            logprobs_list.append(lp_dict)

        return RequestOutput(outputs=[SeqOutput(logprobs=logprobs_list)])

    def test_nan_replaced_with_zero(self):
        output = self._make_output([[float("nan"), 0.5], [-0.3, float("nan")]])
        logprobs, token_ids = self.extract([output])
        self.assertEqual(logprobs[0][0][0], 0.0)  # NaN → 0.0
        self.assertEqual(logprobs[0][0][1], 0.5)
        self.assertEqual(logprobs[0][1][0], -0.3)
        self.assertEqual(logprobs[0][1][1], 0.0)  # NaN → 0.0

    def test_normal_values_preserved(self):
        output = self._make_output([[-0.5, -1.2], [-0.1, -2.0]])
        logprobs, token_ids = self.extract([output])
        self.assertAlmostEqual(logprobs[0][0][0], -0.5)
        self.assertAlmostEqual(logprobs[0][0][1], -1.2)

    def test_none_logprobs_returns_none(self):
        @dataclass
        class SeqOutput:
            logprobs: None = None

        @dataclass
        class RequestOutput:
            outputs: list

        output = RequestOutput(outputs=[SeqOutput()])
        logprobs, token_ids = self.extract([output])
        self.assertIsNone(logprobs)
        self.assertIsNone(token_ids)

    def test_token_ids_extracted(self):
        output = self._make_output([[-0.5]])
        logprobs, token_ids = self.extract([output])
        self.assertEqual(token_ids[0][0], [0])  # token_id=0 from enumerate


class TestPatchApplication(unittest.TestCase):
    """Tests for patch_trl_vllm() application."""

    def test_batch_update_added_to_client(self):
        from axolotl.monkeypatch.trainer.trl_vllm import patch_trl_vllm

        patch_trl_vllm()
        from trl.generation.vllm_client import VLLMClient

        self.assertTrue(hasattr(VLLMClient, "batch_update_named_params"))

    def test_extract_logprobs_patched(self):
        from axolotl.monkeypatch.trainer.trl_vllm import (
            _patched_extract_logprobs,
            patch_trl_vllm,
        )

        patch_trl_vllm()
        from trl.generation import vllm_generation

        self.assertIs(vllm_generation.extract_logprobs, _patched_extract_logprobs)

    def test_utils_patched(self):
        from axolotl.monkeypatch.trainer.trl_vllm import (
            _patched_shuffle_sequence_dict,
            _patched_split_tensor_dict,
            patch_trl_vllm,
        )

        patch_trl_vllm()
        import trl.trainer.utils

        self.assertIs(trl.trainer.utils.split_tensor_dict, _patched_split_tensor_dict)
        self.assertIs(
            trl.trainer.utils.shuffle_sequence_dict, _patched_shuffle_sequence_dict
        )

    def test_patch_idempotent(self):
        from axolotl.monkeypatch.trainer.trl_vllm import patch_trl_vllm

        patch_trl_vllm()
        patch_trl_vllm()  # second call should not error
        from trl.generation.vllm_client import VLLMClient

        self.assertTrue(hasattr(VLLMClient, "batch_update_named_params"))


class TestBatchUpdateChunking(unittest.TestCase):
    """Tests for batch_update_named_params chunking logic."""

    def test_no_chunk_single_batch(self):
        from axolotl.monkeypatch.trainer.trl_vllm import _batch_update_named_params

        # Test that with chunk_size=None, all params go in one chunk
        client = MagicMock()
        client.base_url = "http://localhost:8000"
        client.session.post.return_value = MagicMock(status_code=200)
        client.communicator = MagicMock()
        client.communicator.group = MagicMock()
        client.rank = 0

        params = [
            ("layer.0.weight", torch.randn(10, 10)),
            ("layer.1.weight", torch.randn(10, 10)),
        ]
        _batch_update_named_params(client, params, chunk_size=None)

        # Should make exactly 1 HTTP call
        self.assertEqual(client.session.post.call_count, 1)

    def test_chunk_splits_params(self):
        from axolotl.monkeypatch.trainer.trl_vllm import _batch_update_named_params

        client = MagicMock()
        client.base_url = "http://localhost:8000"
        client.session.post.return_value = MagicMock(status_code=200)
        client.communicator = MagicMock()
        client.communicator.group = MagicMock()
        client.rank = 0

        params = [
            ("a", torch.randn(100)),  # 100 elements
            ("b", torch.randn(100)),  # 100 elements
            ("c", torch.randn(100)),  # 100 elements
        ]
        _batch_update_named_params(client, params, chunk_size=150)

        # Should make 2 HTTP calls: [a,b] then [c] (100+100 > 150 triggers split)
        # Actually: a=100 < 150, a+b=200 > 150 → chunk [a], then b=100 < 150,
        # b+c=200 > 150 → chunk [b], then [c]. So 3 calls.
        # Wait: first a added (100 < 150), then b: 100+100=200 > 150, so chunk=[a],
        # new chunk starts with b (100 < 150), then c: 100+100=200 > 150, so chunk=[b],
        # final chunk=[c]. 3 HTTP calls.
        self.assertEqual(client.session.post.call_count, 3)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/monkeypatch/test_voxtral_modeling_patch.py
================================================
"""Integration tests for Voxtral modeling patches."""

import pytest


class TestVoxtralModelingPatchIntegration:
    """Test Voxtral modeling patch integration."""

    @pytest.mark.integration
    def test_voxtral_conditional_generation_patch(self):
        """Test that Voxtral conditional generation patch can be applied."""
        try:
            from transformers.models.voxtral.modeling_voxtral import (
                VoxtralForConditionalGeneration,
            )
        except ImportError:
            pytest.skip("VoxtralForConditionalGeneration not available")

        from axolotl.monkeypatch.models.voxtral.modeling import (
            patch_voxtral_conditional_generation_forward,
        )

        # Store original method
        original_forward = VoxtralForConditionalGeneration.forward

        # Apply patch and get unpatch function
        unpatch_fn = patch_voxtral_conditional_generation_forward()

        # Verify patch was applied
        assert VoxtralForConditionalGeneration.forward != original_forward, (
            "forward method was not patched"
        )

        # Verify the method is still callable
        assert callable(VoxtralForConditionalGeneration.forward), (
            "Patched method is not callable"
        )

        # Test unpatch function
        unpatch_fn()
        assert VoxtralForConditionalGeneration.forward == original_forward, (
            "unpatch function did not restore original method"
        )


================================================
FILE: tests/patched/test_validation.py
================================================
"""Module for testing the validation module"""

import os
import warnings
from typing import Optional

import pytest
from pydantic import ValidationError

from axolotl.loaders.utils import check_model_config
from axolotl.utils import is_comet_available
from axolotl.utils.config import validate_config
from axolotl.utils.dict import DictDefault
from axolotl.utils.mlflow_ import setup_mlflow_env_vars
from axolotl.utils.schemas.config import AxolotlConfigWCapabilities
from axolotl.utils.wandb_ import setup_wandb_env_vars

warnings.filterwarnings("error")


@pytest.fixture(name="minimal_cfg")
def fixture_cfg():
    return DictDefault(
        {
            "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
            "learning_rate": 0.000001,
            "datasets": [
                {
                    "path": "mhenrichsen/alpaca_2k_test",
                    "type": "alpaca",
                }
            ],
            "micro_batch_size": 1,
            "gradient_accumulation_steps": 1,
        }
    )


class BaseValidation:
    """
    Base validation module to setup the log capture
    """

    _caplog: Optional[pytest.LogCaptureFixture] = None

    @pytest.fixture(autouse=True)
    def inject_fixtures(self, caplog):
        self._caplog = caplog


class TestValidation(BaseValidation):
    """
    Test the validation module
    """

    def test_defaults(self, minimal_cfg):
        test_cfg = DictDefault(
            {
                "weight_decay": None,
            }
            | minimal_cfg
        )
        cfg = validate_config(test_cfg)

        assert cfg.train_on_inputs is False
        assert cfg.weight_decay is None

    def test_zero3_qlora_use_reentrant_false(self, minimal_cfg):
        test_cfg = DictDefault(
            {
                "deepspeed": "deepspeed_configs/zero3_bf16.json",
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": False},
                "load_in_4bit": True,
                "adapter": "qlora",
            }
            | minimal_cfg
        )

        with self._caplog.at_level("WARNING"):
            validate_config(test_cfg)
            assert (
                "qlora + zero3 with use_reentrant: false may result in a CheckpointError about recomputed values"
                in self._caplog.records[0].message
            )

    def test_deepspeed_empty(self, minimal_cfg):
        test_cfg = DictDefault(
            {
                "deepspeed": "",
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": False},
                "load_in_4bit": True,
                "adapter": "qlora",
            }
            | minimal_cfg
        )

        _ = validate_config(test_cfg)

    def test_deepspeed_not_set(self, minimal_cfg):
        test_cfg = DictDefault(
            {
                "deepspeed": None,
                "gradient_checkpointing": True,
                "gradient_checkpointing_kwargs": {"use_reentrant": False},
                "load_in_4bit": True,
                "adapter": "qlora",
            }
            | minimal_cfg
        )

        _ = validate_config(test_cfg)

    def test_datasets_min_length(self):
        cfg = DictDefault(
            {
                "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
                "learning_rate": 0.000001,
                "datasets": [],
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
            }
        )

        with pytest.raises(
            ValidationError,
            match=r".*List should have at least 1 item after validation*",
        ):
            validate_config(cfg)

    def test_datasets_min_length_empty(self):
        cfg = DictDefault(
            {
                "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
                "learning_rate": 0.000001,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
            }
        )

        with pytest.raises(
            ValueError, match=r".*either datasets or pretraining_dataset is required*"
        ):
            validate_config(cfg)

    def test_pretrain_dataset_min_length(self):
        cfg = DictDefault(
            {
                "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
                "learning_rate": 0.000001,
                "pretraining_dataset": [],
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "max_steps": 100,
            }
        )

        with pytest.raises(
            ValidationError,
            match=r".*List should have at least 1 item after validation*",
        ):
            validate_config(cfg)

    def test_valid_pretrain_dataset(self):
        cfg = DictDefault(
            {
                "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
                "learning_rate": 0.000001,
                "pretraining_dataset": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    }
                ],
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "max_steps": 100,
            }
        )

        validate_config(cfg)

    def test_valid_sft_dataset(self):
        cfg = DictDefault(
            {
                "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
                "learning_rate": 0.000001,
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    }
                ],
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
            }
        )

        validate_config(cfg)

    def test_batch_size_unused_warning(self):
        cfg = DictDefault(
            {
                "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
                "learning_rate": 0.000001,
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    }
                ],
                "micro_batch_size": 4,
                "batch_size": 32,
            }
        )

        with self._caplog.at_level("WARNING"):
            validate_config(cfg)
            assert "batch_size is not recommended" in self._caplog.records[0].message

    def test_batch_size_more_params(self):
        cfg = DictDefault(
            {
                "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
                "learning_rate": 0.000001,
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    }
                ],
                "batch_size": 32,
            }
        )

        with pytest.raises(ValueError, match=r".*At least two of*"):
            validate_config(cfg)

    def test_lr_as_float(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "learning_rate": "5e-5",
                }
            )
            | minimal_cfg
        )

        new_cfg = validate_config(cfg)

        assert new_cfg.learning_rate == 0.00005

    def test_model_config_remap(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "model_config": {"model_type": "mistral"},
                }
            )
            | minimal_cfg
        )

        new_cfg = validate_config(cfg)
        assert new_cfg.overrides_of_model_config["model_type"] == "mistral"

    def test_model_type_remap(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "model_type": "AutoModelForCausalLM",
                }
            )
            | minimal_cfg
        )

        new_cfg = validate_config(cfg)
        assert new_cfg.type_of_model == "AutoModelForCausalLM"

    def test_reward_model_defaults(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "reward_model": True,
                }
            )
            | minimal_cfg
        )

        new_cfg = validate_config(cfg)
        assert new_cfg.num_labels == 1
        assert new_cfg.type_of_model == "AutoModelForSequenceClassification"

    def test_process_reward_model_defaults(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "process_reward_model": True,
                }
            )
            | minimal_cfg
        )

        new_cfg = validate_config(cfg)
        assert new_cfg.num_labels == 2
        assert new_cfg.type_of_model == "AutoModelForTokenClassification"

    def test_model_revision_remap(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "model_revision": "main",
                }
            )
            | minimal_cfg
        )

        new_cfg = validate_config(cfg)
        assert new_cfg.revision_of_model == "main"

    def test_qlora(self, minimal_cfg):
        base_cfg = (
            DictDefault(
                {
                    "adapter": "qlora",
                }
            )
            | minimal_cfg
        )

        cfg = (
            DictDefault(
                {
                    "load_in_8bit": True,
                }
            )
            | base_cfg
        )

        with pytest.raises(ValueError, match=r".*8bit.*"):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "gptq": True,
                }
            )
            | base_cfg
        )

        with pytest.raises(ValueError, match=r".*gptq.*"):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "load_in_4bit": False,
                }
            )
            | base_cfg
        )

        with pytest.raises(ValueError, match=r".*4bit.*"):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "load_in_4bit": True,
                }
            )
            | base_cfg
        )

        validate_config(cfg)

    def test_qlora_merge(self, minimal_cfg):
        base_cfg = (
            DictDefault(
                {
                    "adapter": "qlora",
                    "merge_lora": True,
                }
            )
            | minimal_cfg
        )

        cfg = (
            DictDefault(
                {
                    "load_in_8bit": True,
                }
            )
            | base_cfg
        )

        with pytest.raises(ValueError, match=r".*8bit.*"):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "gptq": True,
                }
            )
            | base_cfg
        )

        with pytest.raises(ValueError, match=r".*gptq.*"):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "load_in_4bit": True,
                }
            )
            | base_cfg
        )

        with pytest.raises(ValueError, match=r".*4bit.*"):
            validate_config(cfg)

    def test_hf_use_auth_token(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "push_dataset_to_hub": "namespace/repo",
                }
            )
            | minimal_cfg
        )

        with pytest.raises(ValueError, match=r".*hf_use_auth_token.*"):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "push_dataset_to_hub": "namespace/repo",
                    "hf_use_auth_token": True,
                }
            )
            | minimal_cfg
        )
        validate_config(cfg)

    def test_gradient_accumulations_or_batch_size(self):
        cfg = DictDefault(
            {
                "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
                "learning_rate": 0.000001,
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    }
                ],
                "gradient_accumulation_steps": 1,
                "batch_size": 1,
            }
        )

        with pytest.raises(
            ValueError, match=r".*gradient_accumulation_steps or batch_size.*"
        ):
            validate_config(cfg)

    def test_falcon_fsdp(self, minimal_cfg):
        regex_exp = r".*FSDP is not supported for falcon models.*"

        # Check for lower-case
        cfg = (
            DictDefault(
                {
                    "base_model": "tiiuae/falcon-7b",
                    "fsdp": ["full_shard", "auto_wrap"],
                }
            )
            | minimal_cfg
        )

        with pytest.raises(ValueError, match=regex_exp):
            validate_config(cfg)

        # Check for upper-case
        cfg = (
            DictDefault(
                {
                    "base_model": "Falcon-7b",
                    "fsdp": ["full_shard", "auto_wrap"],
                }
            )
            | minimal_cfg
        )

        with pytest.raises(ValueError, match=regex_exp):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "base_model": "tiiuae/falcon-7b",
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

    def test_mpt_gradient_checkpointing(self, minimal_cfg):
        regex_exp = r".*gradient_checkpointing is not supported for MPT models*"

        # Check for lower-case
        cfg = (
            DictDefault(
                {
                    "base_model": "mosaicml/mpt-7b",
                    "gradient_checkpointing": True,
                }
            )
            | minimal_cfg
        )

        with pytest.raises(ValueError, match=regex_exp):
            validate_config(cfg)

    def test_flash_optimum(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "flash_optimum": True,
                    "adapter": "lora",
                    "bf16": False,
                }
            )
            | minimal_cfg
        )

        with self._caplog.at_level("WARNING"):
            validate_config(cfg)
            assert any(
                "BetterTransformers probably doesn't work with PEFT adapters"
                in record.message
                for record in self._caplog.records
            )

        cfg = (
            DictDefault(
                {
                    "flash_optimum": True,
                    "bf16": False,
                }
            )
            | minimal_cfg
        )

        with self._caplog.at_level("WARNING"):
            validate_config(cfg)
            assert any(
                "probably set bfloat16 or float16" in record.message
                for record in self._caplog.records
            )

        cfg = (
            DictDefault(
                {
                    "flash_optimum": True,
                    "fp16": True,
                }
            )
            | minimal_cfg
        )
        regex_exp = r".*AMP is not supported.*"

        with pytest.raises(ValueError, match=regex_exp):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "flash_optimum": True,
                    "bf16": True,
                }
            )
            | minimal_cfg
        )
        regex_exp = r".*AMP is not supported.*"

        with pytest.raises(ValueError, match=regex_exp):
            validate_config(cfg)

    def test_adamw_hyperparams(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "optimizer": None,
                    "adam_epsilon": 0.0001,
                }
            )
            | minimal_cfg
        )

        with self._caplog.at_level("WARNING"):
            validate_config(cfg)
            assert any(
                "adamw hyperparameters found, but no adamw optimizer set"
                in record.message
                for record in self._caplog.records
            )

        cfg = (
            DictDefault(
                {
                    "optimizer": "adafactor",
                    "adam_beta1": 0.0001,
                }
            )
            | minimal_cfg
        )

        with self._caplog.at_level("WARNING"):
            validate_config(cfg)
            assert any(
                "adamw hyperparameters found, but no adamw optimizer set"
                in record.message
                for record in self._caplog.records
            )

        cfg = (
            DictDefault(
                {
                    "optimizer": "adamw_bnb_8bit",
                    "adam_beta1": 0.9,
                    "adam_beta2": 0.99,
                    "adam_epsilon": 0.0001,
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "optimizer": "adafactor",
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

    def test_deprecated_packing(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "max_packed_sequence_len": 1024,
                }
            )
            | minimal_cfg
        )
        with pytest.raises(
            DeprecationWarning,
            match=r"`max_packed_sequence_len` is no longer supported",
        ):
            validate_config(cfg)

    def test_packing(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "sample_packing": True,
                    "pad_to_sequence_len": False,
                    "flash_attention": True,
                }
            )
            | minimal_cfg
        )
        with self._caplog.at_level("WARNING"):
            validate_config(cfg)
            assert any(
                "`pad_to_sequence_len: true` is recommended when using sample_packing"
                in record.message
                for record in self._caplog.records
            )

    def test_packing_autoset(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "sample_packing": True,
                    "pad_to_sequence_len": None,
                    "flash_attention": True,
                }
            )
            | minimal_cfg
        )
        with self._caplog.at_level("INFO"):
            cfg = validate_config(cfg)
            assert any(
                "Setting `pad_to_sequence_len: true` to prevent memory leaks when sample_packing"
                in record.message
                for record in self._caplog.records
            )
            assert cfg.pad_to_sequence_len is True

    def test_merge_lora_no_bf16_fail(self, minimal_cfg):
        """
        This is assumed to be run on a CPU machine, so bf16 is not supported.
        """

        cfg = (
            DictDefault(
                {
                    "bf16": True,
                    "capabilities": {"bf16": False},
                    "env_capabilities": {
                        "torch_version": "2.6.0",
                    },
                }
            )
            | minimal_cfg
        )

        with pytest.raises(ValueError, match=r".*AMP is not supported on this GPU*"):
            AxolotlConfigWCapabilities(**cfg.to_dict())

        cfg = (
            DictDefault(
                {
                    "bf16": True,
                    "merge_lora": True,
                    "capabilities": {"bf16": False},
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

    def test_no_conflict_save_strategy(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "save_strategy": "epoch",
                    "save_steps": 10,
                }
            )
            | minimal_cfg
        )

        with pytest.raises(
            ValueError, match=r".*save_strategy and save_steps mismatch.*"
        ):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "save_strategy": "no",
                    "save_steps": 10,
                }
            )
            | minimal_cfg
        )

        with pytest.raises(
            ValueError, match=r".*save_strategy and save_steps mismatch.*"
        ):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "save_strategy": "steps",
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "save_strategy": "steps",
                    "save_steps": 10,
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "save_steps": 10,
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "save_strategy": "no",
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

    def test_no_conflict_eval_strategy(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "eval_strategy": "epoch",
                    "eval_steps": 10,
                }
            )
            | minimal_cfg
        )

        with pytest.raises(
            ValueError, match=r".*eval_strategy and eval_steps mismatch.*"
        ):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "eval_strategy": "no",
                    "eval_steps": 10,
                }
            )
            | minimal_cfg
        )

        with pytest.raises(
            ValueError, match=r".*eval_strategy and eval_steps mismatch.*"
        ):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "eval_strategy": "steps",
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "eval_strategy": "steps",
                    "eval_steps": 10,
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "eval_steps": 10,
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "eval_strategy": "no",
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "eval_strategy": "epoch",
                    "val_set_size": 0,
                }
            )
            | minimal_cfg
        )

        with pytest.raises(
            ValueError,
            match=r".*eval_steps and eval_strategy are not supported with val_set_size == 0.*",
        ):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "eval_steps": 10,
                    "val_set_size": 0,
                }
            )
            | minimal_cfg
        )

        with pytest.raises(
            ValueError,
            match=r".*eval_steps and eval_strategy are not supported with val_set_size == 0.*",
        ):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "val_set_size": 0,
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "eval_steps": 10,
                    "val_set_size": 0.01,
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "eval_strategy": "epoch",
                    "val_set_size": 0.01,
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

    def test_eval_table_size_conflict_eval_packing(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "sample_packing": True,
                    "eval_table_size": 100,
                    "flash_attention": True,
                }
            )
            | minimal_cfg
        )

        with pytest.raises(
            ValueError, match=r".*Please set 'eval_sample_packing' to false.*"
        ):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "sample_packing": True,
                    "eval_sample_packing": False,
                    "flash_attention": True,
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "sample_packing": False,
                    "eval_table_size": 100,
                    "flash_attention": True,
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "sample_packing": True,
                    "eval_table_size": 100,
                    "eval_sample_packing": False,
                    "flash_attention": True,
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

    def test_load_in_x_bit_without_adapter(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "load_in_4bit": True,
                }
            )
            | minimal_cfg
        )

        with pytest.raises(
            ValueError,
            match=r".*load_in_8bit and load_in_4bit are not supported without setting an adapter.*",
        ):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "load_in_8bit": True,
                }
            )
            | minimal_cfg
        )

        with pytest.raises(
            ValueError,
            match=r".*load_in_8bit and load_in_4bit are not supported without setting an adapter.*",
        ):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "load_in_4bit": True,
                    "adapter": "qlora",
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "load_in_8bit": True,
                    "adapter": "lora",
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

    def test_warmup_step_no_conflict(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "warmup_steps": 10,
                    "warmup_ratio": 0.1,
                }
            )
            | minimal_cfg
        )

        with pytest.raises(
            ValueError,
            match=r".*warmup_steps and warmup_ratio are mutually exclusive*",
        ):
            validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "warmup_steps": 10,
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

        cfg = (
            DictDefault(
                {
                    "warmup_ratio": 0.1,
                }
            )
            | minimal_cfg
        )

        validate_config(cfg)

    def test_unfrozen_parameters_w_peft_layers_to_transform(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "adapter": "lora",
                    "unfrozen_parameters": [
                        "model.layers.2[0-9]+.block_sparse_moe.gate.*"
                    ],
                    "peft_layers_to_transform": [0, 1],
                }
            )
            | minimal_cfg
        )

        with pytest.raises(
            ValueError,
            match=r".*can have unexpected behavior*",
        ):
            validate_config(cfg)

    def test_hub_model_id_save_value_warns_save_stragey_no(self, minimal_cfg):
        cfg = DictDefault({"hub_model_id": "test", "save_strategy": "no"}) | minimal_cfg

        with self._caplog.at_level("WARNING"):
            validate_config(cfg)
            assert len(self._caplog.records) == 1

    def test_hub_model_id_save_value_warns_random_value(self, minimal_cfg):
        cfg = (
            DictDefault({"hub_model_id": "test", "save_strategy": "test"}) | minimal_cfg
        )

        with self._caplog.at_level("WARNING"):
            validate_config(cfg)
            assert len(self._caplog.records) == 1

    def test_hub_model_id_save_value_steps(self, minimal_cfg):
        cfg = (
            DictDefault({"hub_model_id": "test", "save_strategy": "steps"})
            | minimal_cfg
        )

        with self._caplog.at_level("WARNING"):
            validate_config(cfg)
            assert len(self._caplog.records) == 0

    def test_hub_model_id_save_value_epochs(self, minimal_cfg):
        cfg = (
            DictDefault({"hub_model_id": "test", "save_strategy": "epoch"})
            | minimal_cfg
        )

        with self._caplog.at_level("WARNING"):
            validate_config(cfg)
            assert len(self._caplog.records) == 0

    def test_hub_model_id_save_value_none(self, minimal_cfg):
        cfg = DictDefault({"hub_model_id": "test", "save_strategy": None}) | minimal_cfg

        with self._caplog.at_level("WARNING"):
            validate_config(cfg)
            assert len(self._caplog.records) == 0

    def test_hub_model_id_save_value_no_set_save_strategy(self, minimal_cfg):
        cfg = DictDefault({"hub_model_id": "test"}) | minimal_cfg

        with self._caplog.at_level("WARNING"):
            validate_config(cfg)
            assert len(self._caplog.records) == 0

    def test_dpo_beta_deprecation(self, minimal_cfg):
        cfg = DictDefault({"dpo_beta": 0.2}) | minimal_cfg

        with self._caplog.at_level("WARNING"):
            new_cfg = validate_config(cfg)
            assert new_cfg["rl_beta"] == 0.2
            assert new_cfg["dpo_beta"] is None
            assert len(self._caplog.records) == 1

    def test_eval_strategy_remap(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "evaluation_strategy": "steps",
                }
            )
            | minimal_cfg
        )

        with self._caplog.at_level("WARNING"):
            new_cfg = validate_config(cfg)
            assert new_cfg.eval_strategy == "steps"
            assert (
                "evaluation_strategy is deprecated, use eval_strategy instead"
                in self._caplog.records[0].message
            )

    def test_torch_version_adopt_req(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "optimizer": "adopt_adamw",
                }
            )
            | minimal_cfg
        )

        with pytest.raises(
            ValueError,
            match=r".*ADOPT optimizer is incompatible with torch version*",
        ):
            env_capabilities = {"torch_version": "2.3.0"}
            capabilities = {"bf16": False}
            _ = validate_config(
                cfg, capabilities=capabilities, env_capabilities=env_capabilities
            )

        env_capabilities = {"torch_version": "2.6.0"}
        capabilities = {"bf16": False}
        _ = validate_config(
            cfg, capabilities=capabilities, env_capabilities=env_capabilities
        )

        env_capabilities = {"torch_version": "2.5.2"}
        capabilities = {"bf16": False}
        _ = validate_config(
            cfg, capabilities=capabilities, env_capabilities=env_capabilities
        )

    def test_cfg_throws_error_with_s2_attention_and_sample_packing(self, minimal_cfg):
        test_cfg = DictDefault(
            {
                "s2_attention": True,
                "sample_packing": True,
            }
            | minimal_cfg
        )
        with pytest.raises(
            ValidationError,
            match=r".*shifted-sparse attention does not currently support sample packing*",
        ):
            validate_config(test_cfg)


class TestTorchCompileValidation(BaseValidation):
    """
    test suite for when torch_compile is set to 'auto'
    """

    def test_torch_compile_auto(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "torch_compile": "auto",
                }
            )
            | minimal_cfg
        )

        env_capabilities = {"torch_version": "2.6.0"}
        capabilities = {"bf16": True}
        updated_cfg = validate_config(
            cfg, capabilities=capabilities, env_capabilities=env_capabilities
        )

        assert updated_cfg.torch_compile is True

        env_capabilities = {"torch_version": "2.4.1"}
        capabilities = {"bf16": True}
        updated_cfg = validate_config(
            cfg, capabilities=capabilities, env_capabilities=env_capabilities
        )

        assert updated_cfg.torch_compile is False

        env_capabilities = {}
        capabilities = {"bf16": True}
        updated_cfg = validate_config(
            cfg, capabilities=capabilities, env_capabilities=env_capabilities
        )

        assert updated_cfg.torch_compile is False


class TestSampleOptimConfigValidation(BaseValidation):
    """
    test configurations for sample optimizations like batch flattening
    """

    def test_batch_flattening_auto_enables(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "flash_attention": True,
                    "sample_packing": None,
                    "micro_batch_size": 2,
                    "batch_flattening": "auto",
                }
            )
            | minimal_cfg
        )

        new_cfg = validate_config(cfg)
        assert new_cfg["batch_flattening"] is True

    def test_batch_flattening_auto_no_fa(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "flash_attention": False,
                    "sample_packing": None,
                    "micro_batch_size": 2,
                    "batch_flattening": "auto",
                }
            )
            | minimal_cfg
        )

        new_cfg = validate_config(cfg)
        assert new_cfg["batch_flattening"] is False

    def test_batch_flattening_auto_mbsz_1(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "flash_attention": True,
                    "sample_packing": None,
                    "micro_batch_size": 1,
                    "batch_flattening": "auto",
                }
            )
            | minimal_cfg
        )

        new_cfg = validate_config(cfg)
        assert new_cfg["batch_flattening"] is False

    def test_batch_flattening_auto_packing(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "flash_attention": True,
                    "sample_packing": True,
                    "micro_batch_size": 2,
                    "batch_flattening": "auto",
                }
            )
            | minimal_cfg
        )

        new_cfg = validate_config(cfg)
        assert new_cfg["batch_flattening"] is False


class TestValidationCheckModelConfig(BaseValidation):
    """
    Test the validation for the config when the model config is available
    """

    def test_llama_add_tokens_adapter(self, minimal_cfg):
        cfg = (
            DictDefault(
                {"adapter": "qlora", "load_in_4bit": True, "tokens": ["<|imstart|>"]}
            )
            | minimal_cfg
        )
        model_config = DictDefault({"model_type": "llama"})

        with pytest.raises(
            ValueError,
            match=r".*`lora_modules_to_save` not properly set when adding new tokens*",
        ):
            check_model_config(cfg, model_config)

        cfg = (
            DictDefault(
                {
                    "adapter": "qlora",
                    "load_in_4bit": True,
                    "tokens": ["<|imstart|>"],
                    "lora_modules_to_save": ["embed_tokens"],
                }
            )
            | minimal_cfg
        )

        with pytest.raises(
            ValueError,
            match=r".*`lora_modules_to_save` not properly set when adding new tokens*",
        ):
            check_model_config(cfg, model_config)

        cfg = (
            DictDefault(
                {
                    "adapter": "qlora",
                    "load_in_4bit": True,
                    "tokens": ["<|imstart|>"],
                    "lora_modules_to_save": ["embed_tokens", "lm_head"],
                }
            )
            | minimal_cfg
        )

        check_model_config(cfg, model_config)

    def test_phi_add_tokens_adapter(self, minimal_cfg):
        cfg = (
            DictDefault(
                {"adapter": "qlora", "load_in_4bit": True, "tokens": ["<|imstart|>"]}
            )
            | minimal_cfg
        )
        model_config = DictDefault({"model_type": "phi"})

        with pytest.raises(
            ValueError,
            match=r".*`lora_modules_to_save` not properly set when adding new tokens*",
        ):
            check_model_config(cfg, model_config)

        cfg = (
            DictDefault(
                {
                    "adapter": "qlora",
                    "load_in_4bit": True,
                    "tokens": ["<|imstart|>"],
                    "lora_modules_to_save": ["embd.wte", "lm_head.linear"],
                }
            )
            | minimal_cfg
        )

        with pytest.raises(
            ValueError,
            match=r".*`lora_modules_to_save` not properly set when adding new tokens*",
        ):
            check_model_config(cfg, model_config)

        cfg = (
            DictDefault(
                {
                    "adapter": "qlora",
                    "load_in_4bit": True,
                    "tokens": ["<|imstart|>"],
                    "lora_modules_to_save": ["embed_tokens", "lm_head"],
                }
            )
            | minimal_cfg
        )

        check_model_config(cfg, model_config)


class TestValidationWandb(BaseValidation):
    """
    Validation test for wandb
    """

    def test_wandb_set_run_id_to_name(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "wandb_run_id": "foo",
                }
            )
            | minimal_cfg
        )

        with self._caplog.at_level("WARNING"):
            new_cfg = validate_config(cfg)
            assert any(
                "wandb_run_id sets the ID of the run. If you would like to set the name, please use wandb_name instead."
                in record.message
                for record in self._caplog.records
            )

            assert new_cfg.wandb_name == "foo" and new_cfg.wandb_run_id == "foo"

        cfg = (
            DictDefault(
                {
                    "wandb_name": "foo",
                }
            )
            | minimal_cfg
        )

        new_cfg = validate_config(cfg)

        assert new_cfg.wandb_name == "foo" and new_cfg.wandb_run_id is None

    def test_wandb_sets_env(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "wandb_project": "foo",
                    "wandb_name": "bar",
                    "wandb_run_id": "bat",
                    "wandb_entity": "baz",
                    "wandb_mode": "online",
                    "wandb_watch": "false",
                    "wandb_log_model": "checkpoint",
                }
            )
            | minimal_cfg
        )

        new_cfg = validate_config(cfg)

        setup_wandb_env_vars(new_cfg)

        assert os.environ.get("WANDB_PROJECT", "") == "foo"
        assert os.environ.get("WANDB_NAME", "") == "bar"
        assert os.environ.get("WANDB_RUN_ID", "") == "bat"
        assert os.environ.get("WANDB_ENTITY", "") == "baz"
        assert os.environ.get("WANDB_MODE", "") == "online"
        assert os.environ.get("WANDB_WATCH", "") == "false"
        assert os.environ.get("WANDB_LOG_MODEL", "") == "checkpoint"

        os.environ.pop("WANDB_PROJECT", None)
        os.environ.pop("WANDB_NAME", None)
        os.environ.pop("WANDB_RUN_ID", None)
        os.environ.pop("WANDB_ENTITY", None)
        os.environ.pop("WANDB_MODE", None)
        os.environ.pop("WANDB_WATCH", None)
        os.environ.pop("WANDB_LOG_MODEL", None)

    def test_wandb_set_disabled(self, minimal_cfg):
        cfg = DictDefault({}) | minimal_cfg
        new_cfg = validate_config(cfg)
        setup_wandb_env_vars(new_cfg)
        assert new_cfg.use_wandb is None

        cfg = (
            DictDefault(
                {
                    "wandb_project": "foo",
                }
            )
            | minimal_cfg
        )

        new_cfg = validate_config(cfg)
        setup_wandb_env_vars(new_cfg)
        assert new_cfg.use_wandb is True

        os.environ.pop("WANDB_PROJECT", None)


@pytest.mark.skipif(is_comet_available() is False, reason="comet_ml is not installed")
class TestValidationComet(BaseValidation):
    """
    Validation test for comet
    """

    def test_comet_sets_env(self, minimal_cfg):
        from axolotl.utils.comet_ import setup_comet_env_vars

        comet_config = {
            "comet_api_key": "foo",
            "comet_workspace": "some_workspace",
            "comet_project_name": "some_project",
            "comet_experiment_key": "some_experiment_key",
            "comet_mode": "get_or_create",
            "comet_online": False,
            "comet_experiment_config": {
                "auto_histogram_activation_logging": False,
                "auto_histogram_epoch_rate": 2,
                "auto_histogram_gradient_logging": True,
                "auto_histogram_tensorboard_logging": False,
                "auto_histogram_weight_logging": True,
                "auto_log_co2": False,
                "auto_metric_logging": True,
                "auto_metric_step_rate": 15,
                "auto_output_logging": False,
                "auto_param_logging": True,
                "comet_disabled": False,
                "display_summary_level": 2,
                "distributed_node_identifier": "some_distributed_node_identifier",
                "log_code": True,
                "log_env_cpu": False,
                "log_env_details": True,
                "log_env_disk": False,
                "log_env_gpu": True,
                "log_env_host": False,
                "log_env_network": True,
                "log_git_metadata": False,
                "log_git_patch": True,
                "log_graph": False,
                "name": "some_name",
                "offline_directory": "some_offline_directory",
                "parse_args": True,
                "tags": ["tag1", "tag2"],
            },
        }

        cfg = DictDefault(comet_config) | minimal_cfg

        new_cfg = validate_config(cfg)

        setup_comet_env_vars(new_cfg)

        comet_env = {
            key: value for key, value in os.environ.items() if key.startswith("COMET_")
        }

        assert (
            len(comet_env)
            == len(comet_config) + len(comet_config["comet_experiment_config"]) - 1
        )

        assert comet_env == {
            "COMET_API_KEY": "foo",
            "COMET_AUTO_LOG_CLI_ARGUMENTS": "true",
            "COMET_AUTO_LOG_CO2": "false",
            "COMET_AUTO_LOG_CODE": "true",
            "COMET_AUTO_LOG_DISABLE": "false",
            "COMET_AUTO_LOG_ENV_CPU": "false",
            "COMET_AUTO_LOG_ENV_DETAILS": "true",
            "COMET_AUTO_LOG_ENV_DISK": "false",
            "COMET_AUTO_LOG_ENV_GPU": "true",
            "COMET_AUTO_LOG_ENV_HOST": "false",
            "COMET_AUTO_LOG_ENV_NETWORK": "true",
            "COMET_AUTO_LOG_GIT_METADATA": "false",
            "COMET_AUTO_LOG_GIT_PATCH": "true",
            "COMET_AUTO_LOG_GRAPH": "false",
            "COMET_AUTO_LOG_HISTOGRAM_ACTIVATIONS": "false",
            "COMET_AUTO_LOG_HISTOGRAM_EPOCH_RATE": "2",
            "COMET_AUTO_LOG_HISTOGRAM_GRADIENTS": "true",
            "COMET_AUTO_LOG_HISTOGRAM_TENSORBOARD": "false",
            "COMET_AUTO_LOG_HISTOGRAM_WEIGHTS": "true",
            "COMET_AUTO_LOG_METRIC_STEP_RATE": "15",
            "COMET_AUTO_LOG_METRICS": "true",
            "COMET_AUTO_LOG_OUTPUT_LOGGER": "false",
            "COMET_AUTO_LOG_PARAMETERS": "true",
            "COMET_DISPLAY_SUMMARY_LEVEL": "2",
            "COMET_DISTRIBUTED_NODE_IDENTIFIER": "some_distributed_node_identifier",
            "COMET_EXPERIMENT_KEY": "some_experiment_key",
            "COMET_OFFLINE_DIRECTORY": "some_offline_directory",
            "COMET_PROJECT_NAME": "some_project",
            "COMET_START_EXPERIMENT_NAME": "some_name",
            "COMET_START_EXPERIMENT_TAGS": "tag1,tag2",
            "COMET_START_MODE": "get_or_create",
            "COMET_START_ONLINE": "false",
            "COMET_WORKSPACE": "some_workspace",
        }

        for key in comet_env.keys():
            os.environ.pop(key, None)


class TestValidationMLflow(BaseValidation):
    """
    Validation test for MLflow
    """

    def test_hf_mlflow_artifacts_config_sets_env(self, minimal_cfg):
        cfg = (
            DictDefault(
                {
                    "hf_mlflow_log_artifacts": True,
                }
            )
            | minimal_cfg
        )

        new_cfg = validate_config(cfg)

        assert new_cfg.hf_mlflow_log_artifacts is True

        # Check it's not already present in env
        assert "HF_MLFLOW_LOG_ARTIFACTS" not in os.environ

        setup_mlflow_env_vars(new_cfg)

        assert os.environ.get("HF_MLFLOW_LOG_ARTIFACTS") == "true"

        os.environ.pop("HF_MLFLOW_LOG_ARTIFACTS", None)

    def test_mlflow_not_used_by_default(self, minimal_cfg):
        cfg = DictDefault({}) | minimal_cfg

        new_cfg = validate_config(cfg)

        setup_mlflow_env_vars(new_cfg)

        assert cfg.use_mlflow is not True

        cfg = (
            DictDefault(
                {
                    "mlflow_experiment_name": "foo",
                }
            )
            | minimal_cfg
        )

        new_cfg = validate_config(cfg)

        setup_mlflow_env_vars(new_cfg)

        assert new_cfg.use_mlflow is True

        os.environ.pop("MLFLOW_EXPERIMENT_NAME", None)


class TestDataloaderValidation(BaseValidation):
    """
    tests for dataloader_* sane defaults
    """

    def test_dataloader_auto_defaults(self, minimal_cfg):
        cfg = minimal_cfg

        new_cfg = validate_config(cfg, {"n_gpu": 8}, {"torch_version": "2.6.0"})

        assert new_cfg.dataloader_num_workers == 8
        assert new_cfg.dataloader_pin_memory is True
        assert new_cfg.dataloader_prefetch_factor == 256


================================================
FILE: tests/prompt_strategies/__init__.py
================================================


================================================
FILE: tests/prompt_strategies/conftest.py
================================================
"""
shared fixtures for prompt strategies tests
"""

import pytest
from datasets import Dataset
from transformers import AutoTokenizer

from axolotl.prompt_strategies.jinja_template_analyzer import JinjaTemplateAnalyzer
from axolotl.utils.chat_templates import _CHAT_TEMPLATES

from tests.hf_offline_utils import enable_hf_offline


@pytest.fixture(name="assistant_dataset")
def fixture_assistant_dataset():
    return Dataset.from_list(
        [
            {
                "messages": [
                    {"role": "user", "content": "hello"},
                    {"role": "assistant", "content": "hello"},
                    {"role": "user", "content": "goodbye"},
                    {"role": "assistant", "content": "goodbye"},
                ]
            }
        ]
    )


@pytest.fixture(name="sharegpt_dataset")
def fixture_sharegpt_dataset():
    return Dataset.from_list(
        [
            {
                "conversations": [
                    {"from": "human", "value": "hello"},
                    {"from": "gpt", "value": "hello"},
                    {"from": "human", "value": "goodbye"},
                    {"from": "gpt", "value": "goodbye"},
                ]
            }
        ]
    )


@pytest.fixture(name="basic_dataset")
def fixture_basic_dataset():
    return Dataset.from_list(
        [
            {
                "conversations": [
                    {"from": "system", "value": "You are an AI assistant."},
                    {"from": "human", "value": "Hello"},
                    {"from": "assistant", "value": "Hi there!"},
                    {"from": "human", "value": "How are you?"},
                    {"from": "assistant", "value": "I'm doing well, thank you!"},
                ]
            }
        ]
    )


@pytest.fixture(name="toolcalling_dataset")
def fixture_toolcalling_dataset():
    return Dataset.from_list(
        [
            {
                "messages": [
                    {
                        "role": "system",
                        "content": "You are a bot that responds to weather queries. You should reply with the unit used in the queried location.",
                    },
                    {
                        "role": "user",
                        "content": "Hey, what's the temperature in Paris right now?",
                    },
                    {
                        "role": "assistant",
                        "tool_calls": [
                            {
                                "type": "function",
                                "function": {
                                    "name": "get_current_temperature",
                                    "arguments": {
                                        "location": "Paris, France",
                                        "unit": "celsius",
                                    },
                                },
                            }
                        ],
                    },
                    {
                        "role": "tool",
                        "name": "get_current_temperature",
                        "content": "22.0",
                    },
                    {
                        "role": "assistant",
                        "content": "The temperature in Paris is 22.0 degrees Celsius.",
                    },
                ]
            }
        ]
    )


@pytest.fixture(name="llama3_tokenizer", scope="session", autouse=True)
@enable_hf_offline
def fixture_llama3_tokenizer(
    download_llama3_8b_instruct_model_fixture,
):
    tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B-Instruct")

    return tokenizer


@pytest.fixture(name="smollm2_tokenizer", scope="session", autouse=True)
@enable_hf_offline
def fixture_smollm2_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM2-135M")
    return tokenizer


@pytest.fixture(name="mistralv03_tokenizer", scope="session", autouse=True)
@enable_hf_offline
def fixture_mistralv03_tokenizer(
    download_mlx_mistral_7b_model_fixture,
):
    tokenizer = AutoTokenizer.from_pretrained(
        "mlx-community/Mistral-7B-Instruct-v0.3-4bit"
    )
    return tokenizer


@pytest.fixture(name="phi35_tokenizer", scope="session", autouse=True)
@enable_hf_offline
def fixture_phi35_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct")
    return tokenizer


@pytest.fixture(name="phi4_tokenizer", scope="session", autouse=True)
@enable_hf_offline
def fixture_phi4_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-4-reasoning")
    return tokenizer


@pytest.fixture(name="gemma2_tokenizer", scope="session", autouse=True)
def fixture_gemma2_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("mlx-community/gemma-2-9b-it-4bit")

    return tokenizer


@pytest.fixture(name="magistral_tokenizer")
def fixture_magistral_tokenizer():
    from axolotl.utils.mistral import HFMistralTokenizer

    tokenizer = HFMistralTokenizer.from_pretrained("mistralai/Magistral-Small-2506")
    return tokenizer


@pytest.fixture(name="devstral_tokenizer")
def fixture_devstral_tokenizer():
    from axolotl.utils.mistral import HFMistralTokenizer

    tokenizer = HFMistralTokenizer.from_pretrained("mistralai/Devstral-Small-2505")
    return tokenizer


@pytest.fixture(name="devstral_1_1_tokenizer")
def fixture_devstral_1_1_tokenizer():
    from axolotl.utils.mistral import HFMistralTokenizer

    tokenizer = HFMistralTokenizer.from_pretrained("mistralai/Devstral-Small-2507")
    return tokenizer


@pytest.fixture(name="qwen3_tokenizer")
@enable_hf_offline
def qwen3_tokenizer_fixture(
    download_qwen3_half_billion_model,
):  # pylint: disable=unused-argument,redefined-outer-name
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-0.6B")

    return tokenizer


@pytest.fixture(name="mistralv03_tokenizer_chat_template_jinja")
def fixture_mistralv03_chat_template_jinja_w_system() -> str:
    return '{%- if messages[0]["role"] == "system" %}\n    {%- set system_message = messages[0]["content"] %}\n    {%- set loop_messages = messages[1:] %}\n{%- else %}\n    {%- set loop_messages = messages %}\n{%- endif %}\n{%- if not tools is defined %}\n    {%- set tools = none %}\n{%- endif %}\n{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %}\n\n{#- This block checks for alternating user/assistant messages, skipping tool calling messages #}\n{%- set ns = namespace() %}\n{%- set ns.index = 0 %}\n{%- for message in loop_messages %}\n    {%- if not (message.role == "tool" or message.role == "tool_results" or (message.tool_calls is defined and message.tool_calls is not none)) %}\n        {%- if (message["role"] == "user") != (ns.index % 2 == 0) %}\n            {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }}\n        {%- endif %}\n        {%- set ns.index = ns.index + 1 %}\n    {%- endif %}\n{%- endfor %}\n\n{{- bos_token }}\n{%- for message in loop_messages %}\n    {%- if message["role"] == "user" %}\n        {%- if tools is not none and (message == user_messages[-1]) %}\n            {{- "[AVAILABLE_TOOLS] [" }}\n            {%- for tool in tools %}\n                {%- set tool = tool.function %}\n                {{- \'{"type": "function", "function": {\' }}\n                {%- for key, val in tool.items() if key != "return" %}\n                    {%- if val is string %}\n                        {{- \'"\' + key + \'": "\' + val + \'"\' }}\n                    {%- else %}\n                        {{- \'"\' + key + \'": \' + val|tojson }}\n                    {%- endif %}\n                    {%- if not loop.last %}\n                        {{- ", " }}\n                    {%- endif %}\n                {%- endfor %}\n                {{- "}}" }}\n                {%- if not loop.last %}\n                    {{- ", " }}\n                {%- else %}\n                    {{- "]" }}\n                {%- endif %}\n            {%- endfor %}\n            {{- "[/AVAILABLE_TOOLS]" }}\n            {%- endif %}\n        {%- if loop.first and system_message is defined %}\n            {{- "[INST] " + system_message + "\\n\\n" + message["content"] + "[/INST]" }}\n        {%- else %}\n            {{- "[INST] " + message["content"] + "[/INST]" }}\n        {%- endif %}\n    {%- elif message.tool_calls is defined and message.tool_calls is not none %}\n        {{- "[TOOL_CALLS] [" }}\n        {%- for tool_call in message.tool_calls %}\n            {%- set out = tool_call.function|tojson %}\n            {{- out[:-1] }}\n            {%- if not tool_call.id is defined or tool_call.id|length != 9 %}\n                {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }}\n            {%- endif %}\n            {{- \', "id": "\' + tool_call.id + \'"}\' }}\n            {%- if not loop.last %}\n                {{- ", " }}\n            {%- else %}\n                {{- "]" + eos_token }}\n            {%- endif %}\n        {%- endfor %}\n    {%- elif message["role"] == "assistant" %}\n        {{- " " + message["content"]|trim + eos_token}}\n    {%- elif message["role"] == "tool_results" or message["role"] == "tool" %}\n        {%- if message.content is defined and message.content.content is defined %}\n            {%- set content = message.content.content %}\n        {%- else %}\n            {%- set content = message.content %}\n        {%- endif %}\n        {{- \'[TOOL_RESULTS] {"content": \' + content|string + ", " }}\n        {%- if not message.tool_call_id is defined or message.tool_call_id|length != 9 %}\n            {{- raise_exception("Tool call IDs should be alphanumeric strings with length 9!") }}\n        {%- endif %}\n        {{- \'"call_id": "\' + message.tool_call_id + \'"}[/TOOL_RESULTS]\' }}\n    {%- else %}\n        {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }}\n    {%- endif %}\n{%- endfor %}\n'


@pytest.fixture(name="gemma2_tokenizer_chat_template_jinja")
def fixture_gemma2_chat_template_jinja_w_system() -> str:
    return "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"


@pytest.fixture(name="llama3_2_vision_chat_template_jinja")
def fixture_llama3_2_vision_with_hardcoded_date() -> str:
    """Hardcodes the date in the template to avoid the need for date logic in the prompt"""

    template = _CHAT_TEMPLATES["llama3_2_vision"]

    old_date_logic = """{%- if not date_string is defined %}
    {%- if strftime_now is defined %}
        {%- set date_string = strftime_now("%d %b %Y") %}
    {%- else %}
        {%- set date_string = "26 Jul 2024" %}
    {%- endif %}
{%- endif %}"""

    new_date_logic = """{%- set date_string = "17 Dec 2024" %}"""

    modified_template = template.replace(old_date_logic, new_date_logic)

    return modified_template


@pytest.fixture(name="chat_template_jinja_with_optional_fields")
def fixture_chat_template_jinja_with_optional_fields() -> str:
    return """{% for message in messages %}
{{'<|im_start|>'}}{{ message['role'] }}
{% if message['thoughts'] is defined %}[Thoughts: {{ message['thoughts'] }}]{% endif %}
{% if message['tool_calls'] is defined %}[Tool: {{ message['tool_calls'][0]['type'] }}]{% endif %}
{{ message['content'] }}{{'<|im_end|>'}}
{% endfor %}"""


@pytest.fixture(name="basic_jinja_template_analyzer")
def basic_jinja_template_analyzer():
    return JinjaTemplateAnalyzer(
        """{% for message in messages %}{% if message['role'] == 'system' and message['content'] %}{{'<|system|>
' + message['content'] + '<|end|>
'}}{% elif message['role'] == 'user' %}{{'<|user|>
' + message['content'] + '<|end|>
'}}{% elif message['role'] == 'assistant' %}{{'<|assistant|>
' + message['content'] + '<|end|>
'}}{% endif %}{% endfor %}{% if add_generation_prompt %}{{ '<|assistant|>
' }}{% else %}{{ eos_token }}{% endif %}"""
    )


@pytest.fixture(name="mistral_jinja_template_analyzer")
def mistral_jinja_template_analyzer(mistralv03_tokenizer_chat_template_jinja):
    return JinjaTemplateAnalyzer(mistralv03_tokenizer_chat_template_jinja)


================================================
FILE: tests/prompt_strategies/messages/__init__.py
================================================


================================================
FILE: tests/prompt_strategies/messages/test_chat.py
================================================
"""
tests for chat_template prompt strategy
"""

import unittest

from axolotl.prompt_strategies.messages.chat import load
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__, log_level="DEBUG")


class TestMessagesChatLlama3:
    """
    Test class for assistant style datasets with llama-3 prompts using the messages chat llama3 strategy.
    """

    def test_llama3_load(self, llama3_tokenizer, assistant_dataset):
        LOG.info("Loading llama-3 tokenizer with assistant dataset")
        strategy = load(
            llama3_tokenizer,
            DictDefault(
                {
                    "train_on_inputs": False,
                    "sequence_len": 512,
                }
            ),
            DictDefault(
                {
                    "chat_template": "llama3",
                    "message_field_role": "role",
                    "message_field_content": "content",
                    "field_messages": "messages",
                }
            ),
        )
        res = strategy.wrap_dataset(assistant_dataset)
        input_ids = res[0]["input_ids"]
        # fmt: off
        expected_input_ids = [
            128000,  # bos
            128006, 882, 128007,  # user header
            271, 15339, 128009,  # user prompt eot
            128006, 78191, 128007,  # assistant header
            271, 15339, 128009,  # assistant response eot
            128006, 882, 128007,
            271, 19045, 29474, 128009,
            128006, 78191, 128007,
            271, 19045, 29474, 128009,
        ]
        # fmt: on
        LOG.debug(f"Expected input_ids: {expected_input_ids}")
        LOG.debug(f"Actual input_ids: {input_ids}")
        assert input_ids == expected_input_ids, (
            f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
        )


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/prompt_strategies/test_alpaca.py
================================================
"""
Test module for alpaca integration w chatml
"""

import pytest
from datasets import Dataset
from tokenizers import AddedToken
from transformers import AutoTokenizer

from axolotl.datasets import TokenizedPromptDataset
from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
from axolotl.prompters import AlpacaPrompter, PromptStyle

from tests.hf_offline_utils import enable_hf_offline


@pytest.fixture(name="alpaca_dataset")
def fixture_alpaca_dataset():
    return Dataset.from_list(
        [
            {
                "instruction": "Evaluate this sentence for spelling and grammar mistakes",
                "input": "He finnished his meal and left the resturant",
                "output": "He finished his meal and left the restaurant.",
            }
        ]
    )


@pytest.fixture(name="tokenizer")
@enable_hf_offline
def fixture_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(
        "casperhansen/mistral-7b-instruct-v0.1-awq"
    )
    tokenizer.add_special_tokens(
        {
            "eos_token": AddedToken(
                "<|im_end|>", rstrip=False, lstrip=False, normalized=False
            )
        }
    )
    tokenizer.add_tokens(
        [
            AddedToken("<|im_start|>", rstrip=False, lstrip=False, normalized=False),
        ]
    )

    return tokenizer


class TestAlpacaChatml:
    """
    Test class for alpaca prompter
    """

    def test_no_double_im_end(self, alpaca_dataset, tokenizer):
        strategy = AlpacaPromptTokenizingStrategy(
            AlpacaPrompter(prompt_style=PromptStyle.CHATML.value),
            tokenizer,
            False,  # train_on_inputs
            2048,  # sequence_len
        )

        dataset_wrapper = TokenizedPromptDataset(
            strategy, alpaca_dataset, process_count=1
        )

        input_ids = dataset_wrapper[0]["input_ids"]
        # fmt: off
        assert input_ids == [
            1,  # Bos
            32001, 1587, 13, 20548, 336, 349, 396, 13126, 369, 13966, 264, 3638, 28725, 5881, 1360, 395, 396, 2787, 369, 5312, 3629, 2758, 28723, 12018, 264, 2899, 369, 6582, 1999, 2691, 274, 272, 2159, 28723, 32000, 28705, 13,  # instruction
            32001, 2188, 13, 16627, 11931, 456, 12271, 354, 668, 3572, 304, 18756, 3479, 17179, 13, 2428, 854, 28711, 1497, 516, 11314, 304, 1749, 272, 1846, 324, 440, 32000, 28705, 13,  # input
            32001, 13892, 13, 650, 5967, 516, 11314, 304, 1749, 272, 9926, 28723, 32000,  # output
        ]
        # fmt: on

    def test_no_train_on_input(self, alpaca_dataset, tokenizer):
        strategy = AlpacaPromptTokenizingStrategy(
            AlpacaPrompter(prompt_style=PromptStyle.CHATML.value),
            tokenizer,
            False,  # train_on_inputs
            2048,  # sequence_len
        )

        dataset_wrapper = TokenizedPromptDataset(
            strategy, alpaca_dataset, process_count=1
        )

        labels = dataset_wrapper[0]["labels"]
        # fmt: off
        assert labels == [
            -100,  # bos
            -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,  # instruction
            -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,  # input
            -100, -100, -100, 650, 5967, 516, 11314, 304, 1749, 272, 9926, 28723, 32000,  # Output
        ]
        # fmt: on

    def test_w_train_on_input(self, alpaca_dataset, tokenizer):
        strategy = AlpacaPromptTokenizingStrategy(
            AlpacaPrompter(prompt_style=PromptStyle.CHATML.value),
            tokenizer,
            True,  # train_on_inputs
            2048,  # sequence_len
        )

        dataset_wrapper = TokenizedPromptDataset(
            strategy, alpaca_dataset, process_count=1
        )

        labels = dataset_wrapper[0]["labels"]
        # fmt: off
        assert labels == [
            1,  # Bos
            32001, 1587, 13, 20548, 336, 349, 396, 13126, 369, 13966, 264, 3638, 28725, 5881, 1360, 395, 396, 2787, 369, 5312, 3629, 2758, 28723, 12018, 264, 2899, 369, 6582, 1999, 2691, 274, 272, 2159, 28723, 32000, 28705, 13,  # instruction
            32001, 2188, 13, 16627, 11931, 456, 12271, 354, 668, 3572, 304, 18756, 3479, 17179, 13, 2428, 854, 28711, 1497, 516, 11314, 304, 1749, 272, 1846, 324, 440, 32000, 28705, 13,  # input
            32001, 13892, 13, 650, 5967, 516, 11314, 304, 1749, 272, 9926, 28723, 32000,  # output
        ]
        # fmt: on


================================================
FILE: tests/prompt_strategies/test_chat_template_ds_schema_unification.py
================================================
"""
Tests for chat template prompt strategy with schema unification for none fields
"""

import json

import pytest
from datasets import Dataset

from axolotl.prompt_strategies.chat_template import StrategyLoader
from axolotl.utils.dict import DictDefault


@pytest.fixture(name="messages_w_tools")
def fixture_messages_w_tools():
    jsons = """
{"messages":[{"role":"user","content":"move to (0, 1)"},{"role":"assistant","content":"","tool_calls":[{"function":{"name":"move","arguments":{"x":0,"y":1}}}]}],"tools":[{"type":"function","function":{"name":"move","description":"Move to a given location measured in meters","parameters":{"type":"object","properties":{"x":{"type":"number","description":"The x coordinate of the location, negative values are to the left, positive values are to the right"},"y":{"type":"number","description":"The y coordinate of the location, negative values are backward, positive values are forward"}},"required":["x","y"]}}},{"type":"function","function":{"name":"turn","description":"Turn the robot to a given direction","parameters":{"type":"object","properties":{"theta":{"type":"integer","description":"The angle to turn to, in degrees, positive values are counter-clockwise, negative values are clockwise"}},"required":["theta"]}}},{"type":"function","function":{"name":"invalid_prompt","description":"call when the user's prompt is invalid","parameters":{"type":"object","properties":{"message":{"type":"string","description":"why the prompt is invalid"}},"required":["message"]}}}],"add_generation_prompt":false}
{"messages":[{"role":"user","content":"turn 270 degree"},{"role":"assistant","content":"","tool_calls":[{"function":{"name":"turn","arguments":{"theta": 270}}}]}],"tools":[{"type":"function","function":{"name":"move","description":"Move to a given location measured in meters","parameters":{"type":"object","properties":{"x":{"type":"number","description":"The x coordinate of the location, negative values are to the left, positive values are to the right"},"y":{"type":"number","description":"The y coordinate of the location, negative values are backward, positive values are forward"}},"required":["x","y"]}}},{"type":"function","function":{"name":"turn","description":"Turn the robot to a given direction","parameters":{"type":"object","properties":{"theta":{"type":"integer","description":"The angle to turn to, in degrees, positive values are counter-clockwise, negative values are clockwise"}},"required":["theta"]}}},{"type":"function","function":{"name":"invalid_prompt","description":"call when the user's prompt is invalid","parameters":{"type":"object","properties":{"message":{"type":"string","description":"why the prompt is invalid"}},"required":["message"]}}}],"add_generation_prompt":false}
{"messages":[{"role":"user","content":"jump high"},{"role":"assistant","content":"","tool_calls":[{"function":{"name":"invalid_prompt","arguments":{"message": "jump is not a valid action"}}}]}],"tools":[{"type":"function","function":{"name":"move","description":"Move to a given location measured in meters","parameters":{"type":"object","properties":{"x":{"type":"number","description":"The x coordinate of the location, negative values are to the left, positive values are to the right"},"y":{"type":"number","description":"The y coordinate of the location, negative values are backward, positive values are forward"}},"required":["x","y"]}}},{"type":"function","function":{"name":"turn","description":"Turn the robot to a given direction","parameters":{"type":"object","properties":{"theta":{"type":"integer","description":"The angle to turn to, in degrees, positive values are counter-clockwise, negative values are clockwise"}},"required":["theta"]}}},{"type":"function","function":{"name":"invalid_prompt","description":"call when the user's prompt is invalid","parameters":{"type":"object","properties":{"message":{"type":"string","description":"why the prompt is invalid"}},"required":["message"]}}}],"add_generation_prompt":false}
    """.strip().split("\n")
    rows = [json.loads(row) for row in jsons]
    return Dataset.from_list(rows)


@pytest.fixture(name="qwen3_prompt_strategy")
def qwen3_chat_template_strategy(qwen3_tokenizer):
    cfg = DictDefault(
        sequence_len=2048,
        chat_template="qwen3",
        eot_tokens=["<|im_end|>"],
    )
    ds_cfg = DictDefault(
        type="chat_template",
    )
    load = StrategyLoader()
    strat = load(qwen3_tokenizer, cfg, ds_cfg)
    return strat


class TestSchemaUnification:
    """
    Test class on handling null fields for tool calling
    """

    def test_schema_unification_single_prompt(
        self, messages_w_tools, qwen3_prompt_strategy, qwen3_tokenizer
    ):
        for row in messages_w_tools:
            inputs = qwen3_prompt_strategy.tokenize_prompt(row)
            decoded = qwen3_tokenizer.decode(inputs["input_ids"])
            tool_call = decoded.split("<tool_call>")[-1].split("</tool_call>")[0]
            assert '"message": null' not in tool_call
            assert '"theta": null' not in tool_call

    def test_schema_unification_batched(
        self, messages_w_tools, qwen3_prompt_strategy, qwen3_tokenizer
    ):
        rows = messages_w_tools.map(qwen3_prompt_strategy.tokenize_prompt, batched=True)
        for row in rows:
            decoded = qwen3_tokenizer.decode(row["input_ids"])
            tool_call = decoded.split("<tool_call>")[-1].split("</tool_call>")[0]
            assert '"message": null' not in tool_call
            assert '"theta": null' not in tool_call


================================================
FILE: tests/prompt_strategies/test_chat_template_utils.py
================================================
"""
Tests for utils in axolotl.utils.chat_templates
"""

import unittest

import pytest
from transformers import AutoTokenizer

from axolotl.utils.chat_templates import (
    _CHAT_TEMPLATES,
    extract_chat_template_args,
    get_chat_template,
)

from tests.hf_offline_utils import enable_hf_offline


@pytest.fixture(name="llama3_tokenizer")
@enable_hf_offline
def fixture_llama3_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("NousResearch/Meta-Llama-3-8B")

    return tokenizer


class TestGetChatTemplateUtils:
    """
    Tests the get_chat_template function.
    """

    def test_known_chat_template(self):
        chat_template_str = get_chat_template("llama3")
        assert chat_template_str == _CHAT_TEMPLATES["llama3"]

    def test_invalid_chat_template(self):
        with pytest.raises(ValueError) as exc:
            get_chat_template("invalid_template")
            assert str(exc) == "Template 'invalid_template' not found."

    def test_tokenizer_default_no_tokenizer(self):
        with pytest.raises(ValueError):
            get_chat_template("tokenizer_default", tokenizer=None)

    def test_tokenizer_default_no_chat_template_on_tokenizer(self, llama3_tokenizer):
        with pytest.raises(ValueError):
            get_chat_template("tokenizer_default", tokenizer=llama3_tokenizer)

    def test_tokenizer_default_with_chat_template_on_tokenizer(self, llama3_tokenizer):
        llama3_tokenizer.chat_template = "test_template"
        chat_template_str = get_chat_template(
            "tokenizer_default", tokenizer=llama3_tokenizer
        )
        assert chat_template_str == "test_template"

    def test_tokenizer_default_fallback_no_tokenizer(self):
        with pytest.raises(ValueError):
            get_chat_template("tokenizer_default_fallback_test", tokenizer=None)

    def test_tokenizer_default_fallback_no_chat_template_on_tokenizer(
        self, llama3_tokenizer
    ):
        chat_template_str = get_chat_template(
            "tokenizer_default_fallback_chatml", tokenizer=llama3_tokenizer
        )
        assert chat_template_str == get_chat_template("chatml")

    def test_tokenizer_default_fallback_with_chat_template_on_tokenizer(
        self, llama3_tokenizer
    ):
        llama3_tokenizer.chat_template = "test_template"
        chat_template_str = get_chat_template(
            "tokenizer_default_fallback_chatml", tokenizer=llama3_tokenizer
        )
        assert chat_template_str == "test_template"

    def test_jinja_template_mode(self):
        jinja_template = "example_jinja_template"
        chat_template_str = get_chat_template("jinja", jinja_template=jinja_template)
        assert chat_template_str == jinja_template

    def test_jinja_template_mode_no_jinja_template(self):
        with pytest.raises(ValueError):
            get_chat_template("jinja", jinja_template=None)

    def test_extract_chat_template_args(self):
        # No ds_cfg
        chat_template_choice, chat_template_jinja = extract_chat_template_args(
            cfg={"chat_template": "chatml"},
        )
        assert chat_template_choice == "chatml"
        assert chat_template_jinja is None

        # ds_cfg provided
        chat_template_choice, chat_template_jinja = extract_chat_template_args(
            cfg={
                "chat_template": "jinja",
                "chat_template_jinja": "global_jinja_template",
            },
            ds_cfg={"chat_template": "llama3", "chat_template_jinja": None},
        )
        assert chat_template_choice == "llama3"
        assert chat_template_jinja is None

        # ds_cfg provided with jinja template
        chat_template_choice, chat_template_jinja = extract_chat_template_args(
            cfg={"chat_template": "chatml", "chat_template_jinja": None},
            ds_cfg={
                "chat_template": "jinja",
                "chat_template_jinja": "ds_jinja_template",
            },
        )
        assert chat_template_choice == "jinja"
        assert chat_template_jinja == "ds_jinja_template"

        # ds_cfg provided with no chat_template
        chat_template_choice, chat_template_jinja = extract_chat_template_args(
            cfg={
                "chat_template": "jinja",
                "chat_template_jinja": "global_jinja_template",
            },
            ds_cfg={"chat_template": None, "chat_template_jinja": "ds_jinja_template"},
        )
        assert chat_template_choice == "jinja"
        assert chat_template_jinja == "global_jinja_template"


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/prompt_strategies/test_chat_templates.py
================================================
"""
tests for chat_template prompt strategy
"""

import unittest

from axolotl.prompt_strategies.chat_template import (
    ChatTemplatePrompter,
    ChatTemplateStrategy,
    load,
)
from axolotl.prompters import IGNORE_TOKEN_ID
from axolotl.utils.chat_templates import get_chat_template
from axolotl.utils.dict import DictDefault
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__)


class TestAssistantChatTemplateLlama3:
    """
    Test class for assistant style datasets with llama-3 prompts using the chat_template strategy.
    """

    def test_llama3_load(self, llama3_tokenizer, assistant_dataset):
        LOG.info("Loading llama-3 tokenizer with assistant dataset")
        strategy = load(
            llama3_tokenizer,
            DictDefault(
                {
                    "train_on_inputs": False,
                    "sequence_len": 512,
                }
            ),
            DictDefault(
                {
                    "chat_template": "llama3",
                    "message_field_role": "role",
                    "message_field_content": "content",
                    "message_property_mappings": {
                        "role": "role",
                        "content": "content",
                    },
                    "roles": {
                        "user": ["user"],
                        "assistant": ["assistant"],
                        "system": ["system"],
                    },
                    "field_messages": "messages",
                }
            ),
        )
        res = strategy.tokenize_prompt(assistant_dataset[0])
        input_ids = res["input_ids"]
        # fmt: off
        expected_input_ids = [
            128000,  # bos
            128006, 882, 128007,  # user header
            271, 15339, 128009,  # user prompt eot
            128006, 78191, 128007,  # assistant header
            271, 15339, 128009,  # assistant response eot
            128006, 882, 128007,
            271, 19045, 29474, 128009,
            128006, 78191, 128007,
            271, 19045, 29474, 128009,
        ]
        # fmt: on
        LOG.debug(f"Expected input_ids: {expected_input_ids}")
        LOG.debug(f"Actual input_ids: {input_ids}")
        assert input_ids == expected_input_ids, (
            f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
        )

    def test_llama3(self, llama3_tokenizer, assistant_dataset):
        LOG.info("Testing llama-3 with assistant dataset")
        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                llama3_tokenizer,
                chat_template=get_chat_template("llama3"),
                message_property_mappings={
                    "role": "role",
                    "content": "content",
                },
                roles={
                    "user": ["user"],
                    "assistant": ["assistant"],
                    "system": ["system"],
                },
            ),
            tokenizer=llama3_tokenizer,
            train_on_inputs=False,
            sequence_len=512,
        )

        res = strategy.tokenize_prompt(assistant_dataset[0])
        input_ids = res["input_ids"]
        # fmt: off
        expected_input_ids = [
            128000,  # bos
            128006, 882, 128007,  # user header
            271, 15339, 128009,  # user prompt eot
            128006, 78191, 128007,  # assistant header
            271, 15339, 128009,   # assistant response eot
            128006, 882, 128007,
            271, 19045, 29474, 128009,
            128006, 78191, 128007,
            271, 19045, 29474, 128009,
        ]
        # fmt: on
        LOG.debug(f"Expected input_ids: {expected_input_ids}")
        LOG.debug(f"Actual input_ids: {input_ids}")
        assert input_ids == expected_input_ids, (
            f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
        )

    def test_phi35(self, phi35_tokenizer, assistant_dataset):
        LOG.info("Testing phi-3.5 with assistant dataset")
        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                phi35_tokenizer,
                chat_template=get_chat_template("phi_35"),
                message_property_mappings={
                    "role": "role",
                    "content": "content",
                },
                roles={
                    "user": ["user"],
                    "assistant": ["assistant"],
                    "system": ["system"],
                },
            ),
            tokenizer=phi35_tokenizer,
            train_on_inputs=False,
            sequence_len=512,
        )

        res = strategy.tokenize_prompt(assistant_dataset[0])
        input_ids = res["input_ids"]
        labels = res["labels"]
        # fmt: off
        expected_input_ids = [
            32010,  # user
            22172, 32007,  # user eot
            32001,  # assistant
            22172, 32007,  # assistant eot
            32010,  # user
            1781, 26966, 32007,  # user eot
            32001,  # assistant
            1781, 26966, 32007,  # assistant eot
        ]
        expected_labels = [
            -100,  # user
            -100, -100,  # user eot
            -100,  # assistant
            -100, -100,  # assistant eot,
            -100,  # user
            -100, -100, -100,  # user eot
            -100,  # assistant
            1781, 26966, 32007,  # assistant eot
        ]
        # fmt: on
        LOG.debug(f"Expected input_ids: {expected_input_ids}")
        LOG.debug(f"Actual input_ids: {input_ids}")
        assert input_ids == expected_input_ids, (
            f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
        )

        LOG.debug(f"Expected labels : {expected_labels}")
        LOG.debug(f"Actual labels : {labels}")
        assert labels == expected_labels, (
            f"Input IDs mismatch: {labels} != {expected_labels}"
        )

    def test_llama3_with_training_data(self, llama3_tokenizer, assistant_dataset):
        LOG.info("Testing llama-3 with assistant dataset including training data")
        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                llama3_tokenizer,
                chat_template=get_chat_template("llama3"),
                message_field_training="training",
                message_property_mappings={
                    "role": "role",
                    "content": "content",
                },
                roles={
                    "user": ["user"],
                    "assistant": ["assistant"],
                    "system": ["system"],
                },
            ),
            tokenizer=llama3_tokenizer,
            train_on_inputs=False,
            train_on_eos="none",
            sequence_len=512,
            roles_to_train=["assistant"],
        )

        prompt_tokens = strategy.prompter.build_prompt(
            assistant_dataset[0]["messages"], False
        )
        prompt = llama3_tokenizer.decode(prompt_tokens, skip_special_tokens=False)
        LOG.debug(f"Generated prompt: {prompt}")
        res = strategy.tokenize_prompt(assistant_dataset[0])
        labels = res["labels"]
        input_ids = res["input_ids"]
        # fmt: off
        expected_labels = [
            IGNORE_TOKEN_ID,  # bos
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # user header
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # user prompt eot
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # assistant header
            IGNORE_TOKEN_ID, 15339, IGNORE_TOKEN_ID,  # assistant response eot
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
            IGNORE_TOKEN_ID, 19045, 29474, IGNORE_TOKEN_ID,
        ]
        # fmt: on

        LOG.debug(f"Expected labels: {expected_labels}")
        LOG.debug(f"Actual labels: {labels}")
        assert labels == expected_labels, (
            f"Labels mismatch:\n"
            f"Expected: {expected_labels}\n"
            f"Actual: {labels}\n"
            f"Input IDs: {input_ids}\n"
        )


class TestSharegptChatTemplateLlama3:
    """
    Test class for ShareGPT style datasets with llama-3 prompts using the chat_template strategy.
    """

    def test_llama3_assistant(self, llama3_tokenizer, sharegpt_dataset):
        LOG.info("Testing ShareGPT style datasets with llama-3 assistant prompts")

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                llama3_tokenizer,
                chat_template=get_chat_template("llama3"),
                message_property_mappings={
                    "role": "from",
                    "content": "value",
                },
                field_messages="conversations",
            ),
            tokenizer=llama3_tokenizer,
            train_on_inputs=False,
            train_on_eos="none",
            sequence_len=512,
            roles_to_train=["gpt"],
        )

        res = strategy.tokenize_prompt(sharegpt_dataset[0])
        input_ids = res["input_ids"]
        labels = res["labels"]
        # fmt: off
        expected_input_ids = [
            128000,  # bos
            128006, 882, 128007,  # user header
            271, 15339, 128009,  # user prompt eot
            128006, 78191, 128007,  # assistant header
            271, 15339, 128009,  # assistant response eot
            128006, 882, 128007,
            271, 19045, 29474, 128009,
            128006, 78191, 128007,
            271, 19045, 29474, 128009,
        ]
        expected_labels = [
            IGNORE_TOKEN_ID,  # bos
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # user header
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # user prompt eot
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # assistant header
            IGNORE_TOKEN_ID, 15339, IGNORE_TOKEN_ID,  # assistant response eot
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
            IGNORE_TOKEN_ID, 19045, 29474, IGNORE_TOKEN_ID,
        ]
        # fmt: on

        LOG.debug(f"Expected input_ids: {expected_input_ids}")
        LOG.debug(f"Actual input_ids: {input_ids}")
        LOG.debug(f"Expected labels: {expected_labels}")
        LOG.debug(f"Actual labels: {labels}")

        assert input_ids == expected_input_ids, (
            f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
        )
        assert labels == expected_labels, (
            f"Labels mismatch: {labels} != {expected_labels}"
        )

    def test_llama3_human(self, llama3_tokenizer, sharegpt_dataset):
        LOG.info("Testing ShareGPT style datasets with llama-3 human prompts")

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                llama3_tokenizer,
                chat_template=get_chat_template("llama3"),
                message_property_mappings={
                    "role": "from",
                    "content": "value",
                },
                field_messages="conversations",
            ),
            tokenizer=llama3_tokenizer,
            train_on_inputs=False,
            train_on_eos="none",
            sequence_len=512,
            roles_to_train=["human"],
        )

        res = strategy.tokenize_prompt(sharegpt_dataset[0])
        input_ids = res["input_ids"]
        labels = res["labels"]
        # fmt: off
        expected_input_ids = [
            128000,  # bos
            128006, 882, 128007,  # user header
            271, 15339, 128009,  # user prompt eot
            128006, 78191, 128007,  # assistant header
            271, 15339, 128009,  # assistant response eot
            128006, 882, 128007,
            271, 19045, 29474, 128009,
            128006, 78191, 128007,
            271, 19045, 29474, 128009,
        ]
        expected_labels = [
            IGNORE_TOKEN_ID,  # bos
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # user header
            IGNORE_TOKEN_ID, 15339, IGNORE_TOKEN_ID,  # user prompt eot
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # assistant header
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # assistant response eot
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
            IGNORE_TOKEN_ID, 19045, 29474, IGNORE_TOKEN_ID,
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
        ]
        # fmt: on

        LOG.debug(f"Expected input_ids: {expected_input_ids}")
        LOG.debug(f"Actual input_ids: {input_ids}")
        LOG.debug(f"Expected labels: {expected_labels}")
        LOG.debug(f"Actual labels: {labels}")

        assert input_ids == expected_input_ids, (
            f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
        )
        assert labels == expected_labels, (
            f"Labels mismatch: {labels} != {expected_labels}"
        )

    def test_llama3_system_human(self, llama3_tokenizer, basic_dataset):
        LOG.info("Testing ShareGPT style datasets with llama-3 system/human prompts")

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                llama3_tokenizer,
                chat_template=get_chat_template("llama3"),
                message_property_mappings={
                    "role": "from",
                    "content": "value",
                },
                field_messages="conversations",
            ),
            tokenizer=llama3_tokenizer,
            train_on_inputs=False,
            train_on_eos="none",
            sequence_len=512,
            roles_to_train=["system", "human"],
        )

        res = strategy.tokenize_prompt(basic_dataset[0])
        input_ids = res["input_ids"]
        labels = res["labels"]
        # fmt: off
        expected_input_ids = [
            128000,  # bos
            128006, 9125, 128007,
            271, 2675, 527, 459, 15592, 18328, 13, 128009,
            128006, 882, 128007,  # user header
            271, 9906, 128009,  # user prompt eot
            128006, 78191, 128007,  # assistant header
            271, 13347, 1070, 0, 128009,  # assistant response eot
            128006, 882, 128007,
            271, 4438, 527, 499, 30, 128009,
            128006, 78191, 128007,
            271, 40, 2846, 3815, 1664, 11, 9901, 499, 0, 128009,
        ]
        expected_labels = [
            IGNORE_TOKEN_ID,  # bos
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # system header
            IGNORE_TOKEN_ID, 2675, 527, 459, 15592, 18328, 13, IGNORE_TOKEN_ID,  # system prompt eot
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # user header
            IGNORE_TOKEN_ID, 9906, IGNORE_TOKEN_ID,  # user prompt eot
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # assistant header
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # assistant response eot
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
            IGNORE_TOKEN_ID, 4438, 527, 499, 30, IGNORE_TOKEN_ID,
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,
        ]
        # fmt: on

        LOG.debug(f"Expected input_ids: {expected_input_ids}")
        LOG.debug(f"Actual input_ids: {input_ids}")
        LOG.debug(f"Expected labels: {expected_labels}")
        LOG.debug(f"Actual labels: {labels}")

        assert input_ids == expected_input_ids, (
            f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
        )
        assert labels == expected_labels, (
            f"Labels mismatch: {labels} != {expected_labels}"
        )


class TestAssistantToolCallingChatTemplateLlama32Vision:
    """
    Test class for assistant style datasets with tool_calling prompts using the llama-32_vision chat template.
    """

    def test_llama32vision_train_on_assistant(
        self, llama3_tokenizer, toolcalling_dataset, llama3_2_vision_chat_template_jinja
    ):
        LOG.info(
            "Testing assistant style datasets with tool_calling with llama-32 chat template, training on assistant"
        )

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                llama3_tokenizer,
                chat_template=get_chat_template(
                    "jinja", jinja_template=llama3_2_vision_chat_template_jinja
                ),
                message_property_mappings={"role": "role", "content": "content"},
            ),
            tokenizer=llama3_tokenizer,
            train_on_inputs=False,
            train_on_eos="turn",
            sequence_len=512,
            roles_to_train=["assistant"],
        )

        res = strategy.tokenize_prompt(toolcalling_dataset[0])

        input_ids = res["input_ids"]
        labels = res["labels"]

        # fmt: off
        expected_input_ids = [
            128000,  # bos
            128006, 9125, 128007, 271,  # system header
            38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 198, 15724, 2696, 25, 220, 1114, 3799, 220, 2366, 19, 271,  # system date prompt
            2675, 527, 264, 11164, 430, 31680, 311, 9282, 20126, 13, 1472, 1288, 10052, 449, 279, 5089, 1511, 304, 279, 79002, 3813, 13, 128009,  # system message
            128006, 882, 128007, 271,  # user header
            19182, 11, 1148, 596, 279, 9499, 304, 12366, 1314, 1457, 30, 128009,  # user message
            128006, 78191, 128007, 271,  # assistant header
            5018, 609, 794, 330, 456, 11327, 54625, 498, 330, 14105, 794, 5324, 2588, 794, 330, 60704, 11, 9822, 498, 330, 3928, 794, 330, 66, 41347, 32075, 128009,  # assistant message
            128006, 23799, 4690, 128007, 271,  # tool header
            1, 1313, 13, 15, 1, 128009,  # tool message
            128006, 78191, 128007, 271,  # assistant header
            791, 9499, 304, 12366, 374, 220, 1313, 13, 15, 12628, 62447, 13, 128009  # assistant message
        ]

        expected_labels = [
            IGNORE_TOKEN_ID,  # bos
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # system header
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # system date prompt
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # system message
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # user header
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # user message
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # assistant header
            5018, 609, 794, 330, 456, 11327, 54625, 498, 330, 14105, 794, 5324, 2588, 794, 330, 60704, 11, 9822, 498, 330, 3928, 794, 330, 66, 41347, 32075, 128009,  # assistant message
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # tool header
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # tool message
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # assistant header
            791, 9499, 304, 12366, 374, 220, 1313, 13, 15, 12628, 62447, 13, 128009  # assistant message
        ]
        # fmt: on

        assert input_ids == expected_input_ids, (
            f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
        )

        assert labels == expected_labels, (
            f"Labels mismatch: {labels} != {expected_labels}"
        )

    def test_llama32vision_train_on_tools(
        self, llama3_tokenizer, toolcalling_dataset, llama3_2_vision_chat_template_jinja
    ):
        LOG.info(
            "Testing assistant style datasets with tool_calling with llama-32 chat template, training on tools"
        )

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                llama3_tokenizer,
                chat_template=get_chat_template(
                    "jinja", jinja_template=llama3_2_vision_chat_template_jinja
                ),
                message_property_mappings={"role": "role", "content": "content"},
            ),
            tokenizer=llama3_tokenizer,
            train_on_inputs=False,
            train_on_eos="turn",
            sequence_len=512,
            roles_to_train=["assistant", "tool"],
        )

        res = strategy.tokenize_prompt(toolcalling_dataset[0])

        input_ids = res["input_ids"]
        labels = res["labels"]

        # fmt: off
        expected_input_ids = [
            128000,  # bos
            128006, 9125, 128007, 271,  # system header
            38766, 1303, 33025, 2696, 25, 6790, 220, 2366, 18, 198, 15724, 2696, 25, 220, 1114, 3799, 220, 2366, 19, 271,  # system date prompt
            2675, 527, 264, 11164, 430, 31680, 311, 9282, 20126, 13, 1472, 1288, 10052, 449, 279, 5089, 1511, 304, 279, 79002, 3813, 13, 128009,  # system message
            128006, 882, 128007, 271,  # user header
            19182, 11, 1148, 596, 279, 9499, 304, 12366, 1314, 1457, 30, 128009,  # user message
            128006, 78191, 128007, 271,  # assistant header
            5018, 609, 794, 330, 456, 11327, 54625, 498, 330, 14105, 794, 5324, 2588, 794, 330, 60704, 11, 9822, 498, 330, 3928, 794, 330, 66, 41347, 32075, 128009,  # assistant message
            128006, 23799, 4690, 128007, 271,  # tool header
            1, 1313, 13, 15, 1, 128009,  # tool message
            128006, 78191, 128007, 271,  # assistant header
            791, 9499, 304, 12366, 374, 220, 1313, 13, 15, 12628, 62447, 13, 128009  # assistant message
        ]

        expected_labels = [
            IGNORE_TOKEN_ID,  # bos
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # system header
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # system date prompt
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # system message
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # user header
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # user message
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # assistant header
            5018, 609, 794, 330, 456, 11327, 54625, 498, 330, 14105, 794, 5324, 2588, 794, 330, 60704, 11, 9822, 498, 330, 3928, 794, 330, 66, 41347, 32075, 128009,  # assistant message
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # tool header
            IGNORE_TOKEN_ID, 1313, 13, 15, IGNORE_TOKEN_ID, 128009,  # tool message
            IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID, IGNORE_TOKEN_ID,  # assistant header
            791, 9499, 304, 12366, 374, 220, 1313, 13, 15, 12628, 62447, 13, 128009  # assistant message
        ]
        # fmt: on

        assert input_ids == expected_input_ids, (
            f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
        )

        assert labels == expected_labels, (
            f"Labels mismatch: {labels} != {expected_labels}"
        )


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/prompt_strategies/test_chat_templates_advanced.py
================================================
"""
tests for chat_template prompt strategy
"""

from copy import deepcopy

import pytest
from datasets import Dataset
from tokenizers import AddedToken
from transformers import PreTrainedTokenizer

from axolotl.prompt_strategies.chat_template import (
    ChatTemplatePrompter,
    ChatTemplateStrategy,
)
from axolotl.prompters import IGNORE_TOKEN_ID
from axolotl.utils.chat_templates import get_chat_template
from axolotl.utils.logging import get_logger

from tests.hf_offline_utils import enable_hf_offline

LOG = get_logger(__name__)

PARAMETRIZE_KEYS = "tokenizer, chat_template, chat_template_jinja, eos_token"
PARAMETRIZE_PARAMS = [
    ("llama3_tokenizer", "llama3", None, None),
    ("llama3_tokenizer", "chatml", None, "<|im_end|>"),
    (
        "mistralv03_tokenizer",
        "jinja",
        "mistralv03_tokenizer_chat_template_jinja",
        "[/INST]",
    ),
    (
        "gemma2_tokenizer",
        "jinja",
        "gemma2_tokenizer_chat_template_jinja",
        "<end_of_turn>",
    ),
    # ("phi35_tokenizer", "phi_35", None, "<|end|>"),  # seems to be broken w transformers v5
    ("phi4_tokenizer", "phi_4", None, "<|im_end|>"),
]


@pytest.mark.parametrize(
    PARAMETRIZE_KEYS,
    PARAMETRIZE_PARAMS,
)
class TestChatTemplateConfigurations:
    """
    Test class for various configurations of ChatTemplateStrategy.
    """

    @staticmethod
    def setup_tokenizer(
        tokenizer_name,
        chat_template,
        chat_template_jinja=None,
        eos_token=None,
        request=None,
        eot_token=None,
    ) -> tuple[PreTrainedTokenizer, str]:
        """
        Helper function to set up the tokenizer and chat template for the test.
        """
        tokenizer = deepcopy(request.getfixturevalue(tokenizer_name))
        if chat_template == "jinja":
            chat_template_jinja = request.getfixturevalue(chat_template_jinja)
        if eos_token:
            tokenizer.add_special_tokens(
                {
                    "eos_token": AddedToken(
                        eos_token, rstrip=False, lstrip=False, normalized=False
                    )
                }
            )
            if tokenizer.__class__.__name__ in (
                "LlamaTokenizerFast",
                "CodeLlamaTokenizerFast",
            ):
                tokenizer.update_post_processor()

        if eot_token:
            tokenizer.add_special_tokens({"additional_special_tokens": [eot_token]})

        return tokenizer, chat_template_jinja

    def _should_skip_turn(self, tokenizer, turn, turn_idx, start_idx, end_idx):
        """Helper method to determine if a turn should be skipped in testing.
        This is used to skip system messages for Mistral as the template does not output them without more turns.
        """
        if (
            turn_idx == 0
            and turn.get("from") in ["system", "context"]
            and ("mistral" in tokenizer.name_or_path.lower())
        ):
            assert start_idx == -1 and end_idx == -1, (
                "Expected system message to be skipped"
            )
            return True
        return False

    @enable_hf_offline
    def test_train_on_inputs_true(
        self,
        tokenizer,
        chat_template,
        chat_template_jinja,
        eos_token,
        basic_dataset,
        request,
    ):
        LOG.info("Testing with train_on_inputs=True")

        tokenizer, chat_template_jinja = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, eos_token, request
        )

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                tokenizer,
                chat_template=get_chat_template(
                    chat_template, jinja_template=chat_template_jinja
                ),
                message_property_mappings={"role": "from", "content": "value"},
                field_messages="conversations",
            ),
            tokenizer=tokenizer,
            train_on_inputs=True,
            sequence_len=512,
            roles_to_train=["assistant"],
        )

        res = strategy.tokenize_prompt(basic_dataset[0])
        turns = strategy.get_conversation_thread(basic_dataset[0])
        labels = res["labels"]
        input_ids = res["input_ids"]

        # Verify assistant responses are labeled
        for i, turn in enumerate(basic_dataset[0]["conversations"]):
            start_idx, end_idx = strategy.find_turn(turns=turns, turn_idx=i)

            if self._should_skip_turn(tokenizer, turn, i, start_idx, end_idx):
                continue

            decoded_response = tokenizer.decode(input_ids[start_idx:end_idx])
            response = turn["value"]

            assert response in decoded_response, (
                f"Response {response} not found in index {start_idx}:{end_idx} "
                f"decoded:{decoded_response}"
            )

            assert all(
                label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx]
            ), (
                f"Expected labels for input '{response}' to be ignored, but got {labels[start_idx:end_idx]}"
            )

        LOG.debug("Full labels: %s", labels)
        LOG.debug("Full input_ids: %s", input_ids)

    def test_train_on_inputs_false(
        self,
        tokenizer,
        chat_template,
        chat_template_jinja,
        eos_token,
        basic_dataset,
        request,
    ):
        LOG.info("Testing with train_on_inputs=False, on assistant only")

        tokenizer, chat_template_jinja = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, eos_token, request
        )

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                tokenizer,
                chat_template=get_chat_template(
                    chat_template, jinja_template=chat_template_jinja
                ),
                message_property_mappings={"role": "from", "content": "value"},
                field_messages="conversations",
            ),
            tokenizer=tokenizer,
            train_on_inputs=False,
            sequence_len=512,
            roles_to_train=["assistant"],
        )

        res = strategy.tokenize_prompt(basic_dataset[0])
        turns = strategy.get_conversation_thread(basic_dataset[0])
        labels = res["labels"]
        input_ids = res["input_ids"]

        # Process all turns and verify correct labeling based on role
        for i, turn in enumerate(basic_dataset[0]["conversations"]):
            start_idx, end_idx = strategy.find_turn(turns=turns, turn_idx=i)

            if self._should_skip_turn(tokenizer, turn, i, start_idx, end_idx):
                continue

            decoded_response = tokenizer.decode(input_ids[start_idx:end_idx])
            response = turn["value"]

            assert response in decoded_response, (
                f"Response {response} not found in index {start_idx}:{end_idx} "
                f"decoded:{decoded_response}"
            )

            # Verify that assistant responses are labeled and other inputs are not
            is_assistant = turn["from"] == "assistant"
            if is_assistant:
                assert all(
                    label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx]
                ), (
                    f"Expected labels for assistant response '{response}' to be set, but got {labels[start_idx:end_idx]}"
                )
            else:
                assert all(
                    label == IGNORE_TOKEN_ID for label in labels[start_idx:end_idx]
                ), (
                    f"Expected labels for human input '{response}' to be IGNORE_TOKEN_ID, but got {labels[start_idx:end_idx]}"
                )

    def test_roles_to_train_human_assistant_only(
        self,
        tokenizer,
        chat_template,
        chat_template_jinja,
        eos_token,
        basic_dataset,
        request,
    ):
        LOG.info("Testing roles_to_train with human assistant only")

        tokenizer, chat_template_jinja = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, eos_token, request
        )

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                tokenizer,
                chat_template=get_chat_template(
                    chat_template, jinja_template=chat_template_jinja
                ),
                message_property_mappings={"role": "from", "content": "value"},
                field_messages="conversations",
            ),
            tokenizer=tokenizer,
            train_on_inputs=False,
            sequence_len=512,
            roles_to_train=["assistant", "human"],
        )

        res = strategy.tokenize_prompt(basic_dataset[0])
        turns = strategy.get_conversation_thread(basic_dataset[0])
        labels = res["labels"]
        input_ids = res["input_ids"]

        # Process all turns and verify correct labeling based on role
        for i, turn in enumerate(basic_dataset[0]["conversations"]):
            start_idx, end_idx = strategy.find_turn(turns=turns, turn_idx=i)

            if self._should_skip_turn(tokenizer, turn, i, start_idx, end_idx):
                continue

            decoded_response = tokenizer.decode(input_ids[start_idx:end_idx])
            response = turn["value"]

            assert response in decoded_response, (
                f"Response {response} not found in index {start_idx}:{end_idx} "
                f"decoded:{decoded_response}"
            )

            # Verify that non-system responses are labeled and system are not
            should_be_labelled = turn["from"] != "system"
            if should_be_labelled:
                assert all(
                    label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx]
                ), (
                    f"Expected labels for assistant response '{response}' to be set, but got {labels[start_idx:end_idx]}"
                )
            else:
                assert all(
                    label == IGNORE_TOKEN_ID for label in labels[start_idx:end_idx]
                ), (
                    f"Expected labels for human input '{response}' to be IGNORE_TOKEN_ID, but got {labels[start_idx:end_idx]}"
                )

    def test_roles_to_train_all(
        self,
        tokenizer,
        chat_template,
        chat_template_jinja,
        eos_token,
        basic_dataset,
        request,
    ):
        LOG.info("Testing roles_to_train with all roles")

        tokenizer, chat_template_jinja = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, eos_token, request
        )

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                tokenizer,
                chat_template=get_chat_template(
                    chat_template, jinja_template=chat_template_jinja
                ),
                message_property_mappings={"role": "from", "content": "value"},
                field_messages="conversations",
            ),
            tokenizer=tokenizer,
            train_on_inputs=True,
            sequence_len=512,
            roles_to_train=["human", "assistant"],
        )

        res = strategy.tokenize_prompt(basic_dataset[0])
        turns = strategy.get_conversation_thread(basic_dataset[0])
        labels = res["labels"]
        input_ids = res["input_ids"]

        # Verify that all responses are labeled (except for special tokens)
        for i, turn in enumerate(basic_dataset[0]["conversations"]):
            response = turn["value"]

            start_idx, end_idx = strategy.find_turn(turns=turns, turn_idx=i)

            if self._should_skip_turn(tokenizer, turn, i, start_idx, end_idx):
                continue

            decoded_response = tokenizer.decode(input_ids[start_idx:end_idx])
            assert response in decoded_response, (
                f"Response {response} not found in index {start_idx}:{end_idx} decoded:{decoded_response}"
            )

            assert all(
                label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx]
            ), (
                f"Expected labels for response '{response}' to be set, but got {labels[start_idx:end_idx]}"
            )

    def test_empty_roles_to_train(
        self,
        tokenizer,
        chat_template,
        chat_template_jinja,
        eos_token,
        basic_dataset,
        request,
    ):
        LOG.info("Testing with empty roles_to_train")

        tokenizer, chat_template_jinja = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, eos_token, request
        )

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                tokenizer,
                chat_template=get_chat_template(
                    chat_template, jinja_template=chat_template_jinja
                ),
                message_property_mappings={"role": "from", "content": "value"},
                field_messages="conversations",
            ),
            tokenizer=tokenizer,
            train_on_inputs=False,
            sequence_len=512,
            roles_to_train=[],
            train_on_eos="none",  # Add this line
        )

        res = strategy.tokenize_prompt(basic_dataset[0])
        labels = res["labels"]

        # Verify that no labels are set when roles_to_train is empty
        LOG.debug("Full labels: %s", labels)
        assert all(label == IGNORE_TOKEN_ID for label in labels), (
            "Expected all labels to be IGNORE_TOKEN_ID when roles_to_train is empty"
        )

    def test_train_on_eos_all(
        self,
        tokenizer,
        chat_template,
        chat_template_jinja,
        eos_token,
        basic_dataset,
        request,
    ):
        LOG.info("Testing with train_on_eos='all'")

        tokenizer, chat_template_jinja = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, eos_token, request
        )

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                tokenizer,
                chat_template=get_chat_template(
                    chat_template, jinja_template=chat_template_jinja
                ),
                message_property_mappings={"role": "from", "content": "value"},
                field_messages="conversations",
            ),
            tokenizer=tokenizer,
            train_on_inputs=False,
            sequence_len=512,
            roles_to_train=["assistant"],
            train_on_eos="all",
        )

        res = strategy.tokenize_prompt(basic_dataset[0])
        labels = res["labels"]
        input_ids = res["input_ids"]

        eos_token_id = tokenizer.eos_token_id
        eos_indices = [
            i for i, token_id in enumerate(input_ids) if token_id == eos_token_id
        ]

        assert len(eos_indices) > 0, "Expected at least one EOS token in the input"
        for eos_idx in eos_indices:
            assert labels[eos_idx] != IGNORE_TOKEN_ID, (
                f"Expected EOS token at index {eos_idx} to be labeled"
            )

    def test_train_on_eos_turn(
        self,
        tokenizer,
        chat_template,
        chat_template_jinja,
        eos_token,
        basic_dataset,
        request,
    ):
        LOG.info("Testing with train_on_eos='turn'")

        tokenizer, chat_template_jinja = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, eos_token, request
        )

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                tokenizer,
                chat_template=get_chat_template(
                    chat_template, jinja_template=chat_template_jinja
                ),
                message_property_mappings={"role": "from", "content": "value"},
                field_messages="conversations",
            ),
            tokenizer=tokenizer,
            train_on_inputs=False,
            sequence_len=512,
            roles_to_train=["assistant"],
            train_on_eos="turn",
        )
        res = strategy.tokenize_prompt(basic_dataset[0])
        turns = strategy.get_conversation_thread(basic_dataset[0])
        labels = res["labels"]
        input_ids = res["input_ids"]

        eos_token_id = tokenizer.eos_token_id
        # Process all turns and verify EOS token labeling
        for i, turn in enumerate(basic_dataset[0]["conversations"]):
            start_idx, end_idx = strategy.find_turn(turns=turns, turn_idx=i)

            if self._should_skip_turn(tokenizer, turn, i, start_idx, end_idx):
                continue

            decoded_response = tokenizer.decode(input_ids[start_idx:end_idx])
            response = turn["value"]

            assert response in decoded_response, (
                f"Response {response} not found in index {start_idx}:{end_idx} "
                f"decoded:{decoded_response}"
            )

            # Find the EOS token after this turn
            eos_idx = end_idx
            while eos_idx < len(input_ids) and input_ids[eos_idx] != eos_token_id:
                eos_idx += 1

            assert eos_idx < len(input_ids), (
                f"Could not find EOS token after '{response}'"
            )

            LOG.debug(
                f"Turn {i}: role={turn['from']}, content='{turn['value']}', start_idx={start_idx}, end_idx={end_idx}, eos_idx={eos_idx}"
            )

            LOG.debug(
                f"Labels for turn {i}: {labels[start_idx:end_idx]}, EOS label: {labels[eos_idx]}"
            )

            # Verify EOS token labeling based on role
            is_assistant = turn["from"] == "assistant"
            if is_assistant:
                assert labels[eos_idx] != IGNORE_TOKEN_ID, (
                    f"Expected EOS token after assistant response '{response}' to be labeled"
                )
            else:
                assert labels[eos_idx] == IGNORE_TOKEN_ID, (
                    f"Expected EOS token after non-assistant input '{response}' to not be labeled"
                )

    def test_train_on_eos_last(
        self,
        tokenizer,
        chat_template,
        chat_template_jinja,
        eos_token,
        basic_dataset,
        request,
    ):
        LOG.info("Testing with train_on_eos='last'")

        tokenizer, chat_template_jinja = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, eos_token, request
        )

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                tokenizer,
                chat_template=get_chat_template(
                    chat_template, jinja_template=chat_template_jinja
                ),
                message_property_mappings={"role": "from", "content": "value"},
                field_messages="conversations",
            ),
            tokenizer=tokenizer,
            train_on_inputs=False,
            sequence_len=512,
            roles_to_train=["assistant"],
            train_on_eos="last",
        )

        res = strategy.tokenize_prompt(basic_dataset[0])
        labels = res["labels"]
        input_ids = res["input_ids"]

        eos_token_id = tokenizer.eos_token_id
        eos_indices = [
            i for i, token_id in enumerate(input_ids) if token_id == eos_token_id
        ]

        assert len(eos_indices) > 0, "Expected at least one EOS token in the input"
        last_eos_idx = eos_indices[-1]

        # Check that only the last EOS token is labeled
        for idx in eos_indices[:-1]:
            assert labels[idx] == IGNORE_TOKEN_ID, (
                f"Expected EOS token at index {idx} to not be labeled"
            )
        assert labels[last_eos_idx] != IGNORE_TOKEN_ID, (
            f"Expected last EOS token at index {last_eos_idx} to be labeled"
        )

    def test_train_on_eos_none(
        self,
        tokenizer,
        chat_template,
        chat_template_jinja,
        eos_token,
        basic_dataset,
        request,
    ):
        LOG.info("Testing with train_on_eos='none'")

        tokenizer, chat_template_jinja = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, eos_token, request
        )

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                tokenizer,
                chat_template=get_chat_template(
                    chat_template, jinja_template=chat_template_jinja
                ),
                message_property_mappings={"role": "from", "content": "value"},
                field_messages="conversations",
            ),
            tokenizer=tokenizer,
            train_on_inputs=False,
            sequence_len=512,
            roles_to_train=["assistant"],
            train_on_eos="none",
        )

        res = strategy.tokenize_prompt(basic_dataset[0])
        labels = res["labels"]
        input_ids = res["input_ids"]

        eos_token_id = tokenizer.eos_token_id
        eos_indices = [
            i for i, token_id in enumerate(input_ids) if token_id == eos_token_id
        ]

        assert len(eos_indices) > 0, "Expected at least one EOS token in the input"
        for eos_idx in eos_indices:
            assert labels[eos_idx] == IGNORE_TOKEN_ID, (
                f"Expected EOS token at index {eos_idx} to not be labeled"
            )

    def test_drop_system_message(
        self,
        tokenizer,
        chat_template,
        chat_template_jinja,
        eos_token,
        basic_dataset,
        request,
    ):
        LOG.info("Testing with drop_system_message=True")
        tokenizer, chat_template_jinja = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, eos_token, request
        )

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                tokenizer,
                chat_template=get_chat_template(
                    chat_template, jinja_template=chat_template_jinja
                ),
                drop_system_message=True,
                message_property_mappings={"role": "from", "content": "value"},
                field_messages="conversations",
            ),
            tokenizer=tokenizer,
            train_on_inputs=False,
            sequence_len=512,
            roles_to_train=["assistant"],
        )

        res = strategy.tokenize_prompt(basic_dataset[0])
        input_ids = res["input_ids"]

        # Check if system message is not present in input_ids
        system_message = "You are an AI assistant."
        decoded_message = tokenizer.decode(input_ids)
        assert system_message not in decoded_message, (
            "Expected system message to be dropped"
        )

    def test_custom_roles(
        self,
        tokenizer,
        chat_template,
        chat_template_jinja,
        eos_token,
        request,
    ):
        LOG.info("Testing with custom roles mapping")
        custom_roles = {
            "user": ["human", "user"],
            "assistant": ["ai", "assistant"],
            "system": ["context"],
        }
        tokenizer, chat_template_jinja = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, eos_token, request
        )

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                tokenizer,
                chat_template=get_chat_template(
                    chat_template, jinja_template=chat_template_jinja
                ),
                roles=custom_roles,
                message_property_mappings={"role": "from", "content": "value"},
            ),
            tokenizer=tokenizer,
            train_on_inputs=False,
            sequence_len=512,
            roles_to_train=["ai"],
        )

        # Create a new dataset with modified role names
        modified_conversations = [
            {"from": "context", "value": "You are an AI assistant."},
            {"from": "human", "value": "Hello"},
            {"from": "ai", "value": "Hi there!"},
            {"from": "human", "value": "How are you?"},
            {"from": "ai", "value": "I'm doing well, thank you!"},
        ]

        modified_dataset = Dataset.from_dict({"messages": [modified_conversations]})

        res = strategy.tokenize_prompt(modified_dataset[0])
        turns = strategy.get_conversation_thread(modified_dataset[0])
        labels = res["labels"]
        input_ids = res["input_ids"]

        # Process all turns and verify labeling
        for i, turn in enumerate(modified_dataset[0]["messages"]):
            start_idx, end_idx = strategy.find_turn(turns=turns, turn_idx=i)

            if self._should_skip_turn(tokenizer, turn, i, start_idx, end_idx):
                continue

            decoded_response = tokenizer.decode(input_ids[start_idx:end_idx])
            response = turn["value"]

            assert response in decoded_response, (
                f"Response {response} not found in index {start_idx}:{end_idx} "
                f"decoded:{decoded_response}"
            )

            # Check if responses are labeled correctly based on role
            is_ai = turn["from"] == "ai"
            if is_ai:
                assert all(
                    label != IGNORE_TOKEN_ID for label in labels[start_idx:end_idx]
                ), f"Expected labels for AI response '{response}' to be set"
            else:
                assert all(
                    label == IGNORE_TOKEN_ID for label in labels[start_idx:end_idx]
                ), (
                    f"Expected labels for non-AI message '{response}' to be IGNORE_TOKEN_ID"
                )

    def test_message_field_training(
        self,
        tokenizer,
        chat_template,
        chat_template_jinja,
        eos_token,
        request,
    ):
        LOG.info("Testing with message_field_training")

        tokenizer, chat_template_jinja = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, eos_token, request
        )

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                tokenizer,
                chat_template=get_chat_template(
                    chat_template, jinja_template=chat_template_jinja
                ),
                message_field_training="train",
                message_field_training_detail="train_detail",
                message_property_mappings={"role": "from", "content": "value"},
            ),
            tokenizer=tokenizer,
            train_on_inputs=False,
            sequence_len=512,
            roles_to_train=[],
        )

        # Create a new dataset with the train and train_detail fields
        modified_conversation = [
            {"from": "system", "value": "You are an AI assistant.", "train": False},
            {"from": "human", "value": "Hello", "train": False},
            {"from": "assistant", "value": "Hello", "train": True},
            {"from": "human", "value": "How are you?", "train": True},
            {
                "from": "assistant",
                "value": "I'm doing very well, thank you!",
                "train_detail": [
                    {"begin_offset": 0, "end_offset": 8, "train": False},
                    {"begin_offset": 9, "end_offset": 18, "train": True},
                    {"begin_offset": 19, "end_offset": 30, "train": False},
                ],
            },
            {
                "from": "human",
                "value": "I'm doing very well, thank you!",
                "train": False,
            },
            {"from": "assistant", "value": "Hi there!", "train": True},
        ]

        modified_dataset = Dataset.from_dict({"messages": [modified_conversation]})

        res = strategy.tokenize_prompt(modified_dataset[0])
        turns = strategy.get_conversation_thread(modified_dataset[0])
        labels = res["labels"]
        input_ids = res["input_ids"]

        def verify_labels(labels_span, should_train, context_message):
            """Helper to verify if a span of labels matches expected training state"""
            if should_train:
                assert all(label != IGNORE_TOKEN_ID for label in labels_span), (
                    f"Expected all labels for {context_message} to be set, but got {labels_span}"
                )
            else:
                assert all(label == IGNORE_TOKEN_ID for label in labels_span), (
                    f"Expected all labels for {context_message} to be {IGNORE_TOKEN_ID}, but got {labels_span}"
                )

        # Process all turns and verify labeling
        for i, turn in enumerate(modified_dataset[0]["messages"]):
            start_idx, end_idx = strategy.find_turn(turns=turns, turn_idx=i)

            if self._should_skip_turn(tokenizer, turn, i, start_idx, end_idx):
                continue

            decoded_response = tokenizer.decode(input_ids[start_idx:end_idx])
            response = turn["value"]

            assert response in decoded_response, (
                f"Response {response} not found in index {start_idx}:{end_idx} "
                f"decoded:{decoded_response}"
            )

            LOG.debug(
                f"Processing turn {i}: role={turn['from']}, content='{turn['value']}', "
                f"start_idx={start_idx}, end_idx={end_idx}"
            )

            if turn.get("train_detail", None) is not None:
                # Handle detailed token-level training control
                tokenized_output = tokenizer(
                    turn["value"], return_offsets_mapping=True, add_special_tokens=False
                )
                assert tokenized_output["input_ids"] == input_ids[start_idx:end_idx], (
                    f"Tokenized input mismatch for turn: {turn['value']}\n"
                    f"Expected: {input_ids[start_idx:end_idx]}\nActual: {tokenized_output['input_ids']}\n"
                    f"This will likely be a mismatch between template content and encoded content"
                )

                token_offsets = tokenized_output["offset_mapping"]

                # Adjust token offsets
                for j in range(len(token_offsets) - 1):
                    token_offsets[j] = (
                        token_offsets[j][0],
                        token_offsets[j + 1][0] - 1,
                    )
                token_offsets[-1] = (token_offsets[-1][0], len(turn["value"]) - 1)

                adjusted_train_details = strategy.prompter.adjust_train_details(
                    turn["train_detail"], token_offsets
                )

                LOG.debug(f"Original train_details: {turn['train_detail']}")
                LOG.debug(f"Adjusted train_details: {adjusted_train_details}")

                # Get and verify token offsets
                turn_tokens = input_ids[start_idx:end_idx]
                token_offsets_unmasked = strategy.prompter.get_offsets_for_train_detail(
                    text=turn["value"],
                    train_details=adjusted_train_details,
                    mask_untrainable=False,
                )

                for i, offset in enumerate(token_offsets_unmasked):
                    assert token_offsets[i][0] == offset, (
                        f"Token start offsets mismatch for turn: {turn['value']}\n"
                        f"Expected: {token_offsets[i][0]}\nActual: {offset}"
                    )

                token_offsets_masked = strategy.prompter.get_offsets_for_train_detail(
                    text=turn["value"],
                    train_details=adjusted_train_details,
                    mask_untrainable=True,
                )
                LOG.debug(f"Token offsets: {token_offsets_masked}")

                # Verify expected labels against actual labels
                expected_labels = [IGNORE_TOKEN_ID] * len(turn_tokens)
                for i, offset in enumerate(token_offsets_masked):
                    if offset != IGNORE_TOKEN_ID:
                        expected_labels[i] = turn_tokens[i]
                actual_labels = labels[
                    start_idx : start_idx + len(token_offsets_masked)
                ]
                assert actual_labels == expected_labels, (
                    f"Labels mismatch for turn: {turn['value']}\nExpected: {expected_labels}\nActual: {actual_labels}"
                )

                # Verify each detail section
                for detail in adjusted_train_details:
                    detail_start = start_idx + next(
                        j
                        for j, offset in enumerate(token_offsets_unmasked)
                        if offset >= detail["begin_offset"]
                    )
                    detail_end = start_idx + next(
                        (
                            j
                            for j, offset in enumerate(token_offsets_unmasked)
                            if offset > detail["end_offset"]
                        ),
                        len(token_offsets),
                    )

                    detail_text = turn["value"][
                        detail["begin_offset"] : detail["end_offset"] + 1
                    ]
                    detail_labels = labels[detail_start:detail_end]

                    context = (
                        f"detail (ind {detail_start}:{detail_end}): '{detail_text}'\n"
                        f"decoded: '{tokenizer.decode(input_ids[detail_start:detail_end])}')"
                    )
                    verify_labels(detail_labels, detail["train"], context)
            else:
                # Handle regular turn-level training control
                should_train = turn.get("train", False)
                turn_labels = labels[start_idx:end_idx]
                context = (
                    f"turn (ind {start_idx}:{end_idx}): '{turn['value']}'\n"
                    f"decoded: '{decoded_response}')"
                )
                verify_labels(turn_labels, should_train, context)

        LOG.debug(f"Final labels: {labels}")
        LOG.debug(f"Final input_ids: {input_ids}")

    def test_get_chat_template_variables(
        self, tokenizer, chat_template, chat_template_jinja, eos_token, request
    ):
        LOG.info("Testing get_chat_template_variables")

        actual_tokenizer, actual_jinja_template = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, eos_token, request
        )

        prompter = ChatTemplatePrompter(
            actual_tokenizer,
            chat_template=get_chat_template(
                chat_template, jinja_template=actual_jinja_template
            ),
            message_property_mappings={"from": "role", "value": "content"},
        )

        variables = prompter.get_chat_template_msg_variables(
            (
                actual_jinja_template
                if actual_jinja_template
                else actual_tokenizer.get_chat_template()
            ),
            "messages",
        )

        # Special case for Mistral with additional tool variables
        if chat_template == "jinja" and tokenizer == "mistralv03_tokenizer":
            expected_variables = {"role", "content", "tool_call_id", "tool_calls"}
        # Most chat templates use the standard role and content variables
        elif chat_template in ["llama3", "chatml", "phi_35", "phi_4"] or (
            chat_template == "jinja" and tokenizer == "gemma2_tokenizer"
        ):
            expected_variables = {"role", "content"}
        else:
            LOG.warning(
                f"Unsupported chat template: {chat_template} with {chat_template_jinja}"
            )
            raise ValueError(
                f"Unsupported chat template: {chat_template} with {chat_template_jinja}"
            )

        assert variables == expected_variables, (
            f"Expected variables: {expected_variables} from {tokenizer}/{chat_template}\n"
            f"Got: {variables}\n"
            f"Chat template: {actual_jinja_template}"
        )

    def test_eot_tokens_conflict_with_eos_token(
        self,
        tokenizer,
        chat_template,
        chat_template_jinja,
        eos_token,
        basic_dataset,
        request,
    ):
        """Test that an error is raised when eot_tokens contains eos_token and train_on_eot/train_on_eos conflict"""
        LOG.info(
            "Testing conflict between eot_tokens containing eos_token and train_on_eot/train_on_eos mismatch"
        )

        tokenizer, chat_template_jinja = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, eos_token, request
        )

        # Create a situation where eot_tokens contains eos_token
        eot_tokens = [
            tokenizer.eos_token,
            "[/INST]",
        ]  # Deliberately including eos_token

        # Create conflicting train_on_eos and train_on_eot settings
        with pytest.raises(
            ValueError,
            match=".*eos_token is in eot_tokens and train_on_eos != train_on_eot.*",
        ):
            ChatTemplateStrategy(
                ChatTemplatePrompter(
                    tokenizer,
                    chat_template=get_chat_template(
                        chat_template, jinja_template=chat_template_jinja
                    ),
                    message_property_mappings={"role": "from", "content": "value"},
                    field_messages="conversations",
                ),
                tokenizer=tokenizer,
                train_on_inputs=False,
                sequence_len=512,
                roles_to_train=["assistant"],
                train_on_eos="none",  # Setting to none
                train_on_eot="turn",  # Different from train_on_eos
                eot_tokens=eot_tokens,
            )

    def test_eot_token_backward_compatibility(
        self,
        tokenizer,
        chat_template,
        chat_template_jinja,
        eos_token,
        basic_dataset,
        request,
    ):
        """Test that eot_tokens inherits from eos_token when not specified"""
        LOG.info("Testing backward compatibility that eot_token inherits eos_token")

        tokenizer, chat_template_jinja = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, eos_token, request
        )

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                tokenizer,
                chat_template=get_chat_template(
                    chat_template, jinja_template=chat_template_jinja
                ),
                message_property_mappings={"role": "from", "content": "value"},
                field_messages="conversations",
            ),
            tokenizer=tokenizer,
            train_on_inputs=False,
            sequence_len=512,
            roles_to_train=["assistant"],
            train_on_eos="turn",  # Setting train_on_eos to "turn"
        )

        # In backward compatibility mode, eot_tokens should be derived from eos_token
        assert strategy.eot_tokens == [tokenizer.eos_token], (
            f"Expected eot_tokens to inherit from eos_token, got {strategy.eot_tokens}"
        )
        assert strategy.train_on_eot == "turn", (
            f"Expected train_on_eot to inherit from train_on_eos, got {strategy.train_on_eot}"
        )

    def test_token_not_in_template(
        self,
        tokenizer,
        chat_template,
        chat_template_jinja,
        eos_token,
        basic_dataset,
        request,
    ):
        """Test runs even when tokens are not found in the template"""
        LOG.info("Testing runs even when tokens are not found in template")

        tokenizer, chat_template_jinja = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, eos_token, request
        )

        # Create a non-existent token that definitely won't be in the template
        non_existent_token = "[DEFINITELY_NOT_IN_TEMPLATE]"
        tokenizer.add_special_tokens(
            {"additional_special_tokens": [non_existent_token]}
        )

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                tokenizer,
                chat_template=get_chat_template(
                    chat_template, jinja_template=chat_template_jinja
                ),
                message_property_mappings={"role": "from", "content": "value"},
                field_messages="conversations",
            ),
            tokenizer=tokenizer,
            train_on_inputs=False,
            sequence_len=512,
            roles_to_train=["assistant"],
            eot_tokens=[non_existent_token],
        )

        # Force template check by calling tokenize_prompt
        strategy.tokenize_prompt(basic_dataset[0])

        # We can also check that a warning was logged, but there's
        # caplog conflicts when running with other tests
        # assert any(
        #     "not found in chat_template" in record.message for record in self._caplog.records
        # ), "Expected warning about token not found in template was not logged"

    def test_custom_eot_tokens(
        self,
        tokenizer,
        chat_template,
        chat_template_jinja,
        eos_token,
        basic_dataset,
        request,
    ):
        """Test with custom EOT tokens to ensure proper masking and training"""
        LOG.info("Testing with custom EOT tokens")

        tokenizer, chat_template_jinja = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, None, request
        )

        # Add custom EOT tokens to the tokenizer
        custom_eot = "[EOT]"
        tokenizer.add_special_tokens({"additional_special_tokens": [custom_eot]})

        # Create a custom chat template that uses our EOT token
        custom_template = """{% for message in messages %}{% if message['role'] == 'system' %}{{ message['content'] }}{% elif message['role'] == 'user' %}User: {{ message['content'] }}{% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }}[EOT]{% endif %}{% endfor %}"""

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                tokenizer,
                chat_template=custom_template,
                message_property_mappings={"role": "from", "content": "value"},
                field_messages="conversations",
            ),
            tokenizer=tokenizer,
            train_on_inputs=False,
            sequence_len=512,
            roles_to_train=["assistant"],
            train_on_eot="turn",  # Train on EOT token after each turn
            eot_tokens=[custom_eot],
        )

        res = strategy.tokenize_prompt(basic_dataset[0])
        labels = res["labels"]
        input_ids = res["input_ids"]

        # Find indices of the EOT token
        eot_token_id = tokenizer.convert_tokens_to_ids(custom_eot)
        eot_indices = [
            i for i, token_id in enumerate(input_ids) if token_id == eot_token_id
        ]

        assert len(eot_indices) > 0, "Expected at least one EOT token in the input"

        # Verify labeling for EOT tokens based on role
        turns = strategy.get_conversation_thread(basic_dataset[0])
        assistant_turn_indices = []
        non_assistant_turn_indices = []

        for i, turn in enumerate(basic_dataset[0]["conversations"]):
            start_idx, end_idx = strategy.find_turn(turns=turns, turn_idx=i)
            if start_idx != -1 and end_idx != -1:  # If turn is found
                if turn["from"] == "assistant":
                    assistant_turn_indices.append((start_idx, end_idx))
                else:
                    non_assistant_turn_indices.append((start_idx, end_idx))

        # Check EOT tokens after assistant turns are labeled
        for eot_idx in eot_indices:
            is_after_assistant = any(
                start_idx <= eot_idx <= end_idx + 1  # +1 to include the EOT token
                for start_idx, end_idx in assistant_turn_indices
            )

            if is_after_assistant:
                assert labels[eot_idx] != IGNORE_TOKEN_ID, (
                    f"Expected EOT token after assistant turn at index {eot_idx} to be labeled"
                )
            else:
                assert labels[eot_idx] == IGNORE_TOKEN_ID, (
                    f"Expected EOT token not after assistant turn at index {eot_idx} to not be labeled"
                )

    def test_multiple_train_on_eot_settings(
        self,
        tokenizer,
        chat_template,
        chat_template_jinja,
        eos_token,
        basic_dataset,
        request,
    ):
        """Test different train_on_eot settings"""
        LOG.info("Testing different train_on_eot settings")

        tokenizer, chat_template_jinja = self.setup_tokenizer(
            tokenizer, chat_template, chat_template_jinja, eos_token, request
        )

        # Create a list to test different train_on_eot settings
        test_settings = [
            ("none", lambda idx, is_assistant: False),  # Never train on EOT
            ("all", lambda idx, is_assistant: True),  # Always train on EOT
            (
                "turn",
                lambda idx, is_assistant: is_assistant,
            ),  # Train on EOT after assistant turns
            ("last", lambda idx, is_last: is_last),  # Only train on last EOT
        ]

        for setting, expected_train_func in test_settings:
            LOG.info(f"Testing train_on_eot='{setting}'")

            strategy = ChatTemplateStrategy(
                ChatTemplatePrompter(
                    tokenizer,
                    chat_template=get_chat_template(
                        chat_template, jinja_template=chat_template_jinja
                    ),
                    message_property_mappings={"role": "from", "content": "value"},
                    field_messages="conversations",
                ),
                tokenizer=tokenizer,
                train_on_inputs=False,
                sequence_len=512,
                roles_to_train=["assistant"],
                train_on_eot=setting,
                eot_tokens=[
                    tokenizer.eos_token
                ],  # Use eos_token as the EOT token for simplicity
            )

            res = strategy.tokenize_prompt(basic_dataset[0])
            turns = strategy.get_conversation_thread(basic_dataset[0])
            labels = res["labels"]
            input_ids = res["input_ids"]

            eos_token_id = tokenizer.eos_token_id
            eos_indices = [
                i for i, token_id in enumerate(input_ids) if token_id == eos_token_id
            ]

            assert len(eos_indices) > 0, (
                "Expected at least one EOS/EOT token in the input"
            )

            # Check labeling for each EOS/EOT token
            for idx, eos_idx in enumerate(eos_indices):
                # Find which turn this EOS token belongs to
                preceding_turn = None
                for i, turn in enumerate(basic_dataset[0]["conversations"]):
                    start_idx, end_idx = strategy.find_turn(turns=turns, turn_idx=i)
                    if (
                        start_idx != -1
                        and end_idx != -1
                        and start_idx <= eos_idx <= end_idx + 1
                    ):
                        preceding_turn = turn
                        break

                is_assistant = (
                    preceding_turn is not None and preceding_turn["from"] == "assistant"
                )
                is_last = idx == len(eos_indices) - 1

                expected_label = not expected_train_func(
                    idx, is_assistant if setting != "last" else is_last
                )

                if expected_label:
                    assert labels[eos_idx] == IGNORE_TOKEN_ID, (
                        f"Expected EOT token at index {eos_idx} to not be labeled with train_on_eot='{setting}'"
                    )
                else:
                    assert labels[eos_idx] != IGNORE_TOKEN_ID, (
                        f"Expected EOT token at index {eos_idx} to be labeled with train_on_eot='{setting}'"
                    )


class TestChatTemplateToolCalling:
    """
    Test class for tool calling functionality with chat templates.
    """

    def test_tool_calling_with_llama4_template(
        self,
        llama3_tokenizer,
    ):
        LOG.info("Testing tool calling with llama3 tokenizer and llama4 chat template")

        # Create tool calling dataset
        tool_calling_dataset = [
            {
                "tools": [
                    {
                        "type": "function",
                        "function": {
                            "name": "xml_escape",
                            "description": 'Replaces any "<", ">", or "&" characters in the input string with their corresponding XML entities.',
                            "parameters": {
                                "type": "object",
                                "properties": {
                                    "s": {
                                        "type": "string",
                                        "description": "The input string to be XML-escaped.",
                                    }
                                },
                                "required": ["s"],
                            },
                        },
                    },
                    {
                        "type": "function",
                        "function": {
                            "name": "multiples",
                            "description": "Generates a list of all the multiples of a number that are less than a given limit.",
                            "parameters": {
                                "type": "object",
                                "properties": {
                                    "number": {
                                        "type": "integer",
                                        "description": "The number to find multiples of.",
                                    },
                                    "limit": {
                                        "type": "integer",
                                        "description": "The upper limit for the multiples.",
                                    },
                                },
                                "required": ["number", "limit"],
                            },
                        },
                    },
                ],
                "messages": [
                    {
                        "role": "user",
                        "content": "Can you help me find multiples of 5 that are less than 20?",
                    },
                    {
                        "role": "assistant",
                        "tool_calls": [
                            {
                                "type": "function",
                                "function": {
                                    "name": "multiples",
                                    "arguments": {
                                        "number": 5,
                                        "limit": 20,
                                    },
                                },
                            }
                        ],
                    },
                    {"role": "tool", "name": "multiples", "content": "5,10,15"},
                    {
                        "role": "assistant",
                        "content": "The multiples of 5 less than 20 are: 5, 10, and 15.",
                    },
                ],
            }
        ]

        # Setup tokenizer with llama4 chat template
        tokenizer = deepcopy(llama3_tokenizer)

        # Add EOS token to the tokenizer
        eot_token = "<|eot_id|>"
        tokenizer.add_special_tokens({"additional_special_tokens": [eot_token]})

        strategy = ChatTemplateStrategy(
            ChatTemplatePrompter(
                tokenizer,
                chat_template=get_chat_template("llama4"),
                message_property_mappings={"role": "role", "content": "content"},
                field_messages="messages",
                field_tools="tools",
            ),
            tokenizer=tokenizer,
            train_on_inputs=False,
            sequence_len=512,
            roles_to_train=["assistant"],
            eot_tokens=[eot_token],
        )

        res = strategy.tokenize_prompt(tool_calling_dataset[0])
        input_ids = res["input_ids"]
        labels = res["labels"]

        # Verify that the input_ids contain expected tokens
        assert len(input_ids) > 0, "Input IDs should not be empty"
        assert len(labels) == len(input_ids), "Labels should match input_ids length"

        # Decode the full conversation to verify structure
        decoded_conversation = tokenizer.decode(input_ids)

        # Verify tool calling structure is present in the decoded conversation
        assert '"type": "function",' in decoded_conversation, (
            "Tool type function should be in conversation"
        )
        assert '"name": "multiples",' in decoded_conversation, (
            "Tool function name should be in conversation"
        )

        assert (
            '<|python_start|><|python_end|>{"name": "multiples", "parameters": {"number": 5, "limit": 20}}<|eot|>'
            in decoded_conversation
        ), "Assistant tool call should be in conversation"
        assert "<|header_start|>ipython<|header_end|>" in decoded_conversation, (
            "IPython header should be in conversation"
        )
        assert '"5,10,15"' in decoded_conversation, (
            "Tool response should be in conversation"
        )

        # Get conversation turns to verify labeling
        turns = strategy.get_conversation_thread(tool_calling_dataset[0])
        tools = strategy._get_tools(tool_calling_dataset[0])

        # Check that assistant responses are properly labeled
        for i, turn in enumerate(tool_calling_dataset[0]["messages"]):
            if turn["role"] == "assistant":
                start_idx, end_idx = strategy.find_turn(
                    turns=turns, turn_idx=i, tools=tools
                )

                assert start_idx != -1 and end_idx != -1, (
                    f"Assistant turn {i} should be found"
                )

                # Verify that assistant responses have proper labels
                turn_labels = labels[start_idx:end_idx]
                assert all(label != IGNORE_TOKEN_ID for label in turn_labels), (
                    f"Assistant turn {i} should be unmasked"
                )


================================================
FILE: tests/prompt_strategies/test_chat_templates_mistral.py
================================================
"""Test chat templates for mistral-common wrapper tokenizer"""

import unittest
from typing import TYPE_CHECKING

import pytest

if TYPE_CHECKING:
    from transformers import PreTrainedTokenizer

    from axolotl.utils.mistral import HFMistralTokenizer


# fmt: off
@pytest.mark.parametrize(
    ("tokenizer_str", "assistant_toolcall_ids", "tool_result_ids"),
    (
        ("magistral_tokenizer", (9, 44627, 3684, 33, 19881, 1049, 1050, 1051, 1052, 1053, 32, 19227, 12856, 2811, 1032, 1049, 1054, 1044, 1429, 33319, 2811, 1032, 1050, 1125, 2), (7, 19881, 1049, 1050, 1051, 1052, 1053, 19, 1049, 1044, 1050, 8)),
        ("devstral_tokenizer", (9, 1091, 19227, 2391, 2811, 1429, 44627, 3684, 1897, 1429, 61906, 2811, 16753, 12856, 2811, 1032, 1049, 1054, 1044, 1429, 33319, 2811, 1032, 1050, 4179, 1429, 1327, 2811, 1429, 19881, 1049, 1050, 1051, 1052, 1053, 1034, 27028, 2), (7, 19881, 1049, 1050, 1051, 1052, 1053, 19, 1049, 1044, 1050, 8)),
        ("devstral_1_1_tokenizer", (9, 44627, 3684, 32, 19227, 12856, 2811, 1032, 1049, 1054, 1044, 1429, 33319, 2811, 1032, 1050, 1125, 2,), (7, 1049, 1044, 1050, 8)),
    )
)
# fmt: on
def test_mistral_chat_template(
    tokenizer_str: str,
    assistant_toolcall_ids: tuple[int, ...],
    tool_result_ids: tuple[int, ...],
    request: pytest.FixtureRequest,
):
    """Test chat template with the Magistral/Devstral tokenizer"""

    from axolotl.prompt_strategies.chat_template import MistralPrompter, MistralStrategy

    tokenizer: HFMistralTokenizer = request.getfixturevalue(tokenizer_str)

    # check bos, eos, pad, unk are accessible properties
    assert tokenizer.bos_token_id == 1
    assert tokenizer.eos_token_id == 2
    assert tokenizer.pad_token_id == 11
    assert tokenizer.unk_token_id == 0

    assert tokenizer.pad_token == "<pad>"
    assert tokenizer.eos_token == "</s>"
    assert tokenizer.bos_token == "<s>"
    assert tokenizer.unk_token == "<unk>"

    strategy = MistralStrategy(
        MistralPrompter(
            tokenizer,
            chat_template=None,
            message_property_mappings={"role": "role", "content": "content"},
        ),
        tokenizer=tokenizer,
        train_on_inputs=False,
        train_on_eos="turn",
        sequence_len=512,
        roles_to_train=["assistant"],
    )

    # test chat template masking without system prompt
    res = strategy.tokenize_prompt(
        {
            "messages": [
                {"role": "user", "content": "Hello, how are you?"},
                {"role": "assistant", "content": "I'm doing great, thank you!"},
            ]
        }
    )

    assert res["input_ids"] == [
        1,  # bos
        3,  # [INST]
        22177,  # Hello
        1044,  # ,
        2606,  # how
        1584,  # are
        1636,  # you
        1063,  # ?
        4,  # [/INST]
        1073,  # I
        4525,  # 'm
        6965,  # doing
        4824,  # great
        1044,  # ,
        15412,  # thank
        1636,  # you
        1033,  # !
        2,  # </s>
    ]

    assert res["labels"] == [
        -100,  # bos
        -100,  # [INST]
        -100,  # Hello
        -100,  # ,
        -100,  # how
        -100,  # are
        -100,  # you
        -100,  # ?
        -100,  # [/INST]
        1073,  # I
        4525,  # 'm
        6965,  # doing
        4824,  # great
        1044,  # ,
        15412,  # thank
        1636,  # you
        1033,  # !
        2,  # </s>
    ]

    # test chat template masking with system prompt
    res = strategy.tokenize_prompt(
        {
            "messages": [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": "Hello, how are you?"},
                {"role": "assistant", "content": "I'm doing great, thank you!"},
            ]
        }
    )

    assert res["input_ids"] == [
        1,  # bos
        17,  # [SYSTEM_PROMPT]
        4568,  # You
        1584,  # are
        1261,  # a
        20351,  # helpful
        27089,  # assistant
        1046,  # .
        18,  # [/SYSTEM_PROMPT]
        3,  # [INST]
        22177,  # Hello
        1044,  # ,
        2606,  # how
        1584,  # are
        1636,  # you
        1063,  # ?
        4,  # [/INST]
        1073,  # I
        4525,  # 'm
        6965,  # doing
        4824,  # great
        1044,  # ,
        15412,  # thank
        1636,  # you
        1033,  # !
        2,  # </s>
    ]

    assert res["labels"] == [
        -100,  # bos
        -100,  # [SYSTEM_PROMPT]
        -100,  # You
        -100,  # are
        -100,  # a
        -100,  # helpful
        -100,  # assistant
        -100,  # .
        -100,  # [/SYSTEM_PROMPT]
        -100,  # [INST]
        -100,  # Hello
        -100,  # ,
        -100,  # how
        -100,  # are
        -100,  # you
        -100,  # ?
        -100,  # [/INST]
        1073,  # I
        4525,  # 'm
        6965,  # doing
        4824,  # great
        1044,  # ,
        15412,  # thank
        1636,  # you
        1033,  # !
        2,  # </s>
    ]

    # test chat template with tools
    res = strategy.tokenize_prompt(
        {
            "tools": [
                {
                    "type": "function",
                    "function": {
                        "name": "multiples",
                        "description": "Generates a list of all the multiples of a number that are less than a given limit.",
                        "parameters": {
                            "type": "object",
                            "properties": {
                                "number": {
                                    "type": "integer",
                                    "description": "The number to find multiples of.",
                                },
                                "limit": {
                                    "type": "integer",
                                    "description": "The upper limit for the multiples.",
                                },
                            },
                            "required": ["number", "limit"],
                        },
                    },
                },
            ],
            "messages": [
                {
                    "role": "user",
                    "content": "Hey, can you give me a breakdown of how to throw an awesome themed party? Like, what themes work best, and how can I set everything up to really wow my guests? I want some ideas on decorations, food, and activities that will make the party unforgettable!",
                },
                {
                    "role": "assistant",
                    "tool_calls": [
                        {
                            "id": "call12345",
                            "type": "function",
                            "function": {
                                "name": "multiples",
                                "arguments": {
                                    "number": 16,
                                    "limit": 2,
                                },
                            },
                        }
                    ],
                },
                {
                    "role": "tool",
                    "tool_call_id": "call12345",
                    "name": "multiples",
                    "content": "1,2",
                },
                {"role": "assistant", "content": "The multiples of 16 is 1 and 2."},
            ],
        }
    )

    # fmt: off
    assert res["input_ids"] == [
        1,  # bos
        5, 1091, 19227, 4994, 2811, 1429, 5165, 1897, 1429, 5165, 2811, 16753, 2391, 2811, 1429, 44627, 3684, 1897, 1429, 14653, 2811, 1429, 10639, 2130, 1261, 2951, 1307, 1747, 1278, 60092, 1307, 1261, 2782, 1455, 1584, 4289, 2224, 1261, 4265, 6139, 39249, 1429, 26204, 2811, 16753, 4994, 2811, 1429, 6371, 1897, 1429, 48649, 2811, 16753, 12856, 2811, 16753, 4994, 2811, 1429, 49039, 1897, 1429, 14653, 2811, 1429, 1784, 2782, 1317, 3081, 60092, 1307, 2613, 4179, 1429, 33319, 2811, 16753, 4994, 2811, 1429, 49039, 1897, 1429, 14653, 2811, 1429, 1784, 9229, 6139, 1394, 1278, 60092, 2613, 47579, 1429, 15760, 2811, 12161, 12856, 1897, 1429, 33319, 4964, 2821, 27028, 6,  # tool prompt
        3, 46634, 1044, 1710, 1636, 5628, 1639, 1261, 44433, 1307, 2606, 1317, 5388, 1420, 54191, 2424, 1286, 8967, 1063, 15621, 1044, 2549, 30305, 2196, 3560, 1044, 1321, 2606, 1710, 1362, 2016, 8605, 2015, 1317, 5524, 118931, 2036, 32951, 1063, 1362, 2933, 2269, 12106, 1408, 101987, 1044, 6939, 1044, 1321, 9216, 1455, 2084, 3180, 1278, 8967, 119141, 1689, 5935, 1033, 4,  # user
        *assistant_toolcall_ids,  # assistant tool calling
        *tool_result_ids,  # tool result
        1784, 60092, 1307, 1032, 1049, 1054, 1395, 1032, 1049, 1321, 1032, 1050, 1046,  # assistant
        2  # eos
    ]

    assert res["labels"] == [
        -100,  # bos
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,  # tool prompt
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,  # user prompt
        *assistant_toolcall_ids,  # assistant tool calling
        *([-100] * len(tool_result_ids)),  # tool result
        1784, 60092, 1307, 1032, 1049, 1054, 1395, 1032, 1049, 1321, 1032, 1050, 1046,  # assistant
        2  # eos
    ]
    # fmt: on

    # test chat template with tokenize=False
    res = tokenizer.apply_chat_template(
        [
            {"role": "user", "content": "Hello, how are you?"},
            {"role": "assistant", "content": "I'm doing great, thank you!"},
        ],
        tokenize=False,
    )

    assert res == "<s>[INST]Hello, how are you?[/INST]I'm doing great, thank you!</s>"

    # test encode
    res = tokenizer.encode("Hello, how are you?", add_special_tokens=True)
    assert res == [
        1,  # bos
        22177,  # Hello
        1044,  # ,
        2606,  # how
        1584,  # are
        1636,  # you
        1063,  # ?
        2,  # eos
    ]

    # test decode no skip special tokens
    decoded_res = tokenizer.decode(res, skip_special_tokens=False)

    assert decoded_res == "<s>Hello, how are you?</s>"

    # test decode skip special tokens
    decoded_res = tokenizer.decode(res, skip_special_tokens=True)
    assert decoded_res == "Hello, how are you?"

    # test encode no special tokens
    res = tokenizer.encode("Hello, how are you?", add_special_tokens=False)
    assert res == [
        22177,  # Hello
        1044,  # ,
        2606,  # how
        1584,  # are
        1636,  # you
        1063,  # ?
    ]

    # test convert ids to tokens
    res = tokenizer.convert_ids_to_tokens(res)
    # spacing are needed as we are converting without decoding
    assert res == ["Hello", ",", " how", " are", " you", "?"]


@pytest.mark.skip(reason="TODO, fix for new HF wrapper call")
def test_magistral_tokenizer_pad_method(magistral_tokenizer: "HFMistralTokenizer"):
    """Test the MistralTokenizer pad method"""
    from axolotl.utils.collators.core import IGNORE_INDEX

    magistral_pad_token_id = 11  # taken from tokenizer.pad_token_id

    # Test padding with input_ids and labels only
    features = [
        {"input_ids": [1, 2, 3], "labels": [4, 5, 6]},
        {"input_ids": [7, 8], "labels": [9, 10]},
    ]

    result = magistral_tokenizer.pad(features, padding=True, return_tensors="pt")

    # Check that input_ids are padded correctly
    assert result["input_ids"].shape == (2, 3)
    assert result["input_ids"].tolist() == [[1, 2, 3], [7, 8, magistral_pad_token_id]]

    # Check that labels are padded correctly
    assert result["labels"].shape == (2, 3)
    assert result["labels"].tolist() == [[4, 5, 6], [9, 10, IGNORE_INDEX]]

    # Check that attention_mask and position_ids are NOT created
    assert "attention_mask" not in result
    assert "position_ids" not in result

    # Test padding with attention_mask
    features_with_attention = [
        {"input_ids": [1, 2, 3], "labels": [4, 5, 6], "attention_mask": [1, 1, 1]},
        {"input_ids": [7, 8], "labels": [9, 10], "attention_mask": [1, 1]},
    ]

    result = magistral_tokenizer.pad(
        features_with_attention, padding=True, return_tensors="pt"
    )

    # Check that attention_mask is padded correctly
    assert result["attention_mask"].shape == (2, 3)
    assert result["attention_mask"].tolist() == [[1, 1, 1], [1, 1, 0]]

    # Test padding with position_ids
    features_with_position = [
        {"input_ids": [1, 2, 3], "labels": [4, 5, 6], "position_ids": [0, 1, 2]},
        {"input_ids": [7, 8], "labels": [9, 10], "position_ids": [0, 1]},
    ]

    result = magistral_tokenizer.pad(
        features_with_position, padding=True, return_tensors="pt"
    )

    # Check that position_ids are padded correctly (continuing sequence)
    assert result["position_ids"].shape == (2, 3)
    assert result["position_ids"].tolist() == [[0, 1, 2], [0, 1, 2]]

    # Test padding with all fields
    features_all = [
        {
            "input_ids": [1, 2, 3],
            "labels": [4, 5, 6],
            "attention_mask": [1, 1, 1],
            "position_ids": [0, 1, 2],
        },
        {
            "input_ids": [7, 8],
            "labels": [9, 10],
            "attention_mask": [1, 1],
            "position_ids": [0, 1],
        },
    ]

    result = magistral_tokenizer.pad(features_all, padding=True, return_tensors="pt")

    # All fields should be present and correctly padded
    assert "input_ids" in result
    assert "labels" in result
    assert "attention_mask" in result
    assert "position_ids" in result

    # Test padding with all sequences same length
    features_same_length = [
        {"input_ids": [1, 2, 3], "labels": [4, 5, 6]},
        {"input_ids": [7, 8, 9], "labels": [10, 11, 12]},
    ]

    result = magistral_tokenizer.pad(
        features_same_length, padding=True, return_tensors="pt"
    )

    # Check match when no padding is needed
    assert result["input_ids"][0].tolist() == features_same_length[0]["input_ids"]
    assert result["labels"][0].tolist() == features_same_length[0]["labels"]

    assert result["input_ids"][1].tolist() == features_same_length[1]["input_ids"]
    assert result["labels"][1].tolist() == features_same_length[1]["labels"]

    # Test padding with max_length parameter
    result = magistral_tokenizer.pad(
        features, padding="max_length", max_length=5, return_tensors="pt"
    )

    # Should pad to max_length
    assert result["input_ids"].shape == (2, 5)
    assert result["labels"].shape == (2, 5)

    # Test numpy return type
    result = magistral_tokenizer.pad(features, padding=True, return_tensors="np")

    # Should return numpy arrays
    import numpy as np

    assert isinstance(result["input_ids"], np.ndarray)
    assert isinstance(result["labels"], np.ndarray)

    # Test unsupported field rejection
    features_unsupported = [
        {"input_ids": [1, 2, 3], "labels": [4, 5, 6], "unsupported_field": [7, 8, 9]},
    ]

    with pytest.raises(NotImplementedError, match="unsupported_field"):
        magistral_tokenizer.pad(features_unsupported, padding=True, return_tensors="pt")

    # Test token_type_ids rejection
    features_token_type = [
        {"input_ids": [1, 2, 3], "labels": [4, 5, 6], "token_type_ids": [0, 0, 0]},
    ]

    with pytest.raises(ValueError, match="token_type_ids is not supported"):
        magistral_tokenizer.pad(features_token_type, padding=True, return_tensors="pt")


def test_magistral_tool_calling(magistral_tokenizer: "HFMistralTokenizer"):
    """Test tool calling with the Magistral tokenizer"""
    from axolotl.prompt_strategies.chat_template import MistralPrompter, MistralStrategy

    strategy = MistralStrategy(
        MistralPrompter(
            magistral_tokenizer,
            chat_template=None,
            message_property_mappings={"role": "role", "content": "content"},
        ),
        tokenizer=magistral_tokenizer,
        train_on_inputs=False,
        train_on_eos="turn",
        sequence_len=512,
        roles_to_train=["assistant"],
    )

    # Test basic tool calling with single function
    basic_tool_calling = {
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Get the current weather for a location",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "location": {
                                "type": "string",
                                "description": "The city and state, e.g. San Francisco, CA",
                            },
                        },
                        "required": ["location"],
                    },
                },
            },
        ],
        "messages": [
            {
                "role": "user",
                "content": "What's the weather like in San Francisco?",
            },
            {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": "call12345",
                        "type": "function",
                        "function": {
                            "name": "get_weather",
                            "arguments": {
                                "location": "San Francisco, CA",
                            },
                        },
                    }
                ],
            },
            {
                "role": "tool",
                "tool_call_id": "call12345",
                "name": "get_weather",
                "content": "Sunny, 72°F",
            },
            {
                "role": "assistant",
                "content": "The weather in San Francisco is sunny and 72°F.",
            },
        ],
    }

    res = strategy.tokenize_prompt(basic_tool_calling)

    # Basic validation
    assert "input_ids" in res
    assert "labels" in res
    assert len(res["input_ids"]) > 0
    assert len(res["labels"]) == len(res["input_ids"])

    # Decode and verify structure
    decoded = magistral_tokenizer.decode(res["input_ids"])
    assert (
        '<s>[AVAILABLE_TOOLS][{"type": "function", "function": {"name": "get_weather", "description": "Get the current weather for a location", "parameters": {"type": "object", "properties": {"location": {"type": "string", "description": "The city and state, e.g. San Francisco, CA"}}, "required": ["location"]}}}][/AVAILABLE_TOOLS]'
        in decoded
    )
    assert (
        '[TOOL_CALLS]get_weather[CALL_ID]call12345[ARGS]{"location": "San Francisco, CA"}</s>'
        in decoded
    )
    assert "[TOOL_RESULTS]call12345[TOOL_CONTENT]Sunny, 72°F[/TOOL_RESULTS]" in decoded
    assert "The weather in San Francisco is sunny and 72°F.</s>" in decoded

    # Test multiple tool calls in sequence
    multi_tool_calling = {
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "add_numbers",
                    "description": "Add two numbers together",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "a": {"type": "number", "description": "First number"},
                            "b": {"type": "number", "description": "Second number"},
                        },
                        "required": ["a", "b"],
                    },
                },
            },
            {
                "type": "function",
                "function": {
                    "name": "multiply_numbers",
                    "description": "Multiply two numbers",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "x": {"type": "number", "description": "First number"},
                            "y": {"type": "number", "description": "Second number"},
                        },
                        "required": ["x", "y"],
                    },
                },
            },
        ],
        "messages": [
            {
                "role": "user",
                "content": "Add 5 and 3, then multiply the result by 2",
            },
            {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": "call12345",
                        "type": "function",
                        "function": {
                            "name": "add_numbers",
                            "arguments": {"a": 5, "b": 3},
                        },
                    }
                ],
            },
            {
                "role": "tool",
                "tool_call_id": "call12345",
                "name": "add_numbers",
                "content": "8",
            },
            {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": "call23456",
                        "type": "function",
                        "function": {
                            "name": "multiply_numbers",
                            "arguments": {"x": 8, "y": 2},
                        },
                    }
                ],
            },
            {
                "role": "tool",
                "tool_call_id": "call23456",
                "name": "multiply_numbers",
                "content": "16",
            },
            {
                "role": "assistant",
                "content": "The result is 16. I first added 5 and 3 to get 8, then multiplied 8 by 2 to get 16.",
            },
        ],
    }

    res = strategy.tokenize_prompt(multi_tool_calling)

    # Validation
    assert len(res["input_ids"]) > 0
    assert len(res["labels"]) == len(res["input_ids"])

    decoded = magistral_tokenizer.decode(res["input_ids"])
    assert (
        '<s>[AVAILABLE_TOOLS][{"type": "function", "function": {"name": "add_numbers", "description": "Add two numbers together", "parameters": {"type": "object", "properties": {"a": {"type": "number", "description": "First number"}, "b": {"type": "number", "description": "Second number"}}, "required": ["a", "b"]}}}, {"type": "function", "function": {"name": "multiply_numbers", "description": "Multiply two numbers", "parameters": {"type": "object", "properties": {"x": {"type": "number", "description": "First number"}, "y": {"type": "number", "description": "Second number"}}, "required": ["x", "y"]}}}][/AVAILABLE_TOOLS]'
        in decoded
    )
    assert (
        '[TOOL_CALLS]add_numbers[CALL_ID]call12345[ARGS]{"a": 5, "b": 3}</s>' in decoded
    )
    assert "[TOOL_RESULTS]call12345[TOOL_CONTENT]8[/TOOL_RESULTS]" in decoded
    assert (
        '[TOOL_CALLS]multiply_numbers[CALL_ID]call23456[ARGS]{"x": 8, "y": 2}</s>'
        in decoded
    )
    assert "[TOOL_RESULTS]call23456[TOOL_CONTENT]16[/TOOL_RESULTS]" in decoded
    assert (
        "The result is 16. I first added 5 and 3 to get 8, then multiplied 8 by 2 to get 16.</s>"
        in decoded
    )

    # Test tool calling with system message
    system_tool_calling = {
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "search_database",
                    "description": "Search for information in database",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "query": {"type": "string", "description": "Search query"},
                        },
                        "required": ["query"],
                    },
                },
            },
        ],
        "messages": [
            {
                "role": "system",
                "content": "You are a helpful assistant with access to a database.",
            },
            {
                "role": "user",
                "content": "Find information about Python programming",
            },
            {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": "search123",
                        "type": "function",
                        "function": {
                            "name": "search_database",
                            "arguments": {"query": "Python programming"},
                        },
                    }
                ],
            },
            {
                "role": "tool",
                "tool_call_id": "search123",
                "name": "search_database",
                "content": "Python is a high-level programming language known for its simplicity.",
            },
            {
                "role": "assistant",
                "content": "Based on the database search, Python is a high-level programming language known for its simplicity and readability.",
            },
        ],
    }

    res = strategy.tokenize_prompt(system_tool_calling)

    # Validation
    assert len(res["input_ids"]) > 0
    assert len(res["labels"]) == len(res["input_ids"])

    decoded = magistral_tokenizer.decode(res["input_ids"])

    assert (
        '<s>[SYSTEM_PROMPT]You are a helpful assistant with access to a database.[/SYSTEM_PROMPT][AVAILABLE_TOOLS][{"type": "function", "function": {"name": "search_database", "description": "Search for information in database", "parameters": {"type": "object", "properties": {"query": {"type": "string", "description": "Search query"}}, "required": ["query"]}}}][/AVAILABLE_TOOLS]'
        in decoded
    )

    # Test error handling - missing tool response
    incomplete_tool_calling = {
        "tools": [
            {
                "type": "function",
                "function": {
                    "name": "get_time",
                    "description": "Get current time",
                    "parameters": {"type": "object", "properties": {}},
                },
            },
        ],
        "messages": [
            {
                "role": "user",
                "content": "What time is it?",
            },
            {
                "role": "assistant",
                "tool_calls": [
                    {
                        "id": "time12345",
                        "type": "function",
                        "function": {
                            "name": "get_time",
                            "arguments": {},
                        },
                    }
                ],
            },
            {
                "role": "assistant",
                "content": "The current time is 12:00 PM.",
            },
        ],
    }

    from mistral_common.exceptions import InvalidMessageStructureException

    try:
        strategy.tokenize_prompt(incomplete_tool_calling)
    except InvalidMessageStructureException as e:
        assert "Not the same number of function calls and responses" in str(e)


@pytest.mark.skip(reason="TODO, fix for new HF wrapper call")
def test_magistral_tokenizer_call_method(
    magistral_tokenizer: "HFMistralTokenizer", llama3_tokenizer: "PreTrainedTokenizer"
):
    """Test the __call__ method behavior matches HuggingFace standards"""
    from copy import deepcopy

    import numpy as np
    import torch

    hf_tokenizer = deepcopy(llama3_tokenizer)
    hf_tokenizer.pad_token = hf_tokenizer.eos_token

    test_text = "Hello, how are you?"
    batch_texts = ["Hello world", "How are you?"]

    # Test single string with return_tensors=None
    hf_result: dict[str, list[int]] = hf_tokenizer(test_text, return_tensors=None)
    mistral_result: dict[str, list[int]] = magistral_tokenizer(
        test_text, return_tensors=None
    )

    assert isinstance(mistral_result, dict)
    assert set(mistral_result.keys()) == {"input_ids", "attention_mask"}
    assert isinstance(mistral_result["input_ids"], type(hf_result["input_ids"]))  # list
    assert isinstance(
        mistral_result["attention_mask"], type(hf_result["attention_mask"])
    )
    assert len(mistral_result["input_ids"]) == len(mistral_result["attention_mask"])
    assert np.all(mistral_result["attention_mask"])
    assert len(np.array(mistral_result["input_ids"]).shape) == 1  # 1D array

    # Test single string with return_tensors='pt'
    hf_result_pt: dict[str, torch.Tensor] = hf_tokenizer(test_text, return_tensors="pt")
    mistral_result_pt: dict[str, torch.Tensor] = magistral_tokenizer(
        test_text, return_tensors="pt"
    )

    # Check structure and types
    assert isinstance(mistral_result_pt["input_ids"], torch.Tensor)
    assert isinstance(mistral_result_pt["attention_mask"], torch.Tensor)

    # Check shapes match (don't compare token dimension)
    assert len(hf_result_pt["input_ids"].shape) == len(
        mistral_result_pt["input_ids"].shape
    )
    assert hf_result_pt["input_ids"].shape[0] == mistral_result_pt["input_ids"].shape[0]
    assert (
        mistral_result_pt["attention_mask"].shape
        == mistral_result_pt["input_ids"].shape
    )
    assert torch.all(mistral_result_pt["attention_mask"] == 1)

    # Test batch input with padding
    hf_batch: dict[str, torch.Tensor] = hf_tokenizer(
        batch_texts, return_tensors="pt", padding=True
    )
    mistral_batch: dict[str, torch.Tensor] = magistral_tokenizer(
        batch_texts, return_tensors="pt", padding=True
    )

    # Check batch behavior
    assert len(hf_batch["input_ids"].shape) == len(mistral_batch["input_ids"].shape)
    assert hf_batch["input_ids"].shape[0] == mistral_batch["input_ids"].shape[0]
    assert mistral_batch["attention_mask"].shape == mistral_batch["input_ids"].shape
    assert torch.any(
        mistral_batch["attention_mask"][0] == 0
    )  # padding in shorter sequence
    assert torch.all(
        mistral_batch["attention_mask"][1] == 1
    )  # no padding in longer sequence

    # Test numpy tensors
    mistral_result_np: dict[str, np.ndarray] = magistral_tokenizer(
        test_text, return_tensors="np"
    )
    assert isinstance(mistral_result_np["input_ids"], np.ndarray)
    assert isinstance(mistral_result_np["attention_mask"], np.ndarray)

    # Test consistency with encode()
    encoded: list[int] = magistral_tokenizer.encode(test_text, add_special_tokens=True)
    called: dict[str, torch.Tensor] = magistral_tokenizer(
        test_text, return_tensors="pt"
    )
    assert encoded == called["input_ids"][0].tolist()

    # Test Error handling
    with pytest.raises(ValueError, match="Unsupported kwargs"):
        magistral_tokenizer(test_text, unsupported_param=True)

    with pytest.raises(
        ValueError, match="return_tensors='pt' or 'np' requires padding or truncation"
    ):
        magistral_tokenizer(batch_texts, return_tensors="pt")


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/prompt_strategies/test_chat_templates_thinking.py
================================================
"""
Tests for splitting reasoning/thinking from content into separate field
"""

import pytest
from datasets import Dataset

from axolotl.prompt_strategies.chat_template import (
    load,
)
from axolotl.utils.dict import DictDefault


@pytest.fixture(name="messages_w_reasoning")
def messages_w_reasoning_fixture():
    return Dataset.from_list(
        [
            {
                "messages": [
                    {
                        "role": "user",
                        "content": "hello",
                    },
                    {
                        "role": "assistant",
                        "content": "<think>lorem</think>\nwelcome",
                    },
                ]
            },
            {
                "messages": [
                    {
                        "role": "user",
                        "content": "hello",
                    },
                    {
                        "role": "assistant",
                        "content": "<|begin_of_thought|>lorem<|end_of_thought|>\n<|begin_of_solution|>welcome\n<|end_of_solution|>",
                    },
                ]
            },
            {
                "messages": [
                    {
                        "role": "user",
                        "content": "hello",
                    },
                    {
                        "role": "assistant",
                        "content": "<reasoning>lorem</reasoning>\nwelcome",
                    },
                ]
            },
        ]
    )


class TestSplitThinking:
    """
    test class to make sure datasets with reasoning content conforms to the chat_template strategy
    """

    def test_splits_think(self, messages_w_reasoning, qwen3_tokenizer):
        strategy = load(
            qwen3_tokenizer,
            DictDefault(
                {
                    "train_on_inputs": False,
                    "sequence_len": 512,
                }
            ),
            DictDefault(
                {
                    "chat_template": "qwen3",
                    "message_field_role": "role",
                    "message_field_content": "content",
                    "message_property_mappings": {
                        "role": "role",
                        "content": "content",
                    },
                    "roles": {
                        "user": ["user"],
                        "assistant": ["assistant"],
                        "system": ["system"],
                    },
                    "field_messages": "messages",
                    "split_thinking": True,
                }
            ),
        )
        for conversation in messages_w_reasoning:
            transformed_prompt = strategy.get_conversation_thread(conversation)
            assert transformed_prompt[0]["role"] == "user"
            assert transformed_prompt[1]["role"] == "assistant"
            assert transformed_prompt[1]["reasoning_content"] == "lorem"
            assert transformed_prompt[1]["content"] == "welcome"

            res = strategy.tokenize_prompt(conversation)
            input_ids = res["input_ids"]
            # fmt: off
            expected_input_ids = [
                151644,  # im_start
                872,  # user
                198,  # \n
                14990,  # hello
                151645,  # im_end
                198,  # \n
                151644,  # im_start
                77091,  # assistant
                198,  # \n
                151667,  # think
                198,  # \n
                385, 1826,  # lorem
                198,  # \n
                151668,  # /think
                271,  # \n
                34084,  # welcome
                151645,  # im_end
                198,  # \n
            ]
            # fmt: on
            assert input_ids == expected_input_ids, (
                f"Input IDs mismatch: {input_ids} != {expected_input_ids}"
            )


================================================
FILE: tests/prompt_strategies/test_chat_templates_tool_call_string_arguments.py
================================================
"""
Tests for handling json tool content
"""

import json

import pytest
from datasets import Dataset

from axolotl.prompt_strategies.chat_template import (
    load,
)
from axolotl.utils.dict import DictDefault


@pytest.fixture(name="qwen3_instruct_prompt_strategy")
def qwen3_instruct_chat_template_strategy(qwen3_tokenizer):
    strategy = load(
        qwen3_tokenizer,
        DictDefault(
            {
                "train_on_inputs": False,
                "sequence_len": 512,
            }
        ),
        DictDefault(
            {
                "chat_template": "qwen3",
                "message_field_role": "role",
                "message_field_content": "content",
                "message_property_mappings": {
                    "role": "role",
                    "content": "content",
                },
                "roles": {
                    "user": ["user"],
                    "assistant": ["assistant"],
                    "system": ["system"],
                },
                "field_messages": "messages",
            }
        ),
    )
    return strategy


class TestQwen3IdenticalConversationArgs:
    """
    Test Qwen3 tools is identical between JSON and dict
    """

    @pytest.fixture(name="conversation_dict_args_dataset")
    def fixture_conversation_dict_args_dataset(self):
        """
        Provides a dataset with conversation where arguments is a dict.
        """
        user_content = "What is the weather in Boston?"
        function_name = "get_current_weather"
        arguments_dict = {"location": "Boston, MA", "unit": "celsius"}

        data = [
            {
                "messages": [
                    {"role": "user", "content": user_content},
                    {
                        "role": "assistant",
                        "content": "",
                        "tool_calls": [
                            {
                                "function": {
                                    "name": function_name,
                                    "arguments": arguments_dict,  # dict
                                }
                            }
                        ],
                    },
                ],
            }
        ]
        return Dataset.from_list(data)

    @pytest.fixture(name="conversation_str_args_dataset")
    def fixture_conversation_str_args_dataset(self):
        """
        Provides a dataset with conversation where arguments is a JSON string.
        """
        user_content = "What is the weather in Boston?"
        function_name = "get_current_weather"
        arguments_dict = {"location": "Boston, MA", "unit": "celsius"}
        arguments_str = json.dumps(arguments_dict)

        data = [
            {
                "messages": [
                    {"role": "user", "content": user_content},
                    {
                        "role": "assistant",
                        "content": "",
                        "tool_calls": [
                            {
                                "function": {
                                    "name": function_name,
                                    "arguments": arguments_str,  # str
                                }
                            }
                        ],
                    },
                ],
            }
        ]
        return Dataset.from_list(data)

    @pytest.fixture(name="conversation_mixed_time_types_dataset")
    def fixture_conversation_mixed_time_types_dataset(self):
        """
        Provides a dataset where 'time' field has different types in different tool calls.
        """
        data = [
            {
                "messages": [
                    {
                        "role": "user",
                        "content": "Get weather information at different times",
                    },
                    {
                        "role": "assistant",
                        "content": "",
                        "tool_calls": [
                            {
                                "function": {
                                    "name": "func1",
                                    "arguments": json.dumps(
                                        {"time": "2025-08-01"}
                                    ),  # string type
                                }
                            },
                            {
                                "function": {
                                    "name": "func2",
                                    "arguments": json.dumps(
                                        {"time": 1690876800}
                                    ),  # number type
                                }
                            },
                        ],
                    },
                ],
            }
        ]
        return Dataset.from_list(data)

    def test_dict_and_str_args_produce_identical_output(
        self,
        conversation_dict_args_dataset,
        conversation_str_args_dataset,
        qwen3_instruct_prompt_strategy,
        qwen3_tokenizer,
    ):
        """
        Tests that after tokenization and decoding, the outputs for both
        dict and string `arguments` are exactly the same.
        """
        processed_dict_args = conversation_dict_args_dataset.map(
            qwen3_instruct_prompt_strategy.tokenize_prompt,
            batched=True,
            remove_columns=["messages"],
        )

        processed_str_args = conversation_str_args_dataset.map(
            qwen3_instruct_prompt_strategy.tokenize_prompt,
            batched=True,
            remove_columns=["messages"],
        )

        decoded_prompt_from_dict = qwen3_tokenizer.decode(
            processed_dict_args[0]["input_ids"]
        )

        decoded_prompt_from_str = qwen3_tokenizer.decode(
            processed_str_args[0]["input_ids"]
        )

        assert decoded_prompt_from_dict == decoded_prompt_from_str, (
            f"Dict format output:\n{decoded_prompt_from_dict}\n"
            f"String format output:\n{decoded_prompt_from_str}"
        )

        assert (
            processed_dict_args[0]["input_ids"] == processed_str_args[0]["input_ids"]
        ), "The tokenized input_ids should be identical for dict and str arguments"

    def test_str_args_with_mixed_time_types_no_error(
        self,
        conversation_mixed_time_types_dataset,
        qwen3_instruct_prompt_strategy,
        qwen3_tokenizer,
    ):
        """
        Tests that when 'time' field has different types (string vs number)
        in different tool calls, str format arguments don't cause errors.
        """
        processed = conversation_mixed_time_types_dataset.map(
            qwen3_instruct_prompt_strategy.tokenize_prompt,
            batched=True,
            remove_columns=["messages"],
        )

        assert len(processed) == 1
        assert "input_ids" in processed[0]
        assert len(processed[0]["input_ids"]) > 0

        decoded = qwen3_tokenizer.decode(processed[0]["input_ids"])
        assert "2025-08-01" in decoded, "String time value should be present"
        assert "1690876800" in decoded, "Number time value should be present"


class TestQwen3IdenticalToolsParameters:
    """
    Test Qwen3 tools parameters handling is identical between JSON string and dict
    """

    @pytest.fixture(name="tools_dict_params_dataset")
    def fixture_tools_dict_params_dataset(self):
        """
        Provides a dataset with tools where parameters is a dict.
        """
        tools = [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Get weather information",
                    "parameters": {
                        "type": "object",
                        "properties": {
                            "location": {
                                "type": "string",
                                "description": "The city and state",
                            },
                            "unit": {
                                "type": "string",
                                "enum": ["celsius", "fahrenheit"],
                            },
                        },
                        "required": ["location"],
                    },
                },
            }
        ]

        data = [
            {
                "tools": tools,
                "messages": [
                    {"role": "user", "content": "What's the weather?"},
                    {
                        "role": "assistant",
                        "content": "",
                        "tool_calls": [
                            {
                                "type": "function",
                                "function": {
                                    "name": "get_weather",
                                    "arguments": {"location": "Boston, MA"},
                                },
                            }
                        ],
                    },
                    {
                        "role": "tool",
                        "name": "get_weather",
                        "content": "72°F and sunny",
                    },
                ],
            }
        ]
        return Dataset.from_list(data)

    @pytest.fixture(name="tools_str_params_dataset")
    def fixture_tools_str_params_dataset(self):
        """
        Provides a dataset with tools where parameters is a JSON string.
        """
        parameters_dict = {
            "type": "object",
            "properties": {
                "location": {"type": "string", "description": "The city and state"},
                "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
            },
            "required": ["location"],
        }

        tools = [
            {
                "type": "function",
                "function": {
                    "name": "get_weather",
                    "description": "Get weather information",
                    "parameters": json.dumps(parameters_dict),
                },
            }
        ]

        data = [
            {
                "tools": tools,
                "messages": [
                    {"role": "user", "content": "What's the weather?"},
                    {
                        "role": "assistant",
                        "content": "",
                        "tool_calls": [
                            {
                                "type": "function",
                                "function": {
                                    "name": "get_weather",
                                    "arguments": {"location": "Boston, MA"},
                                },
                            }
                        ],
                    },
                    {
                        "role": "tool",
                        "name": "get_weather",
                        "content": "72°F and sunny",
                    },
                ],
            }
        ]
        return Dataset.from_list(data)

    @pytest.fixture(name="tools_mixed_type_params_dataset")
    def fixture_tools_mixed_type_params_dataset(self):
        """
        Provides a dataset where different tools have the same parameter name with different types.
        This tests that JSON string format prevents casting issues.
        """
        tools = [
            {
                "type": "function",
                "function": {
                    "name": "tool_with_string_arg",
                    "description": "Tool expecting string argument",
                    "parameters": json.dumps(
                        {
                            "type": "object",
                            "properties": {
                                "arg1": {
                                    "type": "string",
                                    "description": "A string parameter",
                                }
                            },
                            "required": ["arg1"],
                        }
                    ),
                },
            },
            {
                "type": "function",
                "function": {
                    "name": "tool_with_number_arg",
                    "description": "Tool expecting number argument",
                    "parameters": json.dumps(
                        {
                            "type": "object",
                            "properties": {
                                "arg1": {
                                    "type": "number",
                                    "description": "A numeric parameter",
                                }
                            },
                            "required": ["arg1"],
                        }
                    ),
                },
            },
        ]

        data = [
            {
                "tools": tools,
                "messages": [
                    {"role": "user", "content": "Use both tools"},
                    {
                        "role": "assistant",
                        "content": "",
                        "tool_calls": [
                            {
                                "type": "function",
                                "function": {
                                    "name": "tool_with_string_arg",
                                    "arguments": json.dumps({"arg1": "hello"}),
                                },
                            },
                            {
                                "type": "function",
                                "function": {
                                    "name": "tool_with_number_arg",
                                    "arguments": json.dumps({"arg1": 42}),
                                },
                            },
                        ],
                    },
                ],
            }
        ]
        return Dataset.from_list(data)

    def test_dict_and_str_params_produce_equivalent_output(
        self,
        tools_dict_params_dataset,
        tools_str_params_dataset,
        qwen3_instruct_prompt_strategy,
        qwen3_tokenizer,
    ):
        """
        Tests that after tokenization and decoding, the outputs for both
        dict and string `parameters` in tools are semantically equivalent.
        """
        import re

        processed_dict_params = tools_dict_params_dataset.map(
            qwen3_instruct_prompt_strategy.tokenize_prompt,
            batched=True,
            remove_columns=["messages", "tools"],
        )

        processed_str_params = tools_str_params_dataset.map(
            qwen3_instruct_prompt_strategy.tokenize_prompt,
            batched=True,
            remove_columns=["messages", "tools"],
        )

        decoded_dict = qwen3_tokenizer.decode(processed_dict_params[0]["input_ids"])
        decoded_str = qwen3_tokenizer.decode(processed_str_params[0]["input_ids"])

        # Extract the tool JSON from both outputs
        tools_pattern = r"<tools>\n(.*?)\n</tools>"

        dict_tools_match = re.search(tools_pattern, decoded_dict, re.DOTALL)
        str_tools_match = re.search(tools_pattern, decoded_str, re.DOTALL)

        assert dict_tools_match and str_tools_match, (
            "Could not find tools section in output"
        )

        # Parse the JSON and compare as objects (order-independent)
        dict_tools_json = json.loads(dict_tools_match.group(1))
        str_tools_json = json.loads(str_tools_match.group(1))

        # Deep comparison of the tool definitions
        assert dict_tools_json == str_tools_json, (
            f"Tool definitions are not equivalent:\n"
            f"Dict format: {json.dumps(dict_tools_json, indent=2)}\n"
            f"String format: {json.dumps(str_tools_json, indent=2)}"
        )

        # Verify the rest of the structure is the same (excluding the tools JSON part)
        # The tools JSON can have different order, so we remove it here.
        dict_normalized = re.sub(
            r"<tools>.*?</tools>",
            "<tools>TOOLS_PLACEHOLDER</tools>",
            decoded_dict,
            flags=re.DOTALL,
        )
        str_normalized = re.sub(
            r"<tools>.*?</tools>",
            "<tools>TOOLS_PLACEHOLDER</tools>",
            decoded_str,
            flags=re.DOTALL,
        )

        assert dict_normalized == str_normalized, (
            "The overall structure differs between dict and string parameter formats"
        )

    def test_str_params_with_mixed_types_no_error(
        self,
        tools_mixed_type_params_dataset,
        qwen3_instruct_prompt_strategy,
        qwen3_tokenizer,
    ):
        """
        Tests that when different tools have the same parameter name with different types,
        JSON string format for parameters doesn't cause casting errors.
        """
        processed = tools_mixed_type_params_dataset.map(
            qwen3_instruct_prompt_strategy.tokenize_prompt,
            batched=True,
            remove_columns=["messages", "tools"],
        )

        assert len(processed) == 1
        assert "input_ids" in processed[0]
        assert len(processed[0]["input_ids"]) > 0

        decoded = qwen3_tokenizer.decode(processed[0]["input_ids"])

        # Check that both tools are present
        assert "tool_with_string_arg" in decoded
        assert "tool_with_number_arg" in decoded

        # Check that both argument values are present
        assert "hello" in decoded
        assert "42" in decoded


================================================
FILE: tests/prompt_strategies/test_dpo_chat_templates.py
================================================
"""
tests for chat_template prompt strategy
"""

import unittest

import pytest
from datasets import Dataset
from transformers import AutoTokenizer

from axolotl.prompt_strategies.dpo.chat_template import argilla_chat, default
from axolotl.utils.dict import DictDefault

from tests.hf_offline_utils import enable_hf_offline


@pytest.fixture(name="assistant_dataset")
def fixture_assistant_dataset():
    return Dataset.from_list(
        [
            {
                "messages": [
                    {
                        "role": "user",
                        "content": "hello",
                    },
                    {
                        "role": "assistant",
                        "content": "hello",
                    },
                    {
                        "role": "user",
                        "content": "goodbye",
                    },
                ],
                "chosen": {
                    "role": "assistant",
                    "content": "goodbye",
                },
                "rejected": {
                    "role": "assistant",
                    "content": "party on",
                },
            }
        ]
    )


@pytest.fixture(name="custom_assistant_dataset")
def fixture_custom_assistant_dataset():
    return Dataset.from_list(
        [
            {
                "conversation": [
                    {
                        "speaker": "human",
                        "text": "hello",
                    },
                    {
                        "speaker": "agent",
                        "text": "hello",
                    },
                    {
                        "speaker": "human",
                        "text": "goodbye",
                    },
                ],
                "better": {
                    "speaker": "agent",
                    "text": "goodbye",
                },
                "worse": {
                    "speaker": "agent",
                    "text": "party on",
                },
            }
        ]
    )


@pytest.fixture(name="argilla_chat_dataset")
def fixture_argilla_chat_dataset():
    return Dataset.from_list(
        [
            {
                "chosen": [
                    {
                        "role": "user",
                        "content": "hello",
                    },
                    {
                        "role": "assistant",
                        "content": "goodbye",
                    },
                ],
                "rejected": [
                    {
                        "role": "user",
                        "content": "hello",
                    },
                    {
                        "role": "assistant",
                        "content": "party on",
                    },
                ],
            }
        ]
    )


@pytest.fixture(name="phi3_tokenizer")
@enable_hf_offline
def fixture_phi3_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-medium-128k-instruct")

    return tokenizer


@pytest.fixture(name="gemma_tokenizer")
@enable_hf_offline
def fixture_gemma_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("unsloth/gemma-2b-it", revision="703fb4a")

    return tokenizer


class TestAssistantDPOChatTemplateLlama3:
    """
    Test class for assistant style datasets with llama-3 prompts using the chat_template strategy.
    """

    def test_llama3_defaults(self, llama3_tokenizer, assistant_dataset):
        transform_fn, _ = default(
            DictDefault(
                {
                    "chat_template": "llama3",
                    "datasets": [
                        {
                            "type": "chat_template",
                        }
                    ],
                }
            )
        )
        result = transform_fn(assistant_dataset[0], tokenizer=llama3_tokenizer)
        assert result["prompt"] == (
            "<|begin_of_text|>"
            + "<|start_header_id|>user<|end_header_id|>\n\nhello<|eot_id|>"
            + "<|start_header_id|>assistant<|end_header_id|>\n\nhello<|eot_id|>"
            + "<|start_header_id|>user<|end_header_id|>\n\ngoodbye<|eot_id|>"
            + "<|start_header_id|>assistant<|end_header_id|>\n\n"
        )
        assert result["chosen"] == "goodbye<|eot_id|>"
        assert result["rejected"] == "party on<|eot_id|>"

    def test_llama3_configured(self, llama3_tokenizer, custom_assistant_dataset):
        transform_fn, _ = default(
            DictDefault(
                {
                    "chat_template": "llama3",
                    "datasets": [
                        {
                            "type": "chat_template",
                            "field_messages": "conversation",
                            "field_chosen": "better",
                            "field_rejected": "worse",
                            "message_field_role": "speaker",
                            "message_field_content": "text",
                            "roles": {
                                "user": ["human"],
                                "assistant": ["agent"],
                                "system": ["sys"],
                            },
                        }
                    ],
                }
            )
        )
        result = transform_fn(custom_assistant_dataset[0], tokenizer=llama3_tokenizer)
        assert result["prompt"] == (
            "<|begin_of_text|>"
            + "<|start_header_id|>user<|end_header_id|>\n\nhello<|eot_id|>"
            + "<|start_header_id|>assistant<|end_header_id|>\n\nhello<|eot_id|>"
            + "<|start_header_id|>user<|end_header_id|>\n\ngoodbye<|eot_id|>"
            + "<|start_header_id|>assistant<|end_header_id|>\n\n"
        )
        assert result["chosen"] == "goodbye<|eot_id|>"
        assert result["rejected"] == "party on<|eot_id|>"


class TestAssistantDPOChatTemplatePhi3:
    """
    Test class for assistant style datasets with phi-3 prompts using the tokenizer's chat_template strategy.
    """

    def test_phi3_defaults(self, phi3_tokenizer, assistant_dataset):
        transform_fn, _ = default(
            DictDefault(
                {
                    "chat_template": "tokenizer_default",
                    "datasets": [
                        {
                            "type": "chat_template",
                        }
                    ],
                }
            )
        )
        result = transform_fn(assistant_dataset[0], tokenizer=phi3_tokenizer)
        assert result["prompt"] == (
            "<|user|>\nhello<|end|>\n"
            + "<|assistant|>\nhello<|end|>\n"
            + "<|user|>\ngoodbye<|end|>\n"
            + "<|assistant|>\n"
        )
        assert result["chosen"] == "goodbye<|end|>"
        assert result["rejected"] == "party on<|end|>"


class TestAssistantDPOChatTemplateGemma:
    """
    Test class for assistant style datasets with gemma prompts using the tokenizer's chat_template strategy.
    """

    def test_gemma_defaults(self, gemma_tokenizer, assistant_dataset):
        transform_fn, _ = default(
            DictDefault(
                {
                    "chat_template": "tokenizer_default",
                    "datasets": [
                        {
                            "type": "chat_template",
                        }
                    ],
                }
            )
        )
        result = transform_fn(assistant_dataset[0], tokenizer=gemma_tokenizer)
        assert result["prompt"] == (
            "<bos><start_of_turn>user\nhello<end_of_turn>\n"
            + "<start_of_turn>model\nhello<end_of_turn>\n"
            + "<start_of_turn>user\ngoodbye<end_of_turn>\n"
            + "<start_of_turn>model\n"
        )
        assert result["chosen"] == "goodbye<end_of_turn>"
        assert result["rejected"] == "party on<end_of_turn>"


class TestArgillaChatDPOChatTemplate:
    """
    Test class for argilla_chat style datasets (chosen/rejected contain full conversations).
    """

    def test_llama3_argilla_chat(self, llama3_tokenizer, argilla_chat_dataset):
        transform_fn, _ = argilla_chat(
            DictDefault(
                {
                    "chat_template": "llama3",
                    "datasets": [
                        {
                            "type": "chat_template.argilla_chat",
                        }
                    ],
                }
            )
        )
        result = transform_fn(argilla_chat_dataset[0], tokenizer=llama3_tokenizer)
        assert result["prompt"] == (
            "<|begin_of_text|>"
            + "<|start_header_id|>user<|end_header_id|>\n\nhello<|eot_id|>"
            + "<|start_header_id|>assistant<|end_header_id|>\n\n"
        )
        assert result["chosen"] == "goodbye<|eot_id|>"
        assert result["rejected"] == "party on<|eot_id|>"

    def test_phi3_argilla_chat(self, phi3_tokenizer, argilla_chat_dataset):
        transform_fn, _ = argilla_chat(
            DictDefault(
                {
                    "chat_template": "tokenizer_default",
                    "datasets": [
                        {
                            "type": "chat_template.argilla_chat",
                        }
                    ],
                }
            )
        )
        result = transform_fn(argilla_chat_dataset[0], tokenizer=phi3_tokenizer)
        assert result["prompt"] == "<|user|>\nhello<|end|>\n" + "<|assistant|>\n"
        assert result["chosen"] == "goodbye<|end|>"
        assert result["rejected"] == "party on<|end|>"


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/prompt_strategies/test_dpo_chatml.py
================================================
"""
Tests for loading DPO preference datasets with chatml formatting
"""

import unittest

import pytest

from axolotl.loaders.tokenizer import load_tokenizer
from axolotl.prompt_strategies.dpo import load as load_dpo
from axolotl.utils.data.rl import prepare_preference_datasets
from axolotl.utils.dict import DictDefault

from tests.hf_offline_utils import enable_hf_offline


@pytest.fixture(name="minimal_dpo_cfg")
def fixture_cfg():
    return DictDefault(
        {
            "base_model": "HuggingFaceTB/SmolLM2-135M",
            "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
            "rl": "dpo",
            "learning_rate": 0.000001,
            "micro_batch_size": 1,
            "gradient_accumulation_steps": 1,
            "special_tokens": {
                "pad_token": "<|endoftext|>",
            },
            "sequence_len": 2048,
        }
    )


class TestDPOChatml:
    """
    Test loading DPO preference datasets with chatml formatting
    """

    @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
    @enable_hf_offline
    def test_default(self, minimal_dpo_cfg):
        cfg = DictDefault(
            {
                "datasets": [
                    {
                        "path": "argilla/distilabel-intel-orca-dpo-pairs",
                        "type": "chatml",
                        "split": "train[:1%]",
                    }
                ]
            }
            | minimal_dpo_cfg
        )

        # test that dpo.load works
        load_dpo("chatml", cfg)
        # now actually load the datasets with the strategy
        tokenizer = load_tokenizer(cfg)
        train_ds, _ = prepare_preference_datasets(cfg, tokenizer)
        assert train_ds[0]["prompt"].startswith("<|im_start|>")
        assert train_ds[0]["prompt"].endswith("<|im_start|>assistant\n")
        assert "chosen" in train_ds[0]
        assert "rejected" in train_ds[0]


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/prompt_strategies/test_jinja_template_analyzer.py
================================================
"""
tests for jinja_template_analyzer
"""

import pytest

from axolotl.prompt_strategies.jinja_template_analyzer import JinjaTemplateAnalyzer
from axolotl.utils.logging import get_logger

LOG = get_logger(__name__, log_level="DEBUG")


class TestJinjaTemplateAnalyzer:
    """
    tests for jinja_template_analyzer
    """

    def test_basic_variable_extraction(self, basic_jinja_template_analyzer):
        """Test that all top-level variables are correctly extracted."""
        LOG.info("Testing with train_on_inputs=True")

        variables = basic_jinja_template_analyzer.get_template_variables()
        expected_vars = {"messages", "add_generation_prompt", "eos_token", "message"}
        assert set(variables.keys()) == expected_vars

    def test_mixtral_variable_extraction(self, mistral_jinja_template_analyzer):
        """Test that all top-level variables are correctly extracted."""
        LOG.info("Testing with train_on_inputs=True")

        variables = mistral_jinja_template_analyzer.get_template_variables()
        expected_vars = {
            "messages",
            "content",
            "eos_token",
            "message",
            "tools",
            "system_message",
            "loop_messages",
            "ns",
            "tool_call",
            "tool",
            "loop",
            "bos_token",
            "raise_exception",
        }
        assert set(variables.keys()) == expected_vars
        message_vars = variables["message"]
        assert message_vars == {"role", "content", "tool_calls", "tool_call_id"}

    def test_message_property_access(self, basic_jinja_template_analyzer):
        """Test that properties accessed on 'message' variable are correctly identified."""
        LOG.info("Testing message property access")

        variables = basic_jinja_template_analyzer.get_template_variables()
        assert "messages" in variables
        assert "message" in variables
        assert "role" in variables["message"]
        assert "content" in variables["message"]

    def test_detailed_analysis(self, basic_jinja_template_analyzer):
        """Test the detailed analysis of variable usage."""
        LOG.info("Testing detailed analysis")

        analysis = basic_jinja_template_analyzer.analyze_template()

        assert analysis["messages"]["is_iterated"] is True
        assert "role" in analysis["message"]["accessed_properties"]
        assert "content" in analysis["message"]["accessed_properties"]

        assert analysis["add_generation_prompt"]["is_conditional"] is True
        assert len(analysis["add_generation_prompt"]["accessed_properties"]) == 0

        assert not analysis["eos_token"]["is_iterated"]
        assert len(analysis["eos_token"]["accessed_properties"]) == 0

    def test_nested_property_access(self):
        """Test handling of nested property access."""
        LOG.info("Testing nested property access")

        template = """{{ user.profile.name }}{{ user.settings['preference'] }}"""
        analyzer = JinjaTemplateAnalyzer(template)
        variables = analyzer.get_template_variables()

        assert "user" in variables
        assert "profile" in variables["user"]
        assert "settings" in variables["user"]

    def test_loop_variable_handling(self):
        """Test handling of loop variables and their properties."""
        LOG.info("Testing loop variable handling")

        template = """
        {% for item in items %}
            {{ item.name }}
            {% for subitem in item.subitems %}
                {{ subitem.value }}
            {% endfor %}
        {% endfor %}
        """
        analyzer = JinjaTemplateAnalyzer(template)
        analysis = analyzer.analyze_template()

        assert analysis["items"]["is_iterated"]
        assert "name" in analysis["item"]["accessed_properties"]
        assert "subitems" in analysis["item"]["accessed_properties"]

    def test_conditional_variable_usage(self):
        """Test detection of variables used in conditional statements."""
        LOG.info("Testing conditional variable usage")

        template = """
        {% if user.is_admin and config.debug_mode %}
            {{ debug_info }}
        {% endif %}
        """
        analyzer = JinjaTemplateAnalyzer(template)
        analysis = analyzer.analyze_template()

        assert analysis["user"]["is_conditional"]
        assert analysis["config"]["is_conditional"]
        assert "is_admin" in analysis["user"]["accessed_properties"]
        assert "debug_mode" in analysis["config"]["accessed_properties"]

    def test_complex_expressions(self):
        """Test handling of complex expressions and filters."""
        LOG.info("Testing complex expressions and filters")

        template = """
        {{ user.name | upper }}
        {{ messages | length > 0 and messages[0].content }}
        {{ data['key'].nested['value'] }}
        """
        analyzer = JinjaTemplateAnalyzer(template)
        variables = analyzer.get_template_variables()

        assert "user" in variables
        assert "name" in variables["user"]
        assert "messages" in variables
        assert "content" in variables["messages"]
        assert "data" in variables

    def test_basic_msg_vars(self, basic_jinja_template_analyzer):
        """Test that the basic message variables are correctly identified."""
        LOG.info("Testing basic message variables")

        variables = basic_jinja_template_analyzer.get_message_vars()
        assert variables == {"role", "content"}

    def test_mixtral_msg_vars(self, mistral_jinja_template_analyzer):
        """Test that the mixtral message variables are correctly identified."""
        LOG.info("Testing mixtral message variables")

        variables = mistral_jinja_template_analyzer.get_message_vars()
        assert variables == {"role", "content", "tool_calls", "tool_call_id"}


if __name__ == "__main__":
    pytest.main([__file__])


================================================
FILE: tests/prompt_strategies/test_raw_io.py
================================================
"""
Test module for raw i/o data for prompts
"""

import pytest
from datasets import Dataset
from tokenizers import AddedToken
from transformers import AutoTokenizer

from axolotl.datasets import TokenizedPromptDataset
from axolotl.prompt_strategies.input_output import (
    RawInputOutputPrompter,
    RawInputOutputStrategy,
)


@pytest.fixture(name="segments_dataset")
def fixture_sharegpt_dataset():
    return Dataset.from_list(
        [
            {
                "segments": [
                    {
                        "label": False,
                        "text": "<s>hello ",
                    },
                    {
                        "label": True,
                        "text": "hi there.<eot>",
                    },
                    {
                        "label": False,
                        "text": "goodbye ",
                    },
                    {
                        "label": True,
                        "text": "farewell<eot>",
                    },
                ]
            }
        ]
    )


@pytest.fixture(name="tokenizer")
def fixture_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained(
        "casperhansen/mistral-7b-instruct-v0.1-awq"
    )
    tokenizer.add_tokens(
        [
            AddedToken("<eot>", rstrip=False, lstrip=False, normalized=False),
        ]
    )

    return tokenizer


class TestRawInputOutputPrompts:
    """
    Test class for raw i/o prompter
    """

    def test_segment_prompts(self, segments_dataset, tokenizer):
        strategy = RawInputOutputStrategy(
            RawInputOutputPrompter(),
            tokenizer,
            False,  # train_on_inputs
            2048,  # sequence_len
        )

        dataset_wrapper = TokenizedPromptDataset(
            strategy, segments_dataset, process_count=1
        )

        input_ids = dataset_wrapper[0]["input_ids"]
        labels = dataset_wrapper[0]["labels"]

        assert (
            tokenizer.decode(input_ids)
            == "<s> hello  hi there.<eot> goodbye  farewell<eot>"
        )
        # fmt: off
        assert input_ids == [
            1,  # <s>
            6312,  # hell
            28709,  # o
            28705,  #
            12014,  # hi
            736,  # there
            28723,  # .
            32000,  # <eot>
            1179,  # good
            17664,  # bye
            28705,  #
            19111,  # fare
            5458,  # well
            32000,  # <eot>
        ]
        # fmt: on

        # fmt: off
        assert labels == [
            -100,  # <s>
            -100,  # hell
            -100,  # o
            -100,  #
            12014,  # hi
            736,  # there
            28723,  # .
            32000,  # <eot>
            -100,  # good
            -100,  # bye
            -100,  #
            19111,  # fare
            5458,  # well
            32000,  # <eot>
        ]
        # fmt: on


================================================
FILE: tests/prompt_strategies/test_stepwise.py
================================================
"""
tests for chat_template prompt strategy
"""

import datasets
import pytest
from datasets import Dataset
from transformers import AutoTokenizer

from axolotl.datasets import TokenizedPromptDataset
from axolotl.prompt_strategies.stepwise_supervised import (
    StepwiseSupervisedPromptTokenizingStrategy,
)


class TestStepWiseSupervisedPromptTokenizingStrategy:
    """
    Test class for stepwise supervised prompt strategy
    """

    @pytest.fixture()
    def stepwise_supervised_dataset(self):
        return Dataset.from_list(
            [
                {
                    "prompt": "Which number is larger, 9.8 or 9.11?",
                    "completions": [
                        "The fractional part of 9.8 is 0.8, while the fractional part of 9.11 is 0.11.",
                        "Since 0.11 is greater than 0.8, the number 9.11 is larger than 9.8.",
                        "Actually, this is incorrect. In decimal numbers, 0.8 is equal to 0.80, which is larger than 0.11. Therefore, 9.8 is larger than 9.11.",
                    ],
                    "labels": [True, False, False],
                }
            ]
        )

    @pytest.fixture()
    def tokenizer(self):
        return AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")

    def test_stepwise_supervised_dataset(self, tokenizer, stepwise_supervised_dataset):
        strategy = StepwiseSupervisedPromptTokenizingStrategy(
            tokenizer,
            sequence_len=2048,
            step_separator="\n",
        )
        stepwise_supervised_dataset = stepwise_supervised_dataset.cast_column(
            "labels", datasets.Sequence(datasets.Value("int64"))
        )
        dataset_wrapper = TokenizedPromptDataset(
            strategy,
            stepwise_supervised_dataset,
            process_count=1,
        )
        labels = dataset_wrapper[0]["labels"]
        # expected labels is:
        # the prompt + first step are ignored, followed by the label for step 1 (True)
        # the second step, and its label (False)
        # the third step, and its label (False)
        expected = [-100] * 47 + [1] + [-100] * 29 + [0] + [-100] * 48 + [0]

        assert labels == expected


================================================
FILE: tests/telemetry/__init__.py
================================================


================================================
FILE: tests/telemetry/conftest.py
================================================
"""Shared pytest fixtures for telemetry tests."""

import pytest


@pytest.fixture(autouse=True)
def del_track_env(monkeypatch):
    monkeypatch.delenv("AXOLOTL_DO_NOT_TRACK", raising=False)
    yield


================================================
FILE: tests/telemetry/test_callbacks.py
================================================
"""Tests for telemetry callback module."""

# pylint: disable=redefined-outer-name

import time
from unittest.mock import MagicMock, patch

import pytest
from transformers import TrainerControl, TrainerState, TrainingArguments

from axolotl.telemetry.callbacks import TIME_SINCE_LAST, TelemetryCallback


def calc_expected_metrics(step, last_step, current_time, last_time, start_time=900.0):
    """Calculate expected metrics values for tests"""
    time_diff = current_time - last_time
    step_diff = step - last_step
    return {
        "steps_per_second": (
            step_diff / time_diff if time_diff > 0 and step_diff > 0 else 0
        ),
        "time_since_last_report": time_diff,
        "elapsed_time": current_time - start_time,
    }


@pytest.fixture
def mock_time():
    """Mock time.time() to have predictable values in tests"""
    with patch("axolotl.telemetry.callbacks.time") as mock_time:
        mock_time.time.return_value = 1000.0
        yield mock_time


@pytest.fixture
def mock_telemetry_manager():
    """Create a mock TelemetryManager"""
    with patch("axolotl.telemetry.callbacks.TelemetryManager") as mock_manager_class:
        mock_manager = MagicMock()
        mock_manager_class.get_instance.return_value = mock_manager
        yield mock_manager


@pytest.fixture
def mock_runtime_metrics_tracker():
    """Create a mock RuntimeMetricsTracker"""
    with patch(
        "axolotl.telemetry.callbacks.RuntimeMetricsTracker"
    ) as mock_tracker_class:
        mock_tracker = MagicMock()
        # Set up metrics property on the tracker
        mock_metrics = MagicMock()
        mock_metrics.to_dict.return_value = {
            "total_steps": 100,
            "peak_cpu_memory_bytes": 1024,
        }
        mock_tracker.metrics = mock_metrics

        # Make the constructor return our mock
        mock_tracker_class.return_value = mock_tracker
        yield mock_tracker


@pytest.fixture
def training_args():
    """Create a minimal TrainingArguments instance"""
    return TrainingArguments(output_dir="./output")


@pytest.fixture
def trainer_state():
    """Create a mock TrainerState"""
    state = MagicMock(spec=TrainerState)
    state.global_step = 10
    state.epoch = 0.5  # halfway through first epoch
    state.log_history = [{"loss": 2.5, "learning_rate": 5e-5}]
    return state


@pytest.fixture
def trainer_control():
    """Create a mock TrainerControl"""
    return MagicMock(spec=TrainerControl)


# pylint: disable=unused-argument
@pytest.fixture
def callback(mock_telemetry_manager, mock_runtime_metrics_tracker):
    """Create a TelemetryCallback instance with mocked dependencies"""
    return TelemetryCallback()


class TestTelemetryCallback:
    """Tests for the TelemetryCallback class."""

    def test_initialization(self, callback, mock_runtime_metrics_tracker):
        """Test callback initialization."""
        assert callback.current_epoch == -1
        assert callback.tracker == mock_runtime_metrics_tracker
        assert callback.last_report_step == 0
        assert hasattr(callback, "start_time")
        assert hasattr(callback, "last_report_time")
        assert callback.report_interval_steps == 100

    def test_on_train_begin(
        self,
        callback,
        mock_telemetry_manager,
        training_args,
        trainer_state,
        trainer_control,
    ):
        """Test on_train_begin sends expected event."""
        callback.on_train_begin(training_args, trainer_state, trainer_control)

        mock_telemetry_manager.send_event.assert_called_once_with(
            event_type="train-start"
        )

    def test_on_train_end(
        self,
        callback,
        mock_telemetry_manager,
        training_args,
        trainer_state,
        trainer_control,
    ):
        """Test on_train_end sends expected event with metrics."""
        callback.on_train_end(training_args, trainer_state, trainer_control)

        mock_telemetry_manager.send_event.assert_called_once()
        call_args = mock_telemetry_manager.send_event.call_args[1]

        assert call_args["event_type"] == "train-end"
        assert "loss" in call_args["properties"]
        assert call_args["properties"]["loss"] == 2.5
        assert "learning_rate" in call_args["properties"]
        assert call_args["properties"]["learning_rate"] == 5e-5

        # Check that metrics from RuntimeMetricsTracker are included
        assert "total_steps" in call_args["properties"]
        assert call_args["properties"]["total_steps"] == 100
        assert "peak_cpu_memory_bytes" in call_args["properties"]
        assert call_args["properties"]["peak_cpu_memory_bytes"] == 1024

    def test_on_epoch_begin(
        self,
        callback,
        mock_runtime_metrics_tracker,
        training_args,
        trainer_state,
        trainer_control,
    ):
        """Test on_epoch_begin updates epoch counter and calls tracker."""
        initial_epoch = callback.current_epoch

        callback.on_epoch_begin(training_args, trainer_state, trainer_control)

        assert callback.current_epoch == initial_epoch + 1
        mock_runtime_metrics_tracker.start_epoch.assert_called_once_with(
            initial_epoch + 1
        )

    def test_on_epoch_end(
        self,
        callback,
        mock_runtime_metrics_tracker,
        training_args,
        trainer_state,
        trainer_control,
    ):
        """Test on_epoch_end calls tracker."""
        # Set current epoch
        callback.current_epoch = 2

        callback.on_epoch_end(training_args, trainer_state, trainer_control)

        mock_runtime_metrics_tracker.end_epoch.assert_called_once_with(2)

    def test_on_step_end_no_report(
        self,
        callback,
        mock_telemetry_manager,
        mock_runtime_metrics_tracker,
        training_args,
        trainer_state,
        trainer_control,
    ):
        """Test on_step_end updates tracker but doesn't report if criteria not met."""
        # Set up state to avoid reporting
        trainer_state.global_step = 42  # Not divisible by report_interval_steps
        callback.last_report_step = 41  # Just 1 step since last report
        callback.last_report_time = time.time()  # Just now

        callback.on_step_end(training_args, trainer_state, trainer_control)

        # Should update tracker
        mock_runtime_metrics_tracker.update_step.assert_called_once_with(42)

        # Should not send telemetry
        mock_telemetry_manager.send_event.assert_not_called()

        # Should not update last report time/step
        assert callback.last_report_step == 41

    def test_on_step_end_report_interval_steps(
        self,
        callback,
        mock_telemetry_manager,
        mock_runtime_metrics_tracker,
        mock_time,
        training_args,
        trainer_state,
        trainer_control,
    ):
        """Test on_step_end reports when step interval is reached."""
        # Set up state with clear values
        current_step = 100  # Exactly matches report_interval_steps
        last_step = 0
        start_time = 900.0
        current_time = 1000.0
        time_diff = current_time - start_time  # 100 seconds

        # Configure state and callback
        trainer_state.global_step = current_step
        callback.report_interval_steps = 100
        callback.last_report_step = last_step
        callback.start_time = start_time
        callback.last_report_time = start_time

        # Mock time.time() to return consistent values
        mock_time.time.return_value = current_time

        callback.on_step_end(training_args, trainer_state, trainer_control)

        # Should update tracker
        mock_runtime_metrics_tracker.update_step.assert_called_once_with(current_step)
        mock_runtime_metrics_tracker.update_memory_metrics.assert_called_once()

        # Should send telemetry
        mock_telemetry_manager.send_event.assert_called_once()
        call_args = mock_telemetry_manager.send_event.call_args[1]
        assert call_args["event_type"] == "train-progress"

        # Properties should include expected values
        props = call_args["properties"]
        assert props["step"] == current_step
        assert props["elapsed_time"] == time_diff  # 1000 - 900 = 100
        assert props["time_since_last_report"] == time_diff  # 1000 - 900 = 100
        assert props["steps_per_second"] == 1.0  # 100 steps / 100 seconds

        # Should update last report time/step
        assert callback.last_report_step == current_step
        assert callback.last_report_time == current_time

    def test_on_step_end_report_time_elapsed(
        self,
        callback,
        mock_telemetry_manager,
        mock_runtime_metrics_tracker,  # pylint: disable=unused-argument
        mock_time,
        training_args,
        trainer_state,
        trainer_control,
    ):
        """Test on_step_end reports when enough time has elapsed."""
        # Set up state with clear values
        current_step = 120
        last_step = 10
        start_time = 900.0
        current_time = 1000.0
        time_diff = TIME_SINCE_LAST + 1  # Just over the threshold

        # Configure state and callback
        trainer_state.global_step = current_step
        callback.report_interval_steps = 100
        callback.last_report_step = last_step
        callback.start_time = start_time
        callback.last_report_time = current_time - time_diff

        # Mock time.time() to return consistent values
        mock_time.time.return_value = current_time

        callback.on_step_end(training_args, trainer_state, trainer_control)

        # Should send telemetry
        mock_telemetry_manager.send_event.assert_called_once()

        # Properties should include expected values
        props = mock_telemetry_manager.send_event.call_args[1]["properties"]
        expected_metrics = calc_expected_metrics(
            current_step, last_step, current_time, current_time - time_diff, start_time
        )
        assert props["steps_per_second"] == expected_metrics["steps_per_second"]
        assert (
            props["time_since_last_report"]
            == expected_metrics["time_since_last_report"]
        )

    def test_on_step_end_first_step(
        self,
        callback,
        mock_telemetry_manager,
        mock_runtime_metrics_tracker,  # pylint: disable=unused-argument
        mock_time,
        training_args,
        trainer_state,
        trainer_control,
    ):
        """Test on_step_end always reports on first step."""
        # Set up state with clear values
        current_step = 1  # First step
        last_step = 0
        start_time = 900.0
        current_time = 1000.0
        last_report_time = 999.0  # Just 1 second ago

        # Configure state and callback
        trainer_state.global_step = current_step
        callback.report_interval_steps = 100
        callback.last_report_step = last_step
        callback.start_time = start_time
        callback.last_report_time = last_report_time

        # Mock time.time() to return consistent values
        mock_time.time.return_value = current_time

        callback.on_step_end(training_args, trainer_state, trainer_control)

        # Should send telemetry even though not much time has passed
        mock_telemetry_manager.send_event.assert_called_once()

        # Properties should include expected values for first step
        props = mock_telemetry_manager.send_event.call_args[1]["properties"]
        assert props["step"] == current_step
        expected_metrics = calc_expected_metrics(
            current_step, last_step, current_time, last_report_time, start_time
        )
        assert props["steps_per_second"] == expected_metrics["steps_per_second"]

    def test_log_history_empty(
        self,
        callback,
        mock_telemetry_manager,
        mock_runtime_metrics_tracker,  # pylint: disable=unused-argument
        mock_time,
        training_args,
        trainer_state,
        trainer_control,
    ):
        """Test handling of empty log history."""
        # Set up state with clear values
        current_step = 1
        start_time = 900.0
        current_time = 1000.0

        # Configure state and callback
        trainer_state.global_step = current_step
        trainer_state.log_history = []
        callback.start_time = start_time

        # Mock time.time() to return consistent values
        mock_time.time.return_value = current_time

        callback.on_step_end(training_args, trainer_state, trainer_control)

        # Should still send telemetry
        mock_telemetry_manager.send_event.assert_called_once()

        # Properties should have default values for missing log data
        props = mock_telemetry_manager.send_event.call_args[1]["properties"]
        assert props["loss"] == 0
        assert props["learning_rate"] == 0


================================================
FILE: tests/telemetry/test_errors.py
================================================
"""Tests for telemetry error utilities"""

# pylint: disable=redefined-outer-name

from unittest.mock import MagicMock, patch

import pytest

from axolotl.telemetry.errors import sanitize_stack_trace, send_errors


@pytest.fixture(autouse=True)
def reset_error_flag(monkeypatch):
    """Reset ERROR_HANDLED flag using monkeypatch"""
    import axolotl.telemetry.errors

    monkeypatch.setattr(axolotl.telemetry.errors, "ERROR_HANDLED", False)
    yield
    monkeypatch.setattr(axolotl.telemetry.errors, "ERROR_HANDLED", False)


@pytest.fixture
def example_stack_trace():
    """Provide a sample stack trace with mixed paths"""
    return """Traceback (most recent call last):
  File "/home/user/.local/lib/python3.9/site-packages/axolotl/cli/train.py", line 83, in main
    trainer = get_trainer(cfg)
  File "/home/user/.local/lib/python3.9/site-packages/axolotl/train.py", line 214, in get_trainer
    model = get_model(cfg, tokenizer)
  File "/home/user/.local/lib/python3.9/site-packages/axolotl/utils/models.py", line 120, in get_model
    raise ValueError("Model path not found")
ValueError: Model path not found
"""


@pytest.fixture
def windows_stack_trace():
    """Provide a sample stack trace with Windows paths"""
    return """Traceback (most recent call last):
  File "C:\\Users\\name\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\axolotl\\cli\\train.py", line 83, in main
    trainer = get_trainer(cfg)
  File "C:\\Users\\name\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\axolotl\\train.py", line 214, in get_trainer
    model = get_model(cfg, tokenizer)
  File "C:\\Users\\name\\AppData\\Local\\Programs\\Python\\Python39\\lib\\site-packages\\transformers\\models\\auto\\modeling_auto.py", line 482, in from_pretrained
    raise ValueError(f"Unrecognized configuration class {config.__class__}")
ValueError: Unrecognized configuration class <class 'transformers.models.llama.configuration_llama.LlamaConfig'>
"""


@pytest.fixture
def mixed_stack_trace():
    """Provide a sample stack trace with both axolotl and non-axolotl paths"""
    return """Traceback (most recent call last):
  File "/home/user/.local/lib/python3.9/site-packages/axolotl/cli/train.py", line 83, in main
    trainer = get_trainer(cfg)
  File "/home/user/.local/lib/python3.9/site-packages/transformers/trainer.py", line 520, in train
    self._inner_training_loop()
  File "/home/user/.local/lib/python3.9/site-packages/axolotl/utils/trainer.py", line 75, in _inner_training_loop
    super()._inner_training_loop()
  File "/home/user/.local/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 631, in __next__
    data = self._next_data()
RuntimeError: CUDA out of memory
"""


@pytest.fixture
def venv_stack_trace():
    """Provide a sample stack trace with virtual environment paths"""
    return """Traceback (most recent call last):
  File "/home/user/venv/lib/python3.9/site-packages/transformers/trainer.py", line 1729, in train
    self._inner_training_loop()
  File "/home/user/venv/lib/python3.9/site-packages/transformers/trainer.py", line 2013, in _inner_training_loop
    self.accelerator.backward(loss)
  File "/home/user/venv/lib/python3.9/site-packages/accelerate/accelerator.py", line 1851, in backward
    self.scaler.scale(loss).backward(**kwargs)
  File "/home/user/venv/lib/python3.9/site-packages/torch/_tensor.py", line 487, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
RuntimeError: CUDA out of memory
"""


@pytest.fixture
def dist_packages_stack_trace():
    """Provide a sample stack trace with dist-packages paths"""
    return """Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py", line 631, in __next__
    data = self._next_data()
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/dataloader.py", line 675, in _next_data
    data = self._dataset_fetcher.fetch(index)
  File "/usr/local/lib/python3.8/dist-packages/torch/utils/data/_utils/fetch.py", line 51, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "/usr/local/lib/python3.8/dist-packages/datasets/arrow_dataset.py", line 2808, in __getitem__
    raise IndexError(f"Index {key} out of range for dataset of length {len(self)}.")
IndexError: Index 10000 out of range for dataset of length 9832.
"""


@pytest.fixture
def project_stack_trace():
    """Provide a sample stack trace from a project directory (not a virtual env)"""
    return """Traceback (most recent call last):
  File "/home/user/projects/myproject/run.py", line 25, in <module>
    main()
  File "/home/user/projects/myproject/src/cli.py", line 45, in main
    app.run()
  File "/home/user/projects/myproject/src/app.py", line 102, in run
    raise ValueError("Configuration missing")
ValueError: Configuration missing
"""


def test_sanitize_stack_trace(example_stack_trace):
    """Test that sanitize_stack_trace properly preserves axolotl paths"""
    sanitized = sanitize_stack_trace(example_stack_trace)

    # Check that personal paths are removed
    assert "/home/user" not in sanitized
    assert ".local/lib/python3.9" not in sanitized

    # Check that site-packages is preserved
    assert "site-packages/axolotl/cli/train.py" in sanitized
    assert "site-packages/axolotl/train.py" in sanitized
    assert "site-packages/axolotl/utils/models.py" in sanitized

    # Check that error message is preserved
    assert "ValueError: Model path not found" in sanitized


def test_sanitize_windows_paths(windows_stack_trace):
    """Test that sanitize_stack_trace handles Windows paths"""
    sanitized = sanitize_stack_trace(windows_stack_trace)

    # Check that personal paths are removed
    assert "C:\\Users\\name" not in sanitized
    assert "AppData\\Local\\Programs\\Python" not in sanitized

    # Check that both axolotl and transformers packages are preserved
    assert (
        "site-packages\\axolotl\\cli\\train.py" in sanitized
        or "site-packages/axolotl/cli/train.py" in sanitized
    )
    assert (
        "site-packages\\axolotl\\train.py" in sanitized
        or "site-packages/axolotl/train.py" in sanitized
    )
    assert (
        "site-packages\\transformers\\models\\auto\\modeling_auto.py" in sanitized
        or "site-packages/transformers/models/auto/modeling_auto.py" in sanitized
    )

    # Check that error message is preserved
    assert "ValueError: Unrecognized configuration class" in sanitized


def test_sanitize_mixed_paths(mixed_stack_trace):
    """Test that sanitize_stack_trace preserves all package paths"""
    sanitized = sanitize_stack_trace(mixed_stack_trace)

    # Check that all package paths are preserved
    assert "site-packages/axolotl/cli/train.py" in sanitized
    assert "site-packages/transformers/trainer.py" in sanitized
    assert "site-packages/axolotl/utils/trainer.py" in sanitized
    assert "site-packages/torch/utils/data/dataloader.py" in sanitized

    # Check that error message is preserved
    assert "RuntimeError: CUDA out of memory" in sanitized


def test_sanitize_venv_paths(venv_stack_trace):
    """Test that sanitize_stack_trace preserves virtual environment package paths"""
    sanitized = sanitize_stack_trace(venv_stack_trace)

    # Check that personal paths are removed
    assert "/home/user/venv" not in sanitized

    # Check that all package paths are preserved
    assert "site-packages/transformers/trainer.py" in sanitized
    assert "site-packages/accelerate/accelerator.py" in sanitized
    assert "site-packages/torch/_tensor.py" in sanitized

    # Check that error message is preserved
    assert "RuntimeError: CUDA out of memory" in sanitized


def test_sanitize_dist_packages(dist_packages_stack_trace):
    """Test that sanitize_stack_trace preserves dist-packages paths"""
    sanitized = sanitize_stack_trace(dist_packages_stack_trace)

    # Check that system paths are removed
    assert "/usr/local/lib/python3.8" not in sanitized

    # Check that all package paths are preserved
    assert "dist-packages/torch/utils/data/dataloader.py" in sanitized
    assert "dist-packages/torch/utils/data/_utils/fetch.py" in sanitized
    assert "dist-packages/datasets/arrow_dataset.py" in sanitized

    # Check that error message is preserved
    assert (
        "IndexError: Index 10000 out of range for dataset of length 9832." in sanitized
    )


def test_sanitize_project_paths(project_stack_trace):
    """Test handling of project paths (non-virtual env)"""
    sanitized = sanitize_stack_trace(project_stack_trace)

    # Check that personal paths are removed
    assert "/home/user/projects" not in sanitized

    # For non-package paths, we should at least preserve the filename
    assert "run.py" in sanitized
    assert "cli.py" in sanitized
    assert "app.py" in sanitized

    # Check that error message is preserved
    assert "ValueError: Configuration missing" in sanitized


@pytest.fixture
def mock_telemetry_manager():
    """Create a mock TelemetryManager"""
    with patch("axolotl.telemetry.errors.TelemetryManager") as mock_manager_class:
        mock_manager = MagicMock()
        mock_manager.enabled = True
        mock_manager_class.get_instance.return_value = mock_manager
        yield mock_manager


def test_send_errors_successful_execution(mock_telemetry_manager):
    """Test that send_errors doesn't send telemetry for successful function execution"""

    @send_errors
    def test_func():
        return "success"

    result = test_func()
    assert result == "success"
    mock_telemetry_manager.send_event.assert_not_called()


def test_send_errors_with_exception(mock_telemetry_manager):
    """Test that send_errors sends telemetry when an exception occurs"""
    test_error = ValueError("Test error")

    @send_errors
    def test_func():
        raise test_error

    with pytest.raises(ValueError) as excinfo:
        test_func()

    assert excinfo.value == test_error
    mock_telemetry_manager.send_event.assert_called_once()

    # Check that the error info was passed correctly
    call_args = mock_telemetry_manager.send_event.call_args[1]
    assert "test_func-error" in call_args["event_type"]
    assert "Test error" in call_args["properties"]["exception"]
    assert "stack_trace" in call_args["properties"]


def test_send_errors_nested_calls(mock_telemetry_manager):
    """Test that send_errors only sends telemetry once for nested decorated functions"""

    @send_errors
    def inner_func():
        raise ValueError("Inner error")

    @send_errors
    def outer_func():
        return inner_func()

    with pytest.raises(ValueError):
        outer_func()

    # Telemetry should be sent only once for the inner function
    assert mock_telemetry_manager.send_event.call_count == 1
    call_args = mock_telemetry_manager.send_event.call_args[1]
    assert "inner_func-error" in call_args["event_type"]


def test_send_errors_telemetry_disable():
    """Test that send_errors doesn't attempt to send telemetry when disabled"""

    with patch("axolotl.telemetry.errors.TelemetryManager") as mock_manager_class:
        mock_manager = MagicMock()
        mock_manager.enabled = False
        mock_manager_class.get_instance.return_value = mock_manager

        @send_errors
        def test_func():
            raise ValueError("Test error")

        with pytest.raises(ValueError):
            test_func()

        mock_manager.send_event.assert_not_called()


def test_error_handled_reset():
    """Test that ERROR_HANDLED flag is properly reset"""
    with patch("axolotl.telemetry.errors.TelemetryManager") as mock_manager_class:
        # Create and configure the mock manager
        mock_manager = MagicMock()
        mock_manager.enabled = True
        mock_manager_class.get_instance.return_value = mock_manager

        from axolotl.telemetry.errors import ERROR_HANDLED

        @send_errors
        def test_func():
            raise ValueError("Test error")

        assert not ERROR_HANDLED

        with pytest.raises(ValueError):
            test_func()

        from axolotl.telemetry.errors import ERROR_HANDLED

        assert ERROR_HANDLED


def test_module_path_resolution(mock_telemetry_manager):
    """Test that the module path is correctly resolved for the event type"""
    import inspect

    current_module = inspect.getmodule(test_module_path_resolution).__name__

    @send_errors
    def test_func():
        raise ValueError("Test error")

    with pytest.raises(ValueError):
        test_func()

    assert mock_telemetry_manager.send_event.called
    event_type = mock_telemetry_manager.send_event.call_args[1]["event_type"]

    expected_event_type = f"{current_module}.test_func-error"
    assert expected_event_type == event_type


================================================
FILE: tests/telemetry/test_manager.py
================================================
"""Tests for TelemetryManager class and utilities"""

# pylint: disable=redefined-outer-name,protected-access

import os
from unittest.mock import patch

import pytest
import yaml

from axolotl.telemetry.manager import TelemetryManager


@pytest.fixture
def mock_whitelist(tmp_path):
    """Create a temporary whitelist file for testing"""
    whitelist_content = {
        "organizations": ["meta-llama", "mistralai"],
    }
    whitelist_file = tmp_path / "whitelist.yaml"
    with open(whitelist_file, "w", encoding="utf-8") as f:
        yaml.dump(whitelist_content, f)

    return str(whitelist_file)


@pytest.fixture
def telemetry_manager_class():
    """Reset the TelemetryManager singleton between tests"""
    original_instance = TelemetryManager._instance
    original_initialized = TelemetryManager._initialized
    TelemetryManager._instance = None
    TelemetryManager._initialized = False
    yield TelemetryManager
    TelemetryManager._instance = original_instance
    TelemetryManager._initialized = original_initialized


@pytest.fixture
def manager(telemetry_manager_class, mock_whitelist):
    """Create a TelemetryManager instance with mocked dependencies"""
    with (
        patch("posthog.capture"),
        patch("posthog.flush"),
        patch("time.sleep"),
        patch("axolotl.telemetry.manager.WHITELIST_PATH", mock_whitelist),
        patch.dict(os.environ, {"RANK": "0"}),
    ):
        manager = telemetry_manager_class()
        # Manually enable for most tests
        manager.enabled = True
        return manager


def test_singleton_instance(telemetry_manager_class):
    """Test that TelemetryManager is a singleton"""
    with (
        patch("posthog.capture"),
        patch("time.sleep"),
        patch.dict(os.environ, {"RANK": "0"}),
    ):
        first = telemetry_manager_class()
        second = telemetry_manager_class()
        assert first is second
        assert telemetry_manager_class.get_instance() is first


def test_telemetry_enabled_by_default(telemetry_manager_class):
    """Test that telemetry is enabled by default (opt-out)"""
    with (
        patch.dict(os.environ, {"RANK": "0"}, clear=True),
        patch("time.sleep"),
        patch("logging.Logger.info"),
    ):
        manager = telemetry_manager_class()
        assert manager.enabled


def test_telemetry_enabled_with_explicit_opt_in(telemetry_manager_class):
    """Test that telemetry is enabled when AXOLOTL_DO_NOT_TRACK=0"""
    with (
        patch.dict(os.environ, {"AXOLOTL_DO_NOT_TRACK": "0", "RANK": "0"}),
        patch("time.sleep"),
    ):
        manager = telemetry_manager_class()
        assert manager.enabled


def test_telemetry_disabled_with_axolotl_do_not_track(telemetry_manager_class):
    """Test that telemetry is disabled when AXOLOTL_DO_NOT_TRACK=1"""
    with (
        patch.dict(os.environ, {"AXOLOTL_DO_NOT_TRACK": "1", "RANK": "0"}),
        patch("time.sleep"),
    ):
        manager = telemetry_manager_class()
        assert not manager.enabled


def test_telemetry_disabled_with_do_not_track(telemetry_manager_class):
    """Test that telemetry is disabled when DO_NOT_TRACK=1"""
    with (
        patch.dict(
            os.environ, {"AXOLOTL_DO_NOT_TRACK": "0", "DO_NOT_TRACK": "1", "RANK": "0"}
        ),
        patch("time.sleep"),
    ):
        manager = telemetry_manager_class()
        assert not manager.enabled


def test_telemetry_disabled_for_non_main_process(telemetry_manager_class):
    """Test that telemetry is disabled for non-main processes"""
    with (
        patch.dict(os.environ, {"AXOLOTL_DO_NOT_TRACK": "0", "RANK": "1"}),
        patch("time.sleep"),
    ):
        manager = telemetry_manager_class()
        assert not manager.enabled


def test_is_whitelisted(telemetry_manager_class, mock_whitelist):
    """Test org whitelist functionality"""
    with (
        patch("axolotl.telemetry.manager.WHITELIST_PATH", mock_whitelist),
        patch.dict(os.environ, {"AXOLOTL_DO_NOT_TRACK": "0"}),
    ):
        manager = telemetry_manager_class()

        # Should match organizations from the mock whitelist
        assert manager._is_whitelisted("meta-llama/llama-7b")
        assert manager._is_whitelisted("mistralai/mistral-7b-instruct")
        # Should not match
        assert not manager._is_whitelisted("unknown/model")
        # Should handle case insensitively
        assert manager._is_whitelisted("META-LLAMA/Llama-7B")
        # Should handle empty input
        assert not manager._is_whitelisted("")


def test_system_info_collection(manager):
    """Test system information collection"""
    system_info = manager._get_system_info()

    # Check essential keys
    assert "os" in system_info
    assert "python_version" in system_info
    assert "cpu_count" in system_info
    assert "memory_total" in system_info
    assert "accelerator_count" in system_info


def test_send_event(telemetry_manager_class):
    """Test basic event sending"""
    with (
        patch("posthog.capture") as mock_capture,
        patch.dict(os.environ, {"AXOLOTL_DO_NOT_TRACK": "0"}),
    ):
        manager = telemetry_manager_class()

        # Test with clean properties (no PII)
        manager.send_event("test_event", {"key": "value"})
        assert mock_capture.called
        assert mock_capture.call_args[1]["event"] == "test_event"
        assert mock_capture.call_args[1]["properties"] == {"key": "value"}
        assert mock_capture.call_args[1]["distinct_id"] == manager.run_id

        # Test with default properties (None)
        mock_capture.reset_mock()
        manager.send_event("simple_event")
        assert mock_capture.called
        assert mock_capture.call_args[1]["properties"] == {}


def test_send_system_info(telemetry_manager_class):
    """Test sending system info"""
    with (
        patch("posthog.capture") as mock_capture,
        patch.dict(os.environ, {"AXOLOTL_DO_NOT_TRACK": "0"}),
    ):
        manager = telemetry_manager_class()
        manager.send_system_info()
        assert mock_capture.called
        assert mock_capture.call_args[1]["event"] == "system-info"
        assert mock_capture.call_args[1]["properties"] == manager.system_info


def test_redacted_properties(telemetry_manager_class):
    """Test path redaction in send_event method"""
    with (
        patch("posthog.capture") as mock_capture,
        patch.dict(os.environ, {"AXOLOTL_DO_NOT_TRACK": "0"}),
    ):
        manager = telemetry_manager_class()
        # Test with properties containing various paths and non-paths
        test_properties = {
            "filepath": "/home/user/sensitive/data.txt",
            "windows_path": "C:\\Users\\name\\Documents\\project\\file.py",
            "output_dir": "/var/lib/data",
            "path_to_model": "models/llama/7b",
            "message": "Training started",  # Should not be redacted
            "metrics": {"loss": 0.5, "accuracy": 0.95},  # Should not be redacted
            "base_model": "models/local_model",
            "nested": {
                "model_path": "/models/my_model",
                "root_dir": "/home/user/projects",
                "stats": {"steps": 1000, "epochs": 3},  # Should not be redacted
            },
        }

        manager.send_event("test_event", test_properties)

        # Verify the call was made
        assert mock_capture.called

        # Get the sanitized properties that were sent
        sanitized = mock_capture.call_args[1]["properties"]

        # Check that path-like and base_model keys were redacted
        assert sanitized["filepath"] == "[REDACTED]"
        assert sanitized["windows_path"] == "[REDACTED]"
        assert sanitized["path_to_model"] == "[REDACTED]"
        assert sanitized["base_model"] == "[REDACTED]"

        # Check that non-path values were preserved
        assert sanitized["message"] == "Training started"
        assert sanitized["metrics"] == {"loss": 0.5, "accuracy": 0.95}

        # Check nested structure handling
        assert sanitized["nested"]["model_path"] == "[REDACTED]"
        assert sanitized["nested"]["root_dir"] == "[REDACTED]"
        assert sanitized["nested"]["stats"] == {"steps": 1000, "epochs": 3}


def test_disable_telemetry(manager):
    """Test that disabled telemetry doesn't send events"""
    with patch("posthog.capture") as mock_capture:
        manager.enabled = False
        manager.send_event("test_event")
        assert not mock_capture.called


def test_exception_handling_during_send(manager):
    """Test that exceptions in PostHog are handled gracefully"""
    with (
        patch("posthog.capture", side_effect=Exception("Test error")),
        patch("logging.Logger.warning") as mock_warning,
    ):
        manager.send_event("test_event")
        warning_logged = False
        for call in mock_warning.call_args_list:
            if "Failed to send telemetry event" in str(call):
                warning_logged = True
                break
        assert warning_logged


def test_shutdown(manager):
    """Test shutdown behavior"""
    with patch("posthog.shutdown") as mock_shutdown:
        manager.shutdown()
        assert mock_shutdown.called


================================================
FILE: tests/telemetry/test_runtime_metrics.py
================================================
"""Tests for runtime metrics telemetry module"""

# pylint: disable=redefined-outer-name

from unittest.mock import MagicMock, patch

import pytest

from axolotl.telemetry.runtime_metrics import RuntimeMetrics, RuntimeMetricsTracker


@pytest.fixture
def mock_time():
    """Mock time.time() to have predictable values in tests"""
    with patch("time.time") as mock_time:
        # Start with time 1000.0 and increment by 10 seconds on each call
        times = [1000.0 + i * 10 for i in range(10)]
        mock_time.side_effect = times
        yield mock_time


@pytest.fixture
def mock_telemetry_manager():
    """Create a mock TelemetryManager"""
    with patch(
        "axolotl.telemetry.runtime_metrics.TelemetryManager"
    ) as mock_manager_class:
        mock_manager = MagicMock()
        mock_manager.enabled = True
        mock_manager_class.get_instance.return_value = mock_manager
        yield mock_manager


@pytest.fixture
def mock_psutil():
    """Mock psutil for memory information"""
    with patch("axolotl.telemetry.runtime_metrics.psutil") as mock_psutil:
        mock_process = MagicMock()
        mock_memory_info = MagicMock()
        # Set initial memory to 1GB
        mock_memory_info.rss = 1024 * 1024 * 1024
        mock_process.memory_info.return_value = mock_memory_info
        mock_psutil.Process.return_value = mock_process
        yield mock_psutil


@pytest.fixture
def mock_torch():
    """Mock torch.cuda functions"""
    with patch("axolotl.telemetry.runtime_metrics.torch") as mock_torch:
        mock_torch.cuda.is_available.return_value = True
        mock_torch.cuda.device_count.return_value = 2

        # Mock memory allocated per device (1GB for device 0, 2GB for device 1)
        mock_torch.cuda.memory_allocated.side_effect = lambda device: (
            (device + 1) * 1024 * 1024 * 1024
        )

        yield mock_torch


class TestRuntimeMetrics:
    """Tests for RuntimeMetrics class."""

    def test_initialization(self):
        """Test RuntimeMetrics initialization."""
        metrics = RuntimeMetrics(start_time=1000.0)

        assert metrics.start_time == 1000.0
        assert metrics.epoch_start_times == {}
        assert metrics.epoch_end_times == {}
        assert metrics.peak_gpu_memory == {}
        assert metrics.total_steps == 0
        assert metrics.current_epoch == 0
        assert metrics.current_step == 0
        assert metrics.peak_cpu_memory == 0

    def test_elapsed_time(self, mock_time):
        """Test elapsed_time property."""
        metrics = RuntimeMetrics(start_time=1000.0)

        # Mock time.time() to return 1050.0
        mock_time.side_effect = [1050.0]

        assert metrics.elapsed_time == 50.0

    def test_epoch_time(self):
        """Test epoch_time method."""
        metrics = RuntimeMetrics(start_time=1000.0)

        # No epoch data
        assert metrics.epoch_time(0) is None

        # Add epoch start but no end
        metrics.epoch_start_times[0] = 1000.0
        assert metrics.epoch_time(0) is None

        # Add epoch end
        metrics.epoch_end_times[0] = 1060.0
        assert metrics.epoch_time(0) == 60.0

    def test_average_epoch_time(self):
        """Test average_epoch_time method."""
        metrics = RuntimeMetrics(start_time=1000.0)

        # No completed epochs
        assert metrics.average_epoch_time() is None

        # Add one completed epoch
        metrics.epoch_start_times[0] = 1000.0
        metrics.epoch_end_times[0] = 1060.0
        assert metrics.average_epoch_time() == 60.0

        # Add second completed epoch
        metrics.epoch_start_times[1] = 1060.0
        metrics.epoch_end_times[1] = 1140.0  # 80 seconds
        assert metrics.average_epoch_time() == 70.0  # Average of 60 and 80

        # Add incomplete epoch (should not affect average)
        metrics.epoch_start_times[2] = 1140.0
        assert metrics.average_epoch_time() == 70.0

    def test_steps_per_second(self, mock_time):
        """Test steps_per_second method."""
        metrics = RuntimeMetrics(start_time=1000.0)

        # No steps - first call to time.time()
        mock_time.side_effect = None
        mock_time.return_value = 1050.0
        assert metrics.steps_per_second() is None

        # Add steps - second call to time.time()
        metrics.total_steps = 100
        mock_time.return_value = 1050.0  # Keep same time for consistent result
        assert metrics.steps_per_second() == 2.0  # 100 steps / 50 seconds

    def test_to_dict_basic(self, mock_time):
        """Test to_dict method with basic metrics."""
        metrics = RuntimeMetrics(start_time=1000.0)
        metrics.total_steps = 100
        metrics.peak_cpu_memory = 2 * 1024 * 1024 * 1024  # 2GB

        # Mock elapsed_time
        mock_time.side_effect = None
        mock_time.return_value = 1050.0

        result = metrics.to_dict()

        assert result["total_time_seconds"] == 50.0
        assert result["total_steps"] == 100
        assert result["steps_per_second"] == 2.0
        assert result["epochs_completed"] == 0
        assert result["peak_cpu_memory_bytes"] == 2 * 1024 * 1024 * 1024
        assert "epoch_times" not in result
        assert "gpu_memory" not in result

    def test_to_dict_with_epochs(self, mock_time):
        """Test to_dict method with epoch data."""
        metrics = RuntimeMetrics(start_time=1000.0)
        metrics.total_steps = 100

        # Add epoch data
        metrics.epoch_start_times[0] = 1000.0
        metrics.epoch_end_times[0] = 1060.0
        metrics.epoch_start_times[1] = 1060.0
        metrics.epoch_end_times[1] = 1140.0

        # Mock elapsed_time
        mock_time.side_effect = None
        mock_time.return_value = 1150.0

        result = metrics.to_dict()

        assert "epoch_times" in result
        assert result["epoch_times"]["epoch_0_seconds"] == 60.0
        assert result["epoch_times"]["epoch_1_seconds"] == 80.0
        assert result["average_epoch_time_seconds"] == 70.0

    def test_to_dict_with_gpu_memory(self, mock_time):
        """Test to_dict method with GPU memory data."""
        metrics = RuntimeMetrics(start_time=1000.0)
        metrics.peak_gpu_memory = {
            0: 1 * 1024 * 1024 * 1024,  # 1GB
            1: 2 * 1024 * 1024 * 1024,  # 2GB
        }

        # Mock elapsed_time
        mock_time.side_effect = [1050.0]

        result = metrics.to_dict()

        assert "gpu_memory" in result
        assert result["gpu_memory"]["gpu_0_peak_memory_bytes"] == 1 * 1024 * 1024 * 1024
        assert result["gpu_memory"]["gpu_1_peak_memory_bytes"] == 2 * 1024 * 1024 * 1024


class TestRuntimeMetricsTracker:
    """Tests for RuntimeMetricsTracker class."""

    # pylint: disable=unused-argument
    def test_initialization(self, mock_time, mock_telemetry_manager):
        """Test RuntimeMetricsTracker initialization."""
        tracker = RuntimeMetricsTracker()

        assert isinstance(tracker.metrics, RuntimeMetrics)
        assert tracker.metrics.start_time == 1000.0  # First value from mock_time

    # pylint: disable=unused-argument
    def test_start_epoch(
        self, mock_time, mock_psutil, mock_torch, mock_telemetry_manager
    ):
        """Test start_epoch method."""
        tracker = RuntimeMetricsTracker()

        # Reset mock_time to control next value
        mock_time.side_effect = [1010.0]

        tracker.start_epoch(0)

        assert tracker.metrics.current_epoch == 0
        assert tracker.metrics.epoch_start_times[0] == 1010.0

        # Verify memory metrics were updated
        assert tracker.metrics.peak_cpu_memory == 1 * 1024 * 1024 * 1024
        assert 0 in tracker.metrics.peak_gpu_memory
        assert 1 in tracker.metrics.peak_gpu_memory

    # pylint: disable=unused-argument
    def test_end_epoch(self, mock_time, mock_telemetry_manager):
        """Test end_epoch method."""
        tracker = RuntimeMetricsTracker()

        # Start epoch 0
        mock_time.side_effect = [1010.0]
        tracker.start_epoch(0)

        # End epoch 0
        mock_time.side_effect = [1060.0]
        tracker.end_epoch(0)

        assert 0 in tracker.metrics.epoch_end_times
        assert tracker.metrics.epoch_end_times[0] == 1060.0

    # pylint: disable=unused-argument
    def test_update_step(
        self, mock_time, mock_psutil, mock_torch, mock_telemetry_manager
    ):
        """Test update_step method."""
        tracker = RuntimeMetricsTracker()

        # Update step to a non-multiple of 100
        tracker.update_step(42)

        assert tracker.metrics.current_step == 42
        assert tracker.metrics.total_steps == 1

        # Memory metrics should not be updated for non-multiple of 100
        assert tracker.metrics.peak_cpu_memory == 0

        # Update step to a multiple of 100
        tracker.update_step(100)

        assert tracker.metrics.current_step == 100
        assert tracker.metrics.total_steps == 2

        # Memory metrics should be updated for multiple of 100
        assert tracker.metrics.peak_cpu_memory == 1 * 1024 * 1024 * 1024

    # pylint: disable=unused-argument
    def test_update_memory_metrics(
        self, mock_psutil, mock_torch, mock_telemetry_manager
    ):
        """Test update_memory_metrics method."""
        tracker = RuntimeMetricsTracker()

        # Initial memory state
        assert tracker.metrics.peak_cpu_memory == 0
        assert tracker.metrics.peak_gpu_memory == {}

        # Update memory metrics
        tracker.update_memory_metrics()

        # Verify CPU memory
        assert tracker.metrics.peak_cpu_memory == 1 * 1024 * 1024 * 1024

        # Verify GPU memory
        assert tracker.metrics.peak_gpu_memory[0] == 1 * 1024 * 1024 * 1024
        assert tracker.metrics.peak_gpu_memory[1] == 2 * 1024 * 1024 * 1024

        # Change mocked memory values to be lower
        mock_process = mock_psutil.Process.return_value
        mock_memory_info = mock_process.memory_info.return_value
        mock_memory_info.rss = 0.5 * 1024 * 1024 * 1024  # 0.5GB

        mock_torch.cuda.memory_allocated.side_effect = lambda device: (
            (device + 0.5) * 1024 * 1024 * 1024
        )

        # Update memory metrics again
        tracker.update_memory_metrics()

        # Peak values should not decrease
        assert tracker.metrics.peak_cpu_memory == 1 * 1024 * 1024 * 1024
        assert tracker.metrics.peak_gpu_memory[0] == 1 * 1024 * 1024 * 1024
        assert tracker.metrics.peak_gpu_memory[1] == 2 * 1024 * 1024 * 1024

        # Change mocked memory values to be higher
        mock_memory_info.rss = 2 * 1024 * 1024 * 1024  # 2GB

        mock_torch.cuda.memory_allocated.side_effect = lambda device: (
            (device + 2) * 1024 * 1024 * 1024
        )

        # Update memory metrics again
        tracker.update_memory_metrics()

        # Peak values should increase
        assert tracker.metrics.peak_cpu_memory == 2 * 1024 * 1024 * 1024
        assert tracker.metrics.peak_gpu_memory[0] == 2 * 1024 * 1024 * 1024
        assert tracker.metrics.peak_gpu_memory[1] == 3 * 1024 * 1024 * 1024

    # pylint: disable=unused-argument
    def test_get_memory_metrics(self, mock_psutil, mock_torch, mock_telemetry_manager):
        """Test get_memory_metrics method."""
        tracker = RuntimeMetricsTracker()

        # Set peak memory values
        tracker.metrics.peak_cpu_memory = 2 * 1024 * 1024 * 1024
        tracker.metrics.peak_gpu_memory = {
            0: 3 * 1024 * 1024 * 1024,
            1: 4 * 1024 * 1024 * 1024,
        }

        # Get memory metrics
        memory_metrics = tracker.get_memory_metrics()

        # Verify CPU memory
        assert (
            memory_metrics["cpu_memory_bytes"] == 1 * 1024 * 1024 * 1024
        )  # Current value from mock
        assert (
            memory_metrics["peak_cpu_memory_bytes"] == 2 * 1024 * 1024 * 1024
        )  # Peak value we set

        # Verify GPU memory
        assert (
            memory_metrics["gpu_0_memory_bytes"] == 1 * 1024 * 1024 * 1024
        )  # Current value from mock
        assert (
            memory_metrics["gpu_0_peak_memory_bytes"] == 3 * 1024 * 1024 * 1024
        )  # Peak value we set
        assert (
            memory_metrics["gpu_1_memory_bytes"] == 2 * 1024 * 1024 * 1024
        )  # Current value from mock
        assert (
            memory_metrics["gpu_1_peak_memory_bytes"] == 4 * 1024 * 1024 * 1024
        )  # Peak value we set


================================================
FILE: tests/test_chunked_xentropy.py
================================================
"""
test suite for chunked cross entropy
"""

import pytest
import torch
from torch import nn

from axolotl.monkeypatch.loss.chunked import get_causal_lm_loss


@pytest.fixture
def chunked_fixtures():
    model_dim = 512
    vocab_size = 1024 * 256
    seq_len = 2048
    batch_size = 1

    lm_head = nn.Linear(model_dim, vocab_size)
    hidden_state = torch.randn(batch_size, seq_len, model_dim)
    labels = torch.randint(low=0, high=vocab_size, size=(batch_size, seq_len))
    return lm_head, hidden_state, labels, vocab_size


def test_chunked_forward(chunked_fixtures):
    lm_head, hidden_state, labels, vocab_size = chunked_fixtures
    lm_loss = get_causal_lm_loss()

    logits = lm_head(hidden_state)

    chunked_lm_loss = lm_loss(logits, labels)

    logits_flattened = logits.view(-1, vocab_size)
    labels_flattened = labels.view(-1)

    loss = nn.functional.cross_entropy(
        logits_flattened.float(), labels_flattened, reduction="mean"
    )

    assert torch.allclose(chunked_lm_loss, loss, atol=1e-2, rtol=1e-2)


================================================
FILE: tests/test_context_parallel_batch_size.py
================================================
"""Tests for batch_size calculation with context parallelism."""

import sys
import types

import pytest

from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault


@pytest.fixture(name="cp_base_cfg")
def fixture_cp_base_cfg(min_base_cfg):
    return (
        DictDefault(
            micro_batch_size=2,
            gradient_accumulation_steps=4,
            sequence_len=2048,
            num_epochs=1,
            flash_attention=True,
        )
        | min_base_cfg
    )


class TestContextParallelBatchSize:
    """Verify batch_size scales by effective dp world_size when using context parallelism."""

    @pytest.mark.parametrize(
        "world_size, context_parallel_size, expected_batch_size",
        [
            (4, 1, 32),  # no CP: 2*4*4 = 32
            (4, 2, 16),  # CP=2: 2*4*(4//2) = 16
            (4, 4, 8),  # CP=4: 2*4*(4//4) = 8
            (2, 2, 8),  # CP=ws: 2*4*(2//2) = 8 (no scaling)
        ],
    )
    def test_batch_size_with_context_parallelism(
        self,
        cp_base_cfg,
        monkeypatch,
        world_size,
        context_parallel_size,
        expected_batch_size,
    ):
        monkeypatch.setenv("WORLD_SIZE", str(world_size))
        # Mock ring_flash_attn since it's not installable on CPU,
        # but required by schema validation when context_parallel_size > 1.
        if "ring_flash_attn" not in sys.modules:
            monkeypatch.setitem(
                sys.modules, "ring_flash_attn", types.ModuleType("ring_flash_attn")
            )
        cp_base_cfg["context_parallel_size"] = context_parallel_size
        cfg = validate_config(cp_base_cfg)
        normalize_config(cfg)
        assert cfg.batch_size == expected_batch_size


================================================
FILE: tests/test_convert.py
================================================
"""Unit tests for src/axolotl/convert.py"""

import json

import pytest

from axolotl.convert import (
    FileReader,
    FileWriter,
    JsonlSerializer,
    JsonParser,
    JsonToJsonlConverter,
    StdoutWriter,
)


class TestJsonParser:
    def test_parse_valid_json_array(self):
        parser = JsonParser()
        result = parser.parse('[{"key": "value"}]')
        assert result == [{"key": "value"}]

    def test_parse_valid_json_object(self):
        parser = JsonParser()
        result = parser.parse('{"key": "value"}')
        assert result == {"key": "value"}

    def test_parse_invalid_json_raises(self):
        parser = JsonParser()
        with pytest.raises(json.JSONDecodeError):
            parser.parse("not valid json")


class TestJsonlSerializer:
    def test_serialize_single_item(self):
        serializer = JsonlSerializer()
        result = serializer.serialize([{"a": 1}])
        assert result == '{"a": 1}'

    def test_serialize_multiple_items(self):
        serializer = JsonlSerializer()
        result = serializer.serialize([{"a": 1}, {"b": 2}])
        lines = result.split("\n")
        assert len(lines) == 2
        assert json.loads(lines[0]) == {"a": 1}
        assert json.loads(lines[1]) == {"b": 2}

    def test_serialize_empty_list(self):
        serializer = JsonlSerializer()
        result = serializer.serialize([])
        assert result == ""


class TestFileReaderWriter:
    def test_read_write_roundtrip(self, tmp_path):
        test_file = tmp_path / "test.txt"
        content = '{"hello": "world"}'
        writer = FileWriter(str(test_file))
        writer.write(content)

        reader = FileReader()
        result = reader.read(str(test_file))
        assert result == content


class TestStdoutWriter:
    def test_write_to_stdout(self, capsys):
        writer = StdoutWriter()
        writer.write("hello")
        captured = capsys.readouterr()
        assert captured.out == "hello\n"


class TestJsonToJsonlConverter:
    def test_convert_json_to_jsonl(self, tmp_path):
        input_data = [{"name": "Alice"}, {"name": "Bob"}]
        input_file = tmp_path / "input.json"
        output_file = tmp_path / "output.jsonl"

        input_file.write_text(json.dumps(input_data), encoding="utf-8")

        converter = JsonToJsonlConverter(
            FileReader(), FileWriter(str(output_file)), JsonParser(), JsonlSerializer()
        )
        converter.convert(str(input_file))

        result = output_file.read_text(encoding="utf-8")
        lines = result.split("\n")
        assert len(lines) == 2
        assert json.loads(lines[0]) == {"name": "Alice"}
        assert json.loads(lines[1]) == {"name": "Bob"}


================================================
FILE: tests/test_data.py
================================================
"""
test module for the axolotl.utils.data module
"""

import unittest

from transformers import LlamaTokenizer

from axolotl.utils.data import encode_streaming, md5
from axolotl.utils.trainer import filter_sequences_by_length

from tests.hf_offline_utils import enable_hf_offline


class TestEncodePretraining(unittest.TestCase):
    """
    test class for encode pretraining and md5 helper
    """

    @enable_hf_offline
    def setUp(self):
        self.tokenizer = LlamaTokenizer.from_pretrained("huggyllama/llama-7b")
        self.tokenizer.add_special_tokens(
            {
                "eos_token": "</s>",
                "bos_token": "<s>",
                "unk_token": "<unk>",
                "pad_token": "<pad>",
            }
        )
        self.max_tokens = 15  # set a small number for easy inspection

    def test_encode_pretraining(self):
        examples = {
            "text": [
                "Hello, world!",
                "Nice to meet you.",
                "lorem ipsum dolor sit amet.",
                "Nice to meet you again!.",
                "hello, hello",
            ]
        }
        result = encode_streaming(examples, self.tokenizer, self.max_tokens)

        self.assertEqual(len(result["input_ids"]), 3)

        # Assert the length of input_ids and attention_mask is correct
        self.assertEqual(len(result["input_ids"][0]), self.max_tokens)
        self.assertEqual(len(result["attention_mask"][0]), self.max_tokens)

        # Assert EOS and PAD tokens are correctly added
        # hello world! is 4 tokens
        self.assertEqual(result["input_ids"][0][0], self.tokenizer.bos_token_id)
        self.assertEqual(result["input_ids"][0][5], self.tokenizer.eos_token_id)
        self.assertEqual(result["input_ids"][0][6], self.tokenizer.pad_token_id)
        # second part, 5 tokens
        self.assertEqual(result["input_ids"][0][7], self.tokenizer.bos_token_id)
        self.assertEqual(result["input_ids"][0][13], self.tokenizer.eos_token_id)
        self.assertEqual(result["input_ids"][0][14], self.tokenizer.pad_token_id)

    def test_md5(self):
        self.assertEqual(md5("hello world"), "5eb63bbbe01eeed093cb22bb8f5acdc3")
        self.assertEqual(
            md5("hello world", "utf-8"), "5eb63bbbe01eeed093cb22bb8f5acdc3"
        )

    def test_excess_length_strategy(self):
        """Test that excess_length_strategy results in a value error when set to 'raise'."""

        # -- single sequence --
        # This should work
        data = {"input_ids": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]}
        filter_sequences_by_length(data, 32, raise_on_drop=True)

        # This should return True, since data fits
        dropped = filter_sequences_by_length(data, 32)
        self.assertTrue(dropped)

        # This should raise
        self.assertRaises(
            ValueError, filter_sequences_by_length, data, 15, raise_on_drop=True
        )

        # This should return False, since data doesn't fit
        dropped = filter_sequences_by_length(data, 15)
        self.assertFalse(dropped)

        # -- batch sequence --
        # This should work
        data = {
            "input_ids": [
                [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15],
                [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16],
            ]
        }
        filter_sequences_by_length(data, 32, raise_on_drop=True)

        # This should raise
        self.assertRaises(
            ValueError, filter_sequences_by_length, data, 15, raise_on_drop=True
        )

        # This should keep the first but drop the second entry
        dropped = filter_sequences_by_length(data, 15)
        self.assertEqual(dropped, [True, False])


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/test_datasets.py
================================================
"""Test dataset loading under various conditions."""

import shutil
import tempfile
from pathlib import Path
from typing import Any, Generator
from unittest.mock import patch

import pytest
from datasets import Dataset
from huggingface_hub import snapshot_download
from transformers import PreTrainedTokenizer

from axolotl.loaders.tokenizer import load_tokenizer
from axolotl.utils.data.rl import prepare_preference_datasets
from axolotl.utils.data.sft import (
    _load_tokenized_prepared_datasets,
)
from axolotl.utils.dict import DictDefault

from tests.constants import (
    ALPACA_MESSAGES_CONFIG_OG,
    ALPACA_MESSAGES_CONFIG_REVISION,
    SPECIAL_TOKENS,
)
from tests.hf_offline_utils import enable_hf_offline


class TestDatasetPreparation:
    """Test a configured dataloader."""

    @pytest.fixture
    def tokenizer(
        self, tokenizer_huggyllama
    ) -> Generator[PreTrainedTokenizer, Any, Any]:
        tokenizer_huggyllama.add_special_tokens(SPECIAL_TOKENS)
        yield tokenizer_huggyllama

    @pytest.fixture
    def dataset_fixture(self):
        yield Dataset.from_list(
            [
                {
                    "instruction": "Evaluate this sentence for spelling and grammar mistakes",
                    "input": "He finnished his meal and left the resturant",
                    "output": "He finished his meal and left the restaurant.",
                }
            ]
        )

    @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
    @enable_hf_offline
    def test_load_hub(self, tokenizer):
        """Core use case.  Verify that processing data from the hub works"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            prepared_path = Path(tmp_dir) / "prepared"
            cfg = DictDefault(
                {
                    "tokenizer_config": "huggyllama/llama-7b",
                    "sequence_len": 1024,
                    "datasets": [
                        {
                            "path": "mhenrichsen/alpaca_2k_test",
                            "type": "alpaca",
                        },
                    ],
                }
            )

            with patch(
                "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path)
            ):
                dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg)

            assert len(dataset) == 2000
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features

    @enable_hf_offline
    @pytest.mark.skip("datasets bug with local datasets when offline")
    def test_load_local_hub(self, tokenizer):
        """Niche use case.  Verify that a local copy of a hub dataset can be loaded"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
            tmp_ds_path.mkdir(parents=True, exist_ok=True)
            snapshot_path = snapshot_download(
                repo_id="mhenrichsen/alpaca_2k_test",
                repo_type="dataset",
                local_dir=tmp_ds_path,
            )
            # offline mode doesn't actually copy it to local_dir, so we
            # have to copy all the contents in the dir manually from the returned snapshot_path
            shutil.copytree(snapshot_path, tmp_ds_path, dirs_exist_ok=True)

            prepared_path = Path(tmp_dir) / "prepared"
            # Right now a local copy that doesn't fully conform to a dataset
            # must list data_files and ds_type otherwise the loader won't know
            # how to load it.
            cfg = DictDefault(
                {
                    "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
                    "sequence_len": 1024,
                    "datasets": [
                        {
                            "path": "mhenrichsen/alpaca_2k_test",
                            "ds_type": "parquet",
                            "type": "alpaca",
                            "data_files": [
                                f"{tmp_ds_path}/alpaca_2000.parquet",
                            ],
                        },
                    ],
                }
            )

            with patch(
                "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path)
            ):
                dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg)

            assert len(dataset) == 2000
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features
            shutil.rmtree(tmp_ds_path)

    @enable_hf_offline
    def test_load_from_save_to_disk(self, tokenizer, dataset_fixture):
        """Usual use case.  Verify datasets saved via `save_to_disk` can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_name = Path(tmp_dir) / "tmp_dataset"
            dataset_fixture.save_to_disk(str(tmp_ds_name))

            prepared_path = Path(tmp_dir) / "prepared"
            cfg = DictDefault(
                {
                    "tokenizer_config": "huggyllama/llama-7b",
                    "sequence_len": 256,
                    "datasets": [
                        {
                            "path": str(tmp_ds_name),
                            "type": "alpaca",
                        },
                    ],
                    "dataset_num_proc": 4,
                }
            )

            with patch(
                "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path)
            ):
                dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg)

            assert len(dataset) == 1
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features

    @enable_hf_offline
    def test_load_from_dir_of_parquet(self, tokenizer, dataset_fixture):
        """Usual use case. Verify a directory of parquet files can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_dir = Path(tmp_dir) / "tmp_dataset"
            tmp_ds_dir.mkdir()
            tmp_ds_path = tmp_ds_dir / "shard1.parquet"
            dataset_fixture.to_parquet(tmp_ds_path)

            prepared_path: Path = Path(tmp_dir) / "prepared"
            cfg = DictDefault(
                {
                    "tokenizer_config": "huggyllama/llama-7b",
                    "sequence_len": 256,
                    "datasets": [
                        {
                            "path": str(tmp_ds_dir),
                            "ds_type": "parquet",
                            "name": "test_data",
                            "data_files": [
                                str(tmp_ds_path),
                            ],
                            "type": "alpaca",
                        },
                    ],
                    "dataset_num_proc": 4,
                }
            )

            with patch(
                "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path)
            ):
                dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg)

            assert len(dataset) == 1
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features

    @enable_hf_offline
    def test_load_from_dir_of_json(self, tokenizer, dataset_fixture):
        """Standard use case.  Verify a directory of json files can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_dir = Path(tmp_dir) / "tmp_dataset"
            tmp_ds_dir.mkdir()
            tmp_ds_path = tmp_ds_dir / "shard1.json"
            dataset_fixture.to_json(tmp_ds_path)

            prepared_path: Path = Path(tmp_dir) / "prepared"
            cfg = DictDefault(
                {
                    "tokenizer_config": "huggyllama/llama-7b",
                    "sequence_len": 256,
                    "datasets": [
                        {
                            "path": str(tmp_ds_dir),
                            "ds_type": "json",
                            "name": "test_data",
                            "data_files": [
                                str(tmp_ds_path),
                            ],
                            "type": "alpaca",
                        },
                    ],
                    "dataset_num_proc": 4,
                }
            )

            with patch(
                "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path)
            ):
                dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg)

            assert len(dataset) == 1
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features

    @enable_hf_offline
    def test_load_from_single_parquet(self, tokenizer, dataset_fixture):
        """Standard use case.  Verify a single parquet file can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "tmp_dataset.parquet"
            dataset_fixture.to_parquet(tmp_ds_path)

            prepared_path: Path = Path(tmp_dir) / "prepared"
            cfg = DictDefault(
                {
                    "tokenizer_config": "huggyllama/llama-7b",
                    "sequence_len": 256,
                    "datasets": [
                        {
                            "path": str(tmp_ds_path),
                            "name": "test_data",
                            "type": "alpaca",
                        },
                    ],
                    "dataset_num_proc": 4,
                }
            )

            with patch(
                "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path)
            ):
                dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg)

            assert len(dataset) == 1
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features

    @enable_hf_offline
    def test_load_from_single_json(self, tokenizer, dataset_fixture):
        """Standard use case.  Verify a single json file can be loaded."""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "tmp_dataset.json"
            dataset_fixture.to_json(tmp_ds_path)

            prepared_path: Path = Path(tmp_dir) / "prepared"
            cfg = DictDefault(
                {
                    "tokenizer_config": "huggyllama/llama-7b",
                    "sequence_len": 256,
                    "datasets": [
                        {
                            "path": str(tmp_ds_path),
                            "name": "test_data",
                            "type": "alpaca",
                        },
                    ],
                    "dataset_num_proc": 4,
                }
            )

            with patch(
                "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path)
            ):
                dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg)

            assert len(dataset) == 1
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features

    @pytest.mark.skip(reason="TODO: fix hf offline mode for CI rate limits")
    @enable_hf_offline
    def test_load_hub_with_dpo(self):
        """Verify that processing dpo data from the hub works"""

        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "sequence_len": 1024,
                "rl": "dpo",
                "chat_template": "llama3",
                "datasets": [ALPACA_MESSAGES_CONFIG_OG],
            }
        )

        tokenizer = load_tokenizer(cfg)
        train_dataset, _ = prepare_preference_datasets(cfg, tokenizer)

        assert len(train_dataset) == 1800
        assert "conversation" not in train_dataset.features
        assert "chosen" in train_dataset.features
        assert "rejected" in train_dataset.features
        assert "prompt" in train_dataset.features

    @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
    @enable_hf_offline
    def test_load_hub_with_revision(self, tokenizer):
        """Verify that processing data from the hub works with a specific revision"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            prepared_path = Path(tmp_dir) / "prepared"

            # make sure prepared_path is empty
            shutil.rmtree(prepared_path, ignore_errors=True)

            cfg = DictDefault(
                {
                    "tokenizer_config": "huggyllama/llama-7b",
                    "sequence_len": 1024,
                    "datasets": [
                        {
                            "path": "mhenrichsen/alpaca_2k_test",
                            "type": "alpaca",
                            "revision": "d05c1cb",
                        },
                    ],
                }
            )

            with patch(
                "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path)
            ):
                dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg)

            assert len(dataset) == 2000
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features

    @enable_hf_offline
    def test_load_hub_with_revision_with_dpo(
        self, dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff
    ):
        """Verify that processing dpo data from the hub works with a specific revision"""

        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "sequence_len": 1024,
                "rl": "dpo",
                "chat_template": "llama3",
                "datasets": [ALPACA_MESSAGES_CONFIG_REVISION],
                "dataset_num_proc": 4,
            }
        )

        with patch(
            "axolotl.utils.data.rl.load_dataset_with_config"
        ) as mock_load_dataset:
            # Set up the mock to return different values on successive calls
            mock_load_dataset.return_value = (
                dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff
            )

            tokenizer = load_tokenizer(cfg)
            train_dataset, _ = prepare_preference_datasets(cfg, tokenizer)

            assert len(train_dataset) == 1800
            assert "conversation" not in train_dataset.features
            assert "chosen" in train_dataset.features
            assert "rejected" in train_dataset.features
            assert "prompt" in train_dataset.features

    @enable_hf_offline
    @pytest.mark.skip("datasets bug with local datasets when offline")
    def test_load_local_hub_with_revision(
        self, dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff, tokenizer
    ):
        """Verify that a local copy of a hub dataset can be loaded with a specific revision"""
        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
            tmp_ds_path.mkdir(parents=True, exist_ok=True)
            snapshot_path = snapshot_download(
                repo_id="mhenrichsen/alpaca_2k_test",
                repo_type="dataset",
                local_dir=tmp_ds_path,
                revision="d05c1cb",
            )
            shutil.copytree(snapshot_path, tmp_ds_path, dirs_exist_ok=True)

            prepared_path = Path(tmp_dir) / "prepared"
            cfg = DictDefault(
                {
                    "tokenizer_config": "huggyllama/llama-7b",
                    "sequence_len": 1024,
                    "datasets": [
                        {
                            "path": "mhenrichsen/alpaca_2k_test",
                            "ds_type": "parquet",
                            "type": "alpaca",
                            "data_files": [
                                f"{tmp_ds_path}/alpaca_2000.parquet",
                            ],
                            "revision": "d05c1cb",
                        },
                    ],
                }
            )

            with patch(
                "axolotl.utils.data.shared.load_dataset_with_config"
            ) as mock_load_dataset:
                # Set up the mock to return different values on successive calls
                mock_load_dataset.return_value = (
                    dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff
                )

                with patch(
                    "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH",
                    str(prepared_path),
                ):
                    dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg)

                assert len(dataset) == 2000
                assert "input_ids" in dataset.features
                assert "attention_mask" in dataset.features
                assert "labels" in dataset.features
                shutil.rmtree(tmp_ds_path)

    @enable_hf_offline
    def test_loading_local_dataset_folder(self, tokenizer):
        """Verify that a dataset downloaded to a local folder can be loaded"""

        with tempfile.TemporaryDirectory() as tmp_dir:
            tmp_ds_path = Path(tmp_dir) / "mhenrichsen/alpaca_2k_test"
            tmp_ds_path.mkdir(parents=True, exist_ok=True)
            snapshot_path = snapshot_download(
                repo_id="mhenrichsen/alpaca_2k_test",
                repo_type="dataset",
            )
            shutil.copytree(snapshot_path, tmp_ds_path, dirs_exist_ok=True)

            prepared_path = Path(tmp_dir) / "prepared"
            cfg = DictDefault(
                {
                    "tokenizer_config": "huggyllama/llama-7b",
                    "sequence_len": 1024,
                    "datasets": [
                        {
                            "path": str(tmp_ds_path),
                            "type": "alpaca",
                        },
                    ],
                    "dataset_num_proc": 4,
                }
            )

            with patch(
                "axolotl.common.const.DEFAULT_DATASET_PREPARED_PATH", str(prepared_path)
            ):
                dataset, _ = _load_tokenized_prepared_datasets(tokenizer, cfg)

            assert len(dataset) == 2000
            assert "input_ids" in dataset.features
            assert "attention_mask" in dataset.features
            assert "labels" in dataset.features
            shutil.rmtree(tmp_ds_path)


================================================
FILE: tests/test_dict.py
================================================
"""Module for testing DictDefault class"""

import unittest

import pytest

from axolotl.utils.dict import DictDefault


class DictDefaultTest(unittest.TestCase):
    """
    Test DictDefault class
    """

    def test_dict_default(self):
        cfg = DictDefault(
            {
                "key_a": {"key_b": "value_a"},
                "key_c": "value_c",
                "key_d": ["value_d", "value_e"],
            }
        )

        assert cfg.key_a.key_b == "value_a", (
            "DictDefault should return value for existing nested keys"
        )

        assert cfg.key_c == "value_c", (
            "DictDefault should return value for existing keys"
        )

        assert cfg.key_d[0] == "value_d", (
            "DictDefault should return value for existing keys in list"
        )

        assert "value_e" in cfg.key_d, (
            "DictDefault should support in operator for existing keys in list"
        )

    def test_dict_or_operator(self):
        cfg = DictDefault({"key_a": {"key_b": "value_b"}, "key_f": "value_g"})

        cfg = cfg | DictDefault(
            {
                "key_a": {"key_b": "value_a"},
                "key_c": "value_c",
                "key_d": ["value_d", "value_e"],
                "key_f": "value_f",
            }
        )

        assert cfg.key_a.key_b == "value_b", (
            "DictDefault should support OR operator for existing nested keys"
        )

        assert cfg.key_c == "value_c", "DictDefault should not delete existing key"

        assert cfg.key_d == [
            "value_d",
            "value_e",
        ], "DictDefault should not overwrite existing keys in list"

        assert cfg.key_f == "value_g", (
            "DictDefault should support OR operator for existing key"
        )

    def test_dict_missingkey(self):
        cfg = DictDefault({})

        assert cfg.random_key is None, "DictDefault should return None for missing keys"

    def test_dict_or(self):
        cfg = DictDefault({}) | DictDefault({})

        assert cfg.random_key is None, (
            "DictDefault should return None for missing keys after | operation"
        )

    def test_dict_nested_missingparentkey(self):
        """
        Due to subclassing Dict, DictDefault will error if we try to access a nested key whose parent key does not exist.
        """
        cfg = DictDefault({})

        with pytest.raises(
            AttributeError,
            match=r"'NoneType' object has no attribute 'another_random_key'",
        ):
            cfg.random_key.another_random_key = "value"

    def test_dict_shorthand_assignment(self):
        """
        Shorthand assignment is said to not be supported if subclassed. However, their example raises error instead of None.
        This test ensures that it is supported for current implementation.

        Ref: https://github.com/mewwts/addict#default-values
        """

        cfg = DictDefault({"key_a": {"key_b": "value_a"}})

        cfg.key_a.key_b = "value_b"

        assert cfg.key_a.key_b == "value_b", "Shorthand assignment should be supported"


================================================
FILE: tests/test_exact_deduplication.py
================================================
"""Test suite for functions in the `axolotl.utils.data.utils` module, focusing on the
`deduplicate_and_log_datasets` function.

Additionally, this test suite includes tests for functions that indirectly call
`deduplicate_and_log_datasets` during the execution of the preprocess command.
"""

import unittest
from unittest.mock import patch

import pytest
from datasets import Dataset

from axolotl.loaders import load_processor, load_tokenizer
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.data import prepare_datasets, prepare_preference_datasets
from axolotl.utils.data.utils import deduplicate_and_log_datasets
from axolotl.utils.dict import DictDefault

from tests.constants import ALPACA_MESSAGES_CONFIG_REVISION
from tests.hf_offline_utils import enable_hf_offline


def verify_deduplication(actual_dataset, expected_dataset, dataset_name):
    """Validates deduplication results and size consistency.

    Parameters:
    - actual_dataset: Deduplicated dataset.
    - expected_dataset: Expected dataset.
    - dataset_name: Name of the dataset (e.g., 'train' or 'eval').

    Asserts:
    - Datasets match in content.
    - Dataset size matches unique row count.
    """
    # Convert datasets to sets of tuples for unordered comparison
    actual_rows = set(tuple(row.values()) for row in actual_dataset)
    expected_rows = set(tuple(row.values()) for row in expected_dataset)

    # Verify deduplication correctness
    assert actual_rows == expected_rows, f"Mismatch in {dataset_name} dataset"

    # Verify size consistency
    assert len(actual_rows) == len(actual_dataset), (
        f"Size mismatch in {dataset_name} dataset after deduplication"
    )


class TestDeduplicateIndividualFunctions(unittest.TestCase):
    """Test class for deduplication function in data utils"""

    def setUp(self):
        # Sample data with duplicates
        self.data = {
            "column1": ["apple", "banana", "apple", "orange", "banana"],
            "column2": [1, 2, 1, 3, 2],
            "column3": ["red", "yellow", "red", "orange", "yellow"],
        }

        # Expected result after deduplication
        self.expected_data = {
            "column1": ["apple", "banana", "orange"],
            "column2": [1, 2, 3],
            "column3": ["red", "yellow", "orange"],
        }

        # Convert to Dataset format
        self.dataset = Dataset.from_dict(self.data)
        self.expected_dataset = Dataset.from_dict(self.expected_data)

    def test_deduplication(self):
        train_dataset, _ = deduplicate_and_log_datasets(dataset=self.dataset)
        eval_dataset, _ = deduplicate_and_log_datasets(
            dataset=self.dataset, dataset_name="eval"
        )

        verify_deduplication(train_dataset, self.expected_dataset, "train_dataset")
        verify_deduplication(eval_dataset, self.expected_dataset, "eval_dataset")

    def test_exact_duplicates(self):
        # Test when datasets are exact duplicates
        duplicate_data = {
            "column1": ["apple", "apple", "apple"],
            "column2": [1, 1, 1],
            "column3": ["red", "red", "red"],
        }
        expected_data = {"column1": ["apple"], "column2": [1], "column3": ["red"]}

        # Convert to Dataset format
        dataset = Dataset.from_dict(duplicate_data)
        expected_dataset = Dataset.from_dict(expected_data)

        # Run deduplication
        train_dataset, _ = deduplicate_and_log_datasets(dataset=dataset)
        eval_dataset, _ = deduplicate_and_log_datasets(
            dataset=dataset, dataset_name="eval"
        )

        verify_deduplication(train_dataset, expected_dataset, "train_dataset")
        verify_deduplication(eval_dataset, expected_dataset, "eval_dataset")

    def test_partial_duplicates(self):
        # Test when only part of the dataset is a duplicate
        partial_duplicate_data = {
            "column1": ["apple", "banana", "apple"],
            "column2": [1, 2, 1],
            "column3": ["red", "yellow", "red"],
        }
        expected_data = {
            "column1": ["apple", "banana"],
            "column2": [1, 2],
            "column3": ["red", "yellow"],
        }

        # Convert to Dataset format
        dataset = Dataset.from_dict(partial_duplicate_data)
        expected_dataset = Dataset.from_dict(expected_data)

        # Run deduplication
        train_dataset, _ = deduplicate_and_log_datasets(dataset=dataset)
        eval_dataset, _ = deduplicate_and_log_datasets(
            dataset=dataset, dataset_name="eval"
        )

        verify_deduplication(train_dataset, expected_dataset, "train_dataset")
        verify_deduplication(eval_dataset, expected_dataset, "eval_dataset")

    def test_combined_duplicates_empty(self):
        # Test when only part of the dataset is a duplicate
        partial_duplicate_data = {
            "column1": ["apple", "banana", "apple"],
            "column2": [1, 2, 1],
            "column3": ["red", "yellow", "red"],
        }
        expected_data_train = {
            "column1": ["apple", "banana"],
            "column2": [1, 2],
            "column3": ["red", "yellow"],
        }
        expected_data_eval = {
            "column1": [],
            "column2": [],
            "column3": [],
        }

        # Convert to Dataset format
        dataset = Dataset.from_dict(partial_duplicate_data)
        expected_dataset_train = Dataset.from_dict(expected_data_train)
        expected_dataset_eval = Dataset.from_dict(expected_data_eval)

        # Run deduplication
        train_dataset, eval_dataset = deduplicate_and_log_datasets(
            dataset=dataset, other_dataset=dataset
        )

        verify_deduplication(train_dataset, expected_dataset_train, "train_dataset")
        verify_deduplication(eval_dataset, expected_dataset_eval, "eval_dataset")

    def test_combined_duplicates_one(self):
        # Test when only part of the dataset is a duplicate
        partial_duplicate_data_train = {
            "column1": ["apple", "banana", "apple"],
            "column2": [1, 2, 1],
            "column3": ["red", "yellow", "red"],
        }
        partial_duplicate_data_eval = {
            "column1": ["apple", "orange", "apple"],
            "column2": [1, 2, 1],
            "column3": ["red", "orange", "red"],
        }
        expected_data_train = {
            "column1": ["apple", "banana"],
            "column2": [1, 2],
            "column3": ["red", "yellow"],
        }
        expected_data_eval = {
            "column1": ["orange"],
            "column2": [2],
            "column3": ["orange"],
        }

        # Convert to Dataset format
        dataset_train = Dataset.from_dict(partial_duplicate_data_train)
        dataset_eval = Dataset.from_dict(partial_duplicate_data_eval)
        expected_dataset_train = Dataset.from_dict(expected_data_train)
        expected_dataset_eval = Dataset.from_dict(expected_data_eval)

        # Run deduplication
        train_dataset, eval_dataset = deduplicate_and_log_datasets(
            dataset=dataset_train, other_dataset=dataset_eval
        )

        verify_deduplication(train_dataset, expected_dataset_train, "train_dataset")
        verify_deduplication(eval_dataset, expected_dataset_eval, "eval_dataset")


class TestDeduplicateRLDataset:
    """Test a configured dataloader with deduplication."""

    @pytest.fixture
    def cfg(self):
        fixture = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "sequence_len": 1024,
                "rl": "dpo",
                "chat_template": "llama3",
                "dataset_exact_deduplication": True,
                "datasets": [
                    ALPACA_MESSAGES_CONFIG_REVISION,
                    ALPACA_MESSAGES_CONFIG_REVISION,
                ],
                "dataset_num_proc": 4,
            }
        )
        yield fixture

    @enable_hf_offline
    def test_load_with_deduplication(
        self,
        cfg,
        dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff,
        tokenizer_huggyllama,
    ):
        """Verify that loading with deduplication removes duplicates."""

        with (
            patch(
                "axolotl.utils.data.rl.load_dataset_with_config"
            ) as mock_load_dataset,
            patch("axolotl.loaders.load_tokenizer") as mock_load_tokenizer,
        ):
            # Set up the mock to return different values on successive calls
            mock_load_dataset.side_effect = [
                dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff,
                dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff,
            ]
            mock_load_tokenizer.return_value = tokenizer_huggyllama

            tokenizer = load_tokenizer(cfg)
            train_dataset, _ = prepare_preference_datasets(cfg, tokenizer)

            # Verify that the dataset has been deduplicated
            assert len(train_dataset) == 1800, "Dataset was not properly deduplicated"

    @enable_hf_offline
    def test_load_without_deduplication(
        self,
        cfg,
        dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff,
        tokenizer_huggyllama,
    ):
        with (
            patch(
                "axolotl.utils.data.rl.load_dataset_with_config"
            ) as mock_load_dataset,
            patch("axolotl.loaders.load_tokenizer") as mock_load_tokenizer,
        ):
            # Set up the mock to return different values on successive calls
            mock_load_dataset.side_effect = [
                dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff,
                dataset_fozziethebeat_alpaca_messages_2k_dpo_test_rev_ea82cff,
            ]
            mock_load_tokenizer.return_value = tokenizer_huggyllama

            # Load the dataset without deduplication
            cfg.dataset_exact_deduplication = False
            tokenizer = load_tokenizer(cfg)
            train_dataset, _ = prepare_preference_datasets(cfg, tokenizer)

            # Verify that the dataset retains duplicates
            assert len(train_dataset) == 1800 * 2, (
                "Dataset deduplication occurred when it should not have"
            )


class TestDeduplicateNonRL(unittest.TestCase):
    """Test prepare_dataset function with different configurations."""

    @enable_hf_offline
    def setUp(self) -> None:
        self.cfg_1 = DictDefault(
            {
                "base_model": "huggyllama/llama-7b",
                "tokenizer_config": "huggyllama/llama-7b",
                "sequence_len": 1024,
                "dataset_exact_deduplication": True,
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "val_set_size": 0.0,
                "gradient_accumulation_steps": 2,
                "batch_size": 10,
                "micro_batch_size": 10,
                "num_epochs": 1,
            }
        )
        self.cfg_1 = validate_config(self.cfg_1)
        normalize_config(self.cfg_1)

    @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
    @enable_hf_offline
    def test_prepare_dataset_with_deduplication_train(self):
        """Verify that prepare_dataset function processes the dataset correctly with deduplication."""
        self.cfg_1.dataset_exact_deduplication = True

        # Load tokenizer and processor
        tokenizer = load_tokenizer(self.cfg_1)
        processor = (
            load_processor(self.cfg_1, tokenizer=tokenizer)
            if self.cfg_1.processor_type
            else None
        )

        # Prepare dataset using the prepare_dataset function
        train_dataset, _, _, _ = prepare_datasets(
            self.cfg_1,
            tokenizer,
            processor=processor,
        )

        self.assertEqual(
            len(train_dataset),
            2000,
            "Train dataset should have 2000 samples after deduplication.",
        )

    @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
    @enable_hf_offline
    def test_prepare_dataset_with_deduplication_eval(self):
        """Verify that prepare_dataset function processes the dataset correctly with deduplication."""
        self.cfg_1.dataset_exact_deduplication = True
        self.cfg_1.val_set_size = 0.5
        # Load tokenizer and processor
        tokenizer = load_tokenizer(self.cfg_1)
        processor = (
            load_processor(self.cfg_1, tokenizer=tokenizer)
            if self.cfg_1.processor_type
            else None
        )

        # Prepare dataset using the prepare_dataset function
        _, eval_dataset, _, _ = prepare_datasets(
            self.cfg_1,
            tokenizer,
            processor=processor,
        )

        self.assertEqual(
            len(eval_dataset),
            1000,
            "Eval dataset should have 2000 samples after deduplication.",
        )

    @pytest.mark.skip(reason="TODO: fix hf hub offline to work with HF rate limits")
    @enable_hf_offline
    def test_prepare_dataset_without_deduplication(self):
        """Verify that prepare_dataset function processes the dataset correctly without deduplication."""
        self.cfg_1.dataset_exact_deduplication = False
        self.cfg_1.val_set_size = 0.1
        # Load tokenizer and processor
        tokenizer = load_tokenizer(self.cfg_1)
        processor = (
            load_processor(self.cfg_1, tokenizer=tokenizer)
            if self.cfg_1.processor_type
            else None
        )

        # Prepare dataset using the prepare_dataset function
        train_dataset, eval_dataset, _, _ = prepare_datasets(
            self.cfg_1,
            tokenizer,
            processor=processor,
        )

        # Verify that the dataset has been prepared correctly
        self.assertEqual(
            len(train_dataset),
            1800 * 2,
            "Train dataset should have 3600 samples without deduplication.",
        )
        self.assertEqual(
            len(eval_dataset),
            200 * 2,
            "Train dataset should have 400 samples after deduplication.",
        )


class TestWrongCollisions(unittest.TestCase):
    """Creating mock datasets for testing wrong collisions."""

    def setUp(self):
        self.train_data = {"text": ["sample 5", "sample 6"], "label": [1, 2]}
        self.eval_data = {
            "text": [
                "sample 5",
                "sample 7",
            ],  # Different label but same text as in train_data
            "label": [2, 3],
        }
        self.dataset_data = {
            "text": ["sample 5", "sample 9", "sample 5"],
            "label": [1, 2, 8],
        }
        self.train_dataset = Dataset.from_dict(self.train_data)
        self.eval_dataset = Dataset.from_dict(self.eval_data)
        self.dataset = Dataset.from_dict(self.dataset_data)

    def test_deduplication_dataset_only(self):
        dedup_dataset, _ = deduplicate_and_log_datasets(dataset=self.dataset)
        self.assertEqual(
            len(dedup_dataset), 3, "Dataset should have all original values"
        )
        self.assertEqual(
            str(dedup_dataset),
            str(self.dataset),
            "The string representation of the output dataset should not differ.",
        )


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/test_freeze.py
================================================
"""
This module contains unit tests for the `freeze_layers_except` function.

The `freeze_layers_except` function is used to freeze layers in a model, except for the specified layers.
The unit tests in this module verify the behavior of the `freeze_layers_except` function in different scenarios.
"""

import unittest

import torch
from torch import nn

from axolotl.utils.freeze import freeze_layers_except

ZERO = [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]
ONE_TO_TEN = [1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0]


class TestFreezeLayersExcept(unittest.TestCase):
    """
    A test case class for the `freeze_layers_except` function.
    """

    def setUp(self):
        self.model = _TestModel()

    def test_freeze_layers_with_dots_in_name(self):
        freeze_layers_except(self.model, ["features.layer"])
        self.assertTrue(
            self.model.features.layer.weight.requires_grad,
            "model.features.layer should be trainable.",
        )
        self.assertFalse(
            self.model.classifier.weight.requires_grad,
            "model.classifier should be frozen.",
        )

    def test_freeze_layers_without_dots_in_name(self):
        freeze_layers_except(self.model, ["classifier"])
        self.assertFalse(
            self.model.features.layer.weight.requires_grad,
            "model.features.layer should be trainable.",
        )
        self.assertTrue(
            self.model.classifier.weight.requires_grad,
            "model.classifier should be frozen.",
        )

    def test_freeze_layers_regex_patterns(self):
        # The second pattern cannot match because only characters 'a' to 'c' are allowed after the word 'class', whereas it should be matching the character 'i'.
        freeze_layers_except(self.model, [r"^features.[a-z]+.weight$", r"class[a-c]+"])
        self.assertTrue(
            self.model.features.layer.weight.requires_grad,
            "model.features.layer should be trainable.",
        )
        self.assertFalse(
            self.model.classifier.weight.requires_grad,
            "model.classifier should be frozen.",
        )

    def test_all_layers_frozen(self):
        freeze_layers_except(self.model, [])
        self.assertFalse(
            self.model.features.layer.weight.requires_grad,
            "model.features.layer should be frozen.",
        )
        self.assertFalse(
            self.model.classifier.weight.requires_grad,
            "model.classifier should be frozen.",
        )

    def test_all_layers_unfrozen(self):
        freeze_layers_except(self.model, ["features.layer", "classifier"])
        self.assertTrue(
            self.model.features.layer.weight.requires_grad,
            "model.features.layer should be trainable.",
        )
        self.assertTrue(
            self.model.classifier.weight.requires_grad,
            "model.classifier should be trainable.",
        )

    def test_freeze_layers_with_range_pattern_start_end(self):
        freeze_layers_except(self.model, ["features.layer[1:5]"])
        self.assertTrue(
            self.model.features.layer.weight.requires_grad,
            "model.features.layer should be trainable.",
        )
        self.assertFalse(
            self.model.classifier.weight.requires_grad,
            "model.classifier should be frozen.",
        )

        self._assert_gradient_output(
            [
                ZERO,
                ONE_TO_TEN,
                ONE_TO_TEN,
                ONE_TO_TEN,
                ONE_TO_TEN,
                ZERO,
                ZERO,
                ZERO,
                ZERO,
                ZERO,
            ]
        )

    def test_freeze_layers_with_range_pattern_single_index(self):
        freeze_layers_except(self.model, ["features.layer[5]"])
        self.assertTrue(
            self.model.features.layer.weight.requires_grad,
            "model.features.layer should be trainable.",
        )
        self.assertFalse(
            self.model.classifier.weight.requires_grad,
            "model.classifier should be frozen.",
        )

        self._assert_gradient_output(
            [ZERO, ZERO, ZERO, ZERO, ZERO, ONE_TO_TEN, ZERO, ZERO, ZERO, ZERO]
        )

    def test_freeze_layers_with_range_pattern_start_omitted(self):
        freeze_layers_except(self.model, ["features.layer[:5]"])
        self.assertTrue(
            self.model.features.layer.weight.requires_grad,
            "model.features.layer should be trainable.",
        )
        self.assertFalse(
            self.model.classifier.weight.requires_grad,
            "model.classifier should be frozen.",
        )

        self._assert_gradient_output(
            [
                ONE_TO_TEN,
                ONE_TO_TEN,
                ONE_TO_TEN,
                ONE_TO_TEN,
                ONE_TO_TEN,
                ZERO,
                ZERO,
                ZERO,
                ZERO,
                ZERO,
            ]
        )

    def test_freeze_layers_with_range_pattern_end_omitted(self):
        freeze_layers_except(self.model, ["features.layer[4:]"])
        self.assertTrue(
            self.model.features.layer.weight.requires_grad,
            "model.features.layer should be trainable.",
        )
        self.assertFalse(
            self.model.classifier.weight.requires_grad,
            "model.classifier should be frozen.",
        )

        self._assert_gradient_output(
            [
                ZERO,
                ZERO,
                ZERO,
                ZERO,
                ONE_TO_TEN,
                ONE_TO_TEN,
                ONE_TO_TEN,
                ONE_TO_TEN,
                ONE_TO_TEN,
                ONE_TO_TEN,
            ]
        )

    def test_freeze_layers_with_range_pattern_merge_included(self):
        freeze_layers_except(self.model, ["features.layer[4:]", "features.layer[5:6]"])
        self.assertTrue(
            self.model.features.layer.weight.requires_grad,
            "model.features.layer should be trainable.",
        )
        self.assertFalse(
            self.model.classifier.weight.requires_grad,
            "model.classifier should be frozen.",
        )

        self._assert_gradient_output(
            [
                ZERO,
                ZERO,
                ZERO,
                ZERO,
                ONE_TO_TEN,
                ONE_TO_TEN,
                ONE_TO_TEN,
                ONE_TO_TEN,
                ONE_TO_TEN,
                ONE_TO_TEN,
            ]
        )

    def test_freeze_layers_with_range_pattern_merge_intersect(self):
        freeze_layers_except(self.model, ["features.layer[4:7]", "features.layer[6:8]"])
        self.assertTrue(
            self.model.features.layer.weight.requires_grad,
            "model.features.layer should be trainable.",
        )
        self.assertFalse(
            self.model.classifier.weight.requires_grad,
            "model.classifier should be frozen.",
        )

        self._assert_gradient_output(
            [
                ZERO,
                ZERO,
                ZERO,
                ZERO,
                ONE_TO_TEN,
                ONE_TO_TEN,
                ONE_TO_TEN,
                ONE_TO_TEN,
                ZERO,
                ZERO,
            ]
        )

    def test_freeze_layers_with_range_pattern_merge_separate(self):
        freeze_layers_except(
            self.model,
            ["features.layer[1:2]", "features.layer[3:4]", "features.layer[5:6]"],
        )
        self.assertTrue(
            self.model.features.layer.weight.requires_grad,
            "model.features.layer should be trainable.",
        )
        self.assertFalse(
            self.model.classifier.weight.requires_grad,
            "model.classifier should be frozen.",
        )

        self._assert_gradient_output(
            [
                ZERO,
                ONE_TO_TEN,
                ZERO,
                ONE_TO_TEN,
                ZERO,
                ONE_TO_TEN,
                ZERO,
                ZERO,
                ZERO,
                ZERO,
            ]
        )

    def _assert_gradient_output(self, expected):
        input_tensor = torch.tensor([ONE_TO_TEN], dtype=torch.float32)

        self.model.features.layer.weight.grad = None  # Reset gradients
        output = self.model.features.layer(input_tensor)
        loss = output.sum()
        loss.backward()

        expected_grads = torch.tensor(expected)
        torch.testing.assert_close(
            self.model.features.layer.weight.grad, expected_grads
        )


class _SubLayerModule(nn.Module):
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(10, 10)


class _TestModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.features = _SubLayerModule()
        self.classifier = nn.Linear(10, 2)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/test_loaders.py
================================================
"""Module for `axolotl.loaders`."""

from unittest.mock import MagicMock

import pytest
from transformers import BitsAndBytesConfig, PreTrainedTokenizerBase
from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled
from transformers.utils.import_utils import is_torch_mps_available

from axolotl.loaders import ModelLoader
from axolotl.utils.dict import DictDefault
from axolotl.utils.distributed import _get_parallel_config_kwargs


class TestModelsUtils:
    """Testing module for `axolotl.loaders`."""

    def setup_method(self) -> None:
        # load config
        self.cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "model_type": "AutoModelForCausalLM",
                "tokenizer_type": "AutoTokenizer",
                "load_in_8bit": True,
                "load_in_4bit": False,
                "adapter": "lora",
                "flash_attention": False,
                "sample_packing": True,
                "device_map": "auto",
            }
        )
        self.tokenizer = MagicMock(spec=PreTrainedTokenizerBase)
        self.inference = False
        self.reference_model = True

        # init ModelLoader
        self.model_loader = ModelLoader(
            cfg=self.cfg,
            tokenizer=self.tokenizer,
            inference=self.inference,
            reference_model=self.reference_model,
        )

    def test_set_device_map_config(self):
        # check device_map
        device_map = self.cfg.device_map
        if is_torch_mps_available():
            device_map = "mps"

        self.model_loader._set_device_map_config()
        if is_deepspeed_zero3_enabled():
            assert "device_map" not in self.model_loader.model_kwargs
        else:
            assert device_map in self.model_loader.model_kwargs["device_map"]

        # check torch_dtype
        assert self.cfg.torch_dtype == self.model_loader.model_kwargs["torch_dtype"]

    @pytest.mark.parametrize("adapter", ["lora", "qlora", None])
    @pytest.mark.parametrize("load_in_8bit", [True, False])
    @pytest.mark.parametrize("load_in_4bit", [True, False])
    @pytest.mark.parametrize("gptq", [True, False])
    def test_set_quantization_config(
        self,
        adapter,
        load_in_8bit,
        load_in_4bit,
        gptq,
    ):
        # init cfg as args
        self.cfg.load_in_8bit = load_in_8bit
        self.cfg.load_in_4bit = load_in_4bit
        self.cfg.gptq = gptq
        self.cfg.adapter = adapter

        self.model_loader._set_quantization_config()
        if "quantization_config" in self.model_loader.model_kwargs or self.cfg.gptq:
            assert not (
                hasattr(self.model_loader.model_kwargs, "load_in_8bit")
                and hasattr(self.model_loader.model_kwargs, "load_in_4bit")
            )

        if self.cfg.adapter == "qlora" and load_in_4bit:
            assert isinstance(
                self.model_loader.model_kwargs.get("quantization_config"),
                BitsAndBytesConfig,
            )

            assert (
                self.model_loader.model_kwargs["quantization_config"]._load_in_4bit
                is True
            )
        if self.cfg.adapter == "lora" and load_in_8bit:
            assert isinstance(
                self.model_loader.model_kwargs.get("quantization_config"),
                BitsAndBytesConfig,
            )

            assert (
                self.model_loader.model_kwargs["quantization_config"]._load_in_8bit
                is True
            )

    def test_message_property_mapping(self):
        """Test message property mapping configuration validation"""
        from axolotl.utils.schemas.datasets import SFTDataset

        # Test legacy fields are mapped orrectly
        dataset = SFTDataset(
            path="test_path",
            message_field_role="role_field",
            message_field_content="content_field",
        )
        assert dataset.message_property_mappings == {
            "role": "role_field",
            "content": "content_field",
        }

        # Test direct message_property_mapping works
        dataset = SFTDataset(
            path="test_path",
            message_property_mappings={
                "role": "custom_role",
                "content": "custom_content",
            },
        )
        assert dataset.message_property_mappings == {
            "role": "custom_role",
            "content": "custom_content",
        }

        # Test both legacy and new fields work when they match
        dataset = SFTDataset(
            path="test_path",
            message_field_role="same_role",
            message_property_mappings={"role": "same_role"},
        )
        assert dataset.message_property_mappings == {
            "role": "same_role",
            "content": "content",
        }

        # Test both legacy and new fields work when they don't overlap
        dataset = SFTDataset(
            path="test_path",
            message_field_role="role_field",
            message_property_mappings={"content": "content_field"},
        )
        assert dataset.message_property_mappings == {
            "role": "role_field",
            "content": "content_field",
        }

        # Test no role or content provided
        dataset = SFTDataset(
            path="test_path",
        )
        assert dataset.message_property_mappings == {
            "role": "role",
            "content": "content",
        }

        # Test error when legacy and new fields conflict
        with pytest.raises(ValueError) as exc_info:
            SFTDataset(
                path="test_path",
                message_field_role="legacy_role",
                message_property_mappings={"role": "different_role"},
            )
        assert "Conflicting message role fields" in str(exc_info.value)

        with pytest.raises(ValueError) as exc_info:
            SFTDataset(
                path="test_path",
                message_field_content="legacy_content",
                message_property_mappings={"content": "different_content"},
            )
        assert "Conflicting message content fields" in str(exc_info.value)

    @pytest.mark.parametrize(
        "world_size, tensor_parallel_size, context_parallel_size, dp_shard_size, dp_replicate_size, is_fsdp, expected",
        [
            (16, 2, 2, 2, 2, True, (2, 2, 2, 2)),
            (16, 1, 1, None, None, True, (0, 0, 16, 1)),
            (16, 2, 2, 2, None, True, (2, 2, 2, 2)),
            (16, 2, 2, None, 2, True, (2, 2, 2, 2)),
            (16, 1, 1, None, 2, True, (0, 0, 8, 2)),
            (2, 1, 1, None, None, True, (0, 0, 2, 1)),
        ],
    )
    def test_get_parallel_config_kwargs(
        self,
        world_size,
        tensor_parallel_size,
        context_parallel_size,
        dp_shard_size,
        dp_replicate_size,
        is_fsdp,
        expected,
    ):
        res = _get_parallel_config_kwargs(
            world_size,
            tensor_parallel_size,
            context_parallel_size,
            dp_shard_size,
            dp_replicate_size,
            is_fsdp,
        )

        if expected[0] > 1:
            assert res["tp_size"] == expected[0]
        if expected[1] > 1:
            assert res["cp_size"] == expected[1]
        if expected[2] > 1:
            assert res["dp_shard_size"] == expected[2]
        if expected[3] > 1:
            assert res["dp_replicate_size"] == expected[3]


================================================
FILE: tests/test_logging_config_file_capture.py
================================================
import logging
import tempfile

import pytest


def read(path: str) -> str:
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


@pytest.fixture(autouse=True)
def _reset_logging_state():
    # Ensure a clean slate for logging between tests
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.shutdown()
    # Note: dictConfig in configure_logging will set up handlers again
    yield
    for handler in logging.root.handlers[:]:
        logging.root.removeHandler(handler)
    logging.shutdown()


def test_axolotl_logs_captured_at_all_levels(monkeypatch):
    from axolotl.logging_config import configure_logging
    from axolotl.utils import tee
    from axolotl.utils.logging import get_logger

    with tempfile.TemporaryDirectory() as td:
        # Avoid stdout tee in this test to simplify interaction with pytest capture
        monkeypatch.setenv("AXOLOTL_TEE_STDOUT", "0")
        configure_logging()
        path = tee.prepare_debug_log(
            type("Cfg", (), {"output_dir": td, "get": lambda *_: False})
        )

        log = get_logger("axolotl.test")
        log.info("AX-INFO")
        log.debug("AX-DEBUG")
        tee.file_only_stream.flush()

        data = read(path)
        assert "AX-INFO" in data
        assert "AX-DEBUG" in data
        tee.close_debug_log()


def test_third_party_logs_filtered_and_warning_captured(monkeypatch):
    from axolotl.logging_config import configure_logging
    from axolotl.utils import tee

    with tempfile.TemporaryDirectory() as td:
        monkeypatch.setenv("AXOLOTL_TEE_STDOUT", "0")
        configure_logging()
        path = tee.prepare_debug_log(
            type("Cfg", (), {"output_dir": td, "get": lambda *_: False})
        )

        # Third-party logger (non-axolotl)
        other = logging.getLogger("thirdparty.lib")
        other.info("TP-INFO")
        other.warning("TP-WARN")

        # Simulate Python warnings routed through logging
        logging.getLogger("py.warnings").warning("PY-WARN")

        # Push through buffers
        tee.file_only_stream.flush()

        data = read(path)
        # INFO from non-axolotl should be filtered out (not present)
        assert "TP-INFO" not in data
        # WARNING+ should be present
        assert "TP-WARN" in data
        # Python warnings captured (via py.warnings logger)
        assert "PY-WARN" in data
        tee.close_debug_log()
        tee.close_debug_log()


def test_prepare_debug_log_idempotent_and_no_duplicate(monkeypatch):
    from axolotl.logging_config import configure_logging
    from axolotl.utils import tee
    from axolotl.utils.logging import get_logger

    with tempfile.TemporaryDirectory() as td:
        monkeypatch.setenv("AXOLOTL_TEE_STDOUT", "0")
        configure_logging()
        cfg = type("Cfg", (), {"output_dir": td, "get": lambda *_: False})
        p1 = tee.prepare_debug_log(cfg)
        p2 = tee.prepare_debug_log(cfg)
        assert p1 == p2

        log = get_logger("axolotl.test")
        marker = "UNIQUE-MARKER-12345"
        log.info(marker)
        tee.file_only_stream.flush()

        data = read(p1)
        # Ensure the marker appears once (not duplicated via propagation)
        assert data.count(marker) == 1
        tee.close_debug_log()


================================================
FILE: tests/test_lora.py
================================================
"""
tests for loading loras
"""

from axolotl.loaders import ModelLoader, load_tokenizer
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

minimal_config = DictDefault(
    {
        "base_model": "HuggingFaceTB/SmolLM2-135M",
        "learning_rate": 0.000001,
        "datasets": [
            {
                "path": "mhenrichsen/alpaca_2k_test",
                "type": "alpaca",
            }
        ],
        "micro_batch_size": 1,
        "gradient_accumulation_steps": 1,
    }
)


class TestLoRALoad:
    """
    Test class for loading LoRA weights
    """

    def test_load_lora_weights(self):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.0,
                "lora_target_linear": True,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "sequence_len": 1024,
            }
            | minimal_config
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        tokenizer = load_tokenizer(cfg)
        ModelLoader(cfg, tokenizer).load()

    def test_load_lora_weights_empty_dropout(self):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": None,
                "lora_target_linear": True,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "sequence_len": 1024,
            }
            | minimal_config
        )
        cfg = validate_config(cfg)
        normalize_config(cfg)
        assert cfg.lora_dropout == 0.0
        tokenizer = load_tokenizer(cfg)
        ModelLoader(cfg, tokenizer).load()


================================================
FILE: tests/test_normalize_config.py
================================================
"""
Test classes for checking functionality of the cfg normalization
"""

import unittest
from unittest.mock import patch

from axolotl.utils.config import (
    normalize_cfg_datasets,
    normalize_config,
    validate_config,
)
from axolotl.utils.dict import DictDefault


class NormalizeConfigTestCase(unittest.TestCase):
    """
    test class for normalize_config checks
    """

    def _get_base_cfg(self):
        return DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "base_model_config": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "num_epochs": 1,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "learning_rate": 0.0001,
            }
        )

    def test_base_model_config_set_when_empty(self):
        cfg = self._get_base_cfg()
        del cfg.base_model_config
        normalize_config(cfg)

        assert cfg.base_model_config == cfg.base_model

    def test_chat_template_chatml(self):
        cfg = DictDefault(
            {
                "chat_template": "chatml",
                "datasets": [
                    {
                        "path": "lorem/ipsum",
                        "type": "chat_template",
                        "chat_template": "gemma",
                    },
                    {
                        "path": "sit/amet",
                        "type": "chat_template",
                    },
                ],
            }
        )

        normalize_cfg_datasets(cfg)

        assert cfg.datasets[0].chat_template == "gemma"
        assert cfg.datasets[1].chat_template == "chatml"

    @patch("axolotl.utils.config.is_torch_bf16_gpu_available")
    def test_bf16_auto_setter_available(self, mock_bf16_avail):
        cfg = self._get_base_cfg()
        cfg.bf16 = "auto"
        mock_bf16_avail.return_value = True

        normalize_config(cfg)

        self.assertTrue(cfg.bf16)
        self.assertFalse(cfg.fp16)

    @patch("axolotl.utils.config.is_torch_bf16_gpu_available")
    def test_bf16_auto_setter_not_available(self, mock_bf16_avail):
        cfg = self._get_base_cfg()
        cfg.bf16 = "auto"
        cfg.fp16 = None
        mock_bf16_avail.return_value = False

        normalize_config(cfg)

        self.assertFalse(cfg.bf16)
        self.assertTrue(cfg.fp16)

    @patch("axolotl.utils.config.is_torch_bf16_gpu_available")
    def test_bf16_disables_fp16(self, mock_bf16_avail):
        cfg = self._get_base_cfg()
        cfg.bf16 = True
        cfg.fp16 = False
        mock_bf16_avail.return_value = True

        normalize_config(cfg)

        self.assertTrue(cfg.bf16)
        self.assertFalse(cfg.fp16)

    def test_migrate_fsdp_config(self):
        """Test basic FSDP config migration with and without fsdp_version"""
        cfg_with_version = self._get_base_cfg() | DictDefault(
            {
                "fsdp_config": {
                    "fsdp_version": 2,
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                    "fsdp_offload_params": False,
                    "fsdp_cpu_ram_efficient_loading": True,
                }
            }
        )

        cfg_with_version = validate_config(cfg_with_version)

        self.assertEqual(cfg_with_version.fsdp_version, 2)
        self.assertEqual(
            cfg_with_version.fsdp_config.auto_wrap_policy, "TRANSFORMER_BASED_WRAP"
        )
        self.assertEqual(cfg_with_version.fsdp_config.offload_params, False)
        self.assertEqual(cfg_with_version.fsdp_config.cpu_ram_efficient_loading, True)

        self.assertNotIn("fsdp_auto_wrap_policy", cfg_with_version.fsdp_config)
        self.assertNotIn("fsdp_offload_params", cfg_with_version.fsdp_config)
        self.assertNotIn("fsdp_cpu_ram_efficient_loading", cfg_with_version.fsdp_config)
        self.assertIn("fsdp_version", cfg_with_version.fsdp_config)

        cfg_without_version = self._get_base_cfg() | DictDefault(
            {
                "fsdp_config": {
                    "fsdp_auto_wrap_policy": "SIZE_BASED_WRAP",
                    "fsdp_offload_params": True,
                }
            }
        )

        cfg_without_version = validate_config(cfg_without_version)

        self.assertNotIn("fsdp_version", cfg_without_version)
        self.assertEqual(
            cfg_without_version.fsdp_config.auto_wrap_policy, "SIZE_BASED_WRAP"
        )
        self.assertEqual(cfg_without_version.fsdp_config.offload_params, True)

        self.assertNotIn("fsdp_auto_wrap_policy", cfg_without_version.fsdp_config)
        self.assertNotIn("fsdp_offload_params", cfg_without_version.fsdp_config)

    def test_migrate_fsdp_config_no_fsdp_config(self):
        """Test that function doesn't crash when no fsdp_config is present"""
        cfg = self._get_base_cfg()

        cfg = validate_config(cfg)

        self.assertNotIn("fsdp_config", cfg)
        self.assertNotIn("fsdp_version", cfg)

    def test_migrate_fsdp_config_empty_fsdp_config(self):
        """Test migration with empty fsdp_config"""
        cfg = self._get_base_cfg() | DictDefault({"fsdp_config": {}})

        cfg = validate_config(cfg)

        self.assertNotIn("fsdp_version", cfg)
        self.assertEqual(cfg.fsdp_config, {})

    def test_migrate_fsdp_config_mixed_keys(self):
        """Test migration with a mix of fsdp_ and non-fsdp_ keys"""
        cfg = self._get_base_cfg() | DictDefault(
            {
                "fsdp_config": {
                    "fsdp_version": 1,
                    "fsdp_state_dict_type": "FULL_STATE_DICT",
                    "mixed_precision_policy": "fp16",
                    "activation_checkpointing": True,
                    "fsdp_reshard_after_forward": False,
                }
            }
        )

        cfg = validate_config(cfg)

        self.assertEqual(cfg.fsdp_version, 1)
        self.assertEqual(cfg.fsdp_config.state_dict_type, "FULL_STATE_DICT")
        self.assertEqual(cfg.fsdp_config.reshard_after_forward, False)
        self.assertEqual(cfg.fsdp_config.mixed_precision_policy, "fp16")
        self.assertEqual(cfg.fsdp_config.activation_checkpointing, True)

        # Check original fsdp_ keys are removed
        self.assertNotIn("fsdp_state_dict_type", cfg.fsdp_config)
        self.assertNotIn("fsdp_reshard_after_forward", cfg.fsdp_config)

        self.assertIn("fsdp_version", cfg.fsdp_config)


================================================
FILE: tests/test_opentelemetry_callback.py
================================================
"""Tests for OpenTelemetry metrics callback functionality."""

import time

import pytest

from axolotl.utils.dict import DictDefault


@pytest.fixture
def mock_otel_config():
    """Mock configuration for OpenTelemetry callback."""
    return DictDefault(
        {
            "use_otel_metrics": True,
            "otel_metrics_host": "localhost",
            "otel_metrics_port": 8003,  # Use unique port for tests
        }
    )


@pytest.fixture
def mock_trainer_state():
    """Mock trainer state for callback testing."""
    from transformers import TrainerState

    state = TrainerState()
    state.epoch = 1.0
    state.global_step = 100
    return state


@pytest.fixture
def mock_training_args():
    """Mock training arguments for callback testing."""
    from transformers import TrainingArguments

    return TrainingArguments(output_dir="/tmp/test")


@pytest.fixture
def mock_trainer_control():
    """Mock trainer control for callback testing."""
    from transformers.trainer_callback import TrainerControl

    return TrainerControl()


class TestOpenTelemetryConfig:
    """Test OpenTelemetry configuration schema."""

    def test_config_schema_valid(self):
        """Test OpenTelemetry configuration schema validation."""
        from axolotl.utils.schemas.integrations import OpenTelemetryConfig

        # Test valid config
        valid_config = {
            "use_otel_metrics": True,
            "otel_metrics_host": "localhost",
            "otel_metrics_port": 8000,
        }

        otel_config = OpenTelemetryConfig(**valid_config)
        assert otel_config.use_otel_metrics is True
        assert otel_config.otel_metrics_host == "localhost"
        assert otel_config.otel_metrics_port == 8000

    def test_config_defaults(self):
        """Test OpenTelemetry configuration default values."""
        from axolotl.utils.schemas.integrations import OpenTelemetryConfig

        # Test minimal config with defaults
        minimal_config = {"use_otel_metrics": True}

        otel_config = OpenTelemetryConfig(**minimal_config)
        assert otel_config.use_otel_metrics is True
        assert otel_config.otel_metrics_host == "localhost"  # default
        assert otel_config.otel_metrics_port == 8000  # default

    def test_config_disabled_by_default(self):
        """Test that OpenTelemetry is disabled by default."""
        from axolotl.utils.schemas.integrations import OpenTelemetryConfig

        # Test default config
        default_config = OpenTelemetryConfig()
        assert default_config.use_otel_metrics is False


class TestOpenTelemetryCallback:
    """Test OpenTelemetry callback functionality."""

    def test_callback_import(self):
        """Test that OpenTelemetry callback can be imported."""
        from axolotl.utils.callbacks.opentelemetry import OpenTelemetryMetricsCallback

        assert OpenTelemetryMetricsCallback is not None

    def test_callback_graceful_fallback(self, mock_otel_config):
        """Test callback gracefully handles missing dependencies."""
        from axolotl.utils.callbacks.opentelemetry import OpenTelemetryMetricsCallback

        # This should not raise an exception even if dependencies are missing
        callback = OpenTelemetryMetricsCallback(mock_otel_config)

        # Callback should exist but may have metrics disabled
        assert callback is not None
        assert hasattr(callback, "metrics_enabled")

    def test_callback_initialization_enabled(self, mock_otel_config):
        """Test callback initialization when OpenTelemetry is available."""
        from axolotl.utils.callbacks.opentelemetry import (
            OPENTELEMETRY_AVAILABLE,
            OpenTelemetryMetricsCallback,
        )

        callback = OpenTelemetryMetricsCallback(mock_otel_config)

        if OPENTELEMETRY_AVAILABLE:
            assert callback.metrics_enabled is True
            assert callback.cfg == mock_otel_config
            assert callback.metrics_host == "localhost"
            assert callback.metrics_port == 8003
        else:
            assert callback.metrics_enabled is False

    def test_metrics_server_lifecycle(
        self,
        mock_otel_config,
        mock_trainer_state,
        mock_training_args,
        mock_trainer_control,
    ):
        """Test metrics server starts and stops correctly."""
        from axolotl.utils.callbacks.opentelemetry import (
            OPENTELEMETRY_AVAILABLE,
            OpenTelemetryMetricsCallback,
        )

        if not OPENTELEMETRY_AVAILABLE:
            pytest.skip("OpenTelemetry dependencies not available")

        callback = OpenTelemetryMetricsCallback(mock_otel_config)

        # Start server
        callback.on_train_begin(
            mock_training_args, mock_trainer_state, mock_trainer_control
        )
        assert callback.server_started is True

        # End training
        callback.on_train_end(
            mock_training_args, mock_trainer_state, mock_trainer_control
        )

    def test_metrics_recording(
        self,
        mock_otel_config,
        mock_trainer_state,
        mock_training_args,
        mock_trainer_control,
    ):
        """Test that metrics are recorded during training."""
        from axolotl.utils.callbacks.opentelemetry import (
            OPENTELEMETRY_AVAILABLE,
            OpenTelemetryMetricsCallback,
        )

        if not OPENTELEMETRY_AVAILABLE:
            pytest.skip("OpenTelemetry dependencies not available")

        callback = OpenTelemetryMetricsCallback(mock_otel_config)
        callback.on_train_begin(
            mock_training_args, mock_trainer_state, mock_trainer_control
        )

        # Test logging metrics
        test_logs = {
            "loss": 0.5,
            "learning_rate": 1e-4,
            "grad_norm": 0.8,
        }

        # This should not raise an exception
        callback.on_log(
            mock_training_args, mock_trainer_state, mock_trainer_control, logs=test_logs
        )
        assert callback.metrics_enabled is True

    def test_evaluation_metrics(
        self,
        mock_otel_config,
        mock_trainer_state,
        mock_training_args,
        mock_trainer_control,
    ):
        """Test evaluation metrics recording."""
        from axolotl.utils.callbacks.opentelemetry import (
            OPENTELEMETRY_AVAILABLE,
            OpenTelemetryMetricsCallback,
        )

        if not OPENTELEMETRY_AVAILABLE:
            pytest.skip("OpenTelemetry dependencies not available")

        callback = OpenTelemetryMetricsCallback(mock_otel_config)
        callback.on_train_begin(
            mock_training_args, mock_trainer_state, mock_trainer_control
        )

        # Test evaluation metrics
        eval_logs = {
            "eval_loss": 0.3,
            "eval_accuracy": 0.95,
        }

        # This should not raise an exception
        callback.on_evaluate(
            mock_training_args, mock_trainer_state, mock_trainer_control, eval_logs
        )
        assert callback.metrics_enabled is True

    def test_thread_safety(self, mock_otel_config):
        """Test that callback has thread safety mechanisms."""
        from axolotl.utils.callbacks.opentelemetry import (
            OPENTELEMETRY_AVAILABLE,
            OpenTelemetryMetricsCallback,
        )

        if not OPENTELEMETRY_AVAILABLE:
            pytest.skip("OpenTelemetry dependencies not available")

        callback = OpenTelemetryMetricsCallback(mock_otel_config)
        assert hasattr(callback, "metrics_lock")
        # Check it's a lock-like object
        assert hasattr(callback.metrics_lock, "__enter__")
        assert hasattr(callback.metrics_lock, "__exit__")


class TestOpenTelemetryIntegration:
    """Integration tests for OpenTelemetry."""

    def test_availability_check(self):
        """Test availability check function."""
        from axolotl.utils import is_opentelemetry_available

        result = is_opentelemetry_available()
        assert isinstance(result, bool)

    def test_prometheus_endpoint_basic(
        self,
        mock_otel_config,
        mock_trainer_state,
        mock_training_args,
        mock_trainer_control,
    ):
        """Test basic Prometheus endpoint functionality."""
        from axolotl.utils.callbacks.opentelemetry import (
            OPENTELEMETRY_AVAILABLE,
            OpenTelemetryMetricsCallback,
        )

        if not OPENTELEMETRY_AVAILABLE:
            pytest.skip("OpenTelemetry dependencies not available")

        try:
            import requests
        except ImportError:
            pytest.skip("requests library not available")

        callback = OpenTelemetryMetricsCallback(mock_otel_config)
        callback.on_train_begin(
            mock_training_args, mock_trainer_state, mock_trainer_control
        )

        if not callback.server_started:
            pytest.skip("Metrics server failed to start")

        # Give server time to start
        time.sleep(1)

        # Try to access metrics endpoint
        try:
            response = requests.get(
                f"http://{callback.metrics_host}:{callback.metrics_port}/metrics",
                timeout=2,
            )
            assert response.status_code == 200
            # Check for Prometheus format
            assert "# TYPE" in response.text or "# HELP" in response.text
        except requests.exceptions.RequestException:
            pytest.skip(
                "Could not connect to metrics endpoint - this is expected in some environments"
            )


class TestOpenTelemetryCallbackMethods:
    """Test specific callback methods."""

    def test_step_end_callback(
        self,
        mock_otel_config,
        mock_trainer_state,
        mock_training_args,
        mock_trainer_control,
    ):
        """Test step end callback method."""
        from axolotl.utils.callbacks.opentelemetry import (
            OPENTELEMETRY_AVAILABLE,
            OpenTelemetryMetricsCallback,
        )

        if not OPENTELEMETRY_AVAILABLE:
            pytest.skip("OpenTelemetry dependencies not available")

        callback = OpenTelemetryMetricsCallback(mock_otel_config)
        callback.on_train_begin(
            mock_training_args, mock_trainer_state, mock_trainer_control
        )

        # Should not raise an exception
        callback.on_step_end(
            mock_training_args, mock_trainer_state, mock_trainer_control
        )

    def test_epoch_end_callback(
        self,
        mock_otel_config,
        mock_trainer_state,
        mock_training_args,
        mock_trainer_control,
    ):
        """Test epoch end callback method."""
        from axolotl.utils.callbacks.opentelemetry import (
            OPENTELEMETRY_AVAILABLE,
            OpenTelemetryMetricsCallback,
        )

        if not OPENTELEMETRY_AVAILABLE:
            pytest.skip("OpenTelemetry dependencies not available")

        callback = OpenTelemetryMetricsCallback(mock_otel_config)
        callback.on_train_begin(
            mock_training_args, mock_trainer_state, mock_trainer_control
        )

        # Should not raise an exception
        callback.on_epoch_end(
            mock_training_args, mock_trainer_state, mock_trainer_control
        )


================================================
FILE: tests/test_packed_batch_sampler.py
================================================
"""Module for testing streaming dataset sequence packing"""

import pytest
from datasets import concatenate_datasets
from torch.utils.data import DataLoader, RandomSampler
from transformers import AutoTokenizer

from axolotl.datasets import TokenizedPromptDataset
from axolotl.prompt_strategies.completion import load
from axolotl.utils.collators import V2BatchSamplerDataCollatorForSeq2Seq
from axolotl.utils.data.utils import handle_long_seq_in_dataset
from axolotl.utils.dict import DictDefault
from axolotl.utils.samplers import MultipackBatchSampler, get_dataset_lengths

from tests.hf_offline_utils import enable_hf_offline


@pytest.fixture(name="tokenizer")
def fixture_tokenizer():
    tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
    tokenizer.pad_token = "</s>"
    return tokenizer


class TestBatchedSamplerPacking:
    """
    Test class for packing streaming dataset sequences
    """

    @pytest.mark.parametrize(
        "batch_size, num_workers",
        [
            (1, 0),
            (2, 0),
            (1, 2),
            (2, 2),
        ],
    )
    @pytest.mark.parametrize("max_seq_length", [4096, 512])
    @pytest.mark.parametrize("sequential", [True, False])
    @enable_hf_offline
    def test_packing(
        self,
        dataset_winglian_tiny_shakespeare,
        batch_size,
        num_workers,
        tokenizer,
        max_seq_length,
        sequential,
    ):
        from axolotl.monkeypatch.data.batch_dataset_fetcher import (
            apply_multipack_dataloader_patch,
            remove_multipack_dataloader_patch,
        )

        # Apply the patch for multipack handling
        apply_multipack_dataloader_patch()

        dataset = dataset_winglian_tiny_shakespeare["train"]

        cfg = DictDefault(
            {
                "train_on_inputs": True,
                "sequence_len": max_seq_length,
            }
        )
        ds_cfg = DictDefault(
            {
                "field": "text",
            }
        )
        completion_strategy = load(tokenizer, cfg, ds_cfg)
        dataset_wrapper = TokenizedPromptDataset(
            completion_strategy,
            dataset,
        )
        train_dataset = concatenate_datasets([dataset_wrapper])

        train_dataset = handle_long_seq_in_dataset(train_dataset, cfg.sequence_len, cfg)

        lengths = get_dataset_lengths(train_dataset)
        batch_sampler = MultipackBatchSampler(
            sampler=RandomSampler(train_dataset),
            lengths=lengths,
            batch_size=batch_size,
            batch_max_len=max_seq_length,
            group_size=100000,
            bin_size=200,
            sequential=sequential,
            drop_last=False,
        )

        loader = DataLoader(
            train_dataset,
            batch_sampler=batch_sampler,
            collate_fn=V2BatchSamplerDataCollatorForSeq2Seq(
                tokenizer=tokenizer,
                padding=True,
                pad_to_multiple_of=max_seq_length,
                return_tensors="pt",
            ),
            num_workers=num_workers,
        )

        batch_idxs = []
        for batch in batch_sampler:
            for pack in batch:
                batch_idxs.extend(pack)

        try:
            for batch in loader:
                assert batch["input_ids"].numel() <= batch_size * max_seq_length
                assert batch["input_ids"].shape[1] == max_seq_length

            original_idxs = set(range(len(train_dataset)))
            assert original_idxs == set(batch_idxs)
            assert len(batch_idxs) == len(set(batch_idxs))
        finally:
            # Clean up: remove the patch after the test
            remove_multipack_dataloader_patch()


================================================
FILE: tests/test_packed_dataset.py
================================================
"""Module for testing dataset sequence packing"""

import unittest

from transformers import AutoTokenizer

from axolotl.cli.args import TrainerCliArgs
from axolotl.common.datasets import load_datasets
from axolotl.train import setup_model_and_trainer
from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault

from tests.e2e.utils import with_temp_dir
from tests.hf_offline_utils import enable_hf_offline


class TestPacking(unittest.TestCase):
    """
    Test class for packing dataset sequences
    """

    @enable_hf_offline
    def setUp(self) -> None:
        self.tokenizer = AutoTokenizer.from_pretrained("huggyllama/llama-7b")
        self.tokenizer.add_special_tokens(
            {
                "bos_token": "<s>",
                "eos_token": "</s>",
                "unk_token": "<unk>",
            }
        )

    @with_temp_dir
    def test_lora_packing(self, temp_dir):
        cfg = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "tokenizer_type": "AutoTokenizer",
                "sequence_len": 1024,
                "sample_packing": True,
                "multipack_real_batches": False,
                "eval_sample_packing": True,
                "adapter": "lora",
                "lora_r": 32,
                "lora_alpha": 64,
                "lora_dropout": 0.05,
                "lora_target_linear": True,
                "val_set_size": 0.2,
                "special_tokens": {
                    "pad_token": "<|endoftext|>",
                },
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    },
                ],
                "dataset_num_proc": 4,
                "num_epochs": 1,
                "max_steps": 20,
                "save_steps": 10,
                "micro_batch_size": 8,
                "gradient_accumulation_steps": 1,
                "output_dir": temp_dir,
                "learning_rate": 0.00001,
                "optimizer": "adamw_torch_fused",
                "lr_scheduler": "cosine",
                "fp16": False,
                "bf16": False,
            }
        )

        cfg = validate_config(cfg)
        normalize_config(cfg)
        cli_args = TrainerCliArgs()
        dataset_meta = load_datasets(cfg=cfg, cli_args=cli_args)

        (
            trainer,
            _,
            _,
            _,
            _,
        ) = setup_model_and_trainer(cfg, dataset_meta)

        sampler = trainer._get_eval_sampler(trainer.eval_dataset)
        assert "MultipackBatchSampler" in sampler.__class__.__name__
        assert (
            "V2BatchSamplerDataCollatorForSeq2Seq"
            in trainer.eval_data_collator.__class__.__name__
        )
        dataloader = trainer.get_eval_dataloader(trainer.eval_dataset)
        dataloader_iter = iter(dataloader)
        batch = next(dataloader_iter)
        assert batch["input_ids"].shape == (1, 8192)

        sampler = trainer._get_train_sampler(trainer.train_dataset)
        assert "MultipackBatchSampler" in sampler.__class__.__name__
        assert (
            "V2BatchSamplerDataCollatorForSeq2Seq"
            in trainer.train_data_collator.__class__.__name__
        )
        dataloader = trainer.get_train_dataloader()
        dataloader_iter = iter(dataloader)
        batch = next(dataloader_iter)
        assert batch["input_ids"].shape == (1, 8192)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/test_packed_pretraining.py
================================================
"""Module for testing streaming dataset sequence packing"""

import functools
import random
import string

import pytest
import torch
from datasets import IterableDataset
from torch.utils.data import DataLoader

from axolotl.utils.data import get_dataset_wrapper, wrap_streaming_dataset
from axolotl.utils.dict import DictDefault


class TestPretrainingPacking:
    """
    Test class for packing streaming dataset sequences
    """

    @pytest.fixture
    def random_text(self):
        # seed with random.seed(0) for reproducibility
        random.seed(0)

        # generate row of random text with "words" of between 2 and 10 characters and
        # between 400 to 1200 characters per line
        def rand_txt():
            return " ".join(
                [
                    "".join(
                        random.choices(string.ascii_lowercase, k=random.randint(2, 10))
                    )
                    for _ in range(random.randint(50, 200))
                ]
            )

        # Create a list of 2000 random texts rather than just using it within the
        # generator so the test runs faster
        data = [rand_txt() for _ in range(500)]

        # Create an IterableDataset
        def generator():
            for row in data:
                yield {"text": row}

        return IterableDataset.from_generator(generator)

    @pytest.mark.flaky(retries=1, delay=5)
    def test_packing_stream_dataset(self, tokenizer_huggyllama, random_text):
        dataset = random_text

        cfg = DictDefault(
            {
                "pretraining_dataset": [
                    {
                        "path": "winglian/tiny-shakespeare",
                        "type": "pretrain",
                    }
                ],
                "sample_packing": True,
                "pretrain_multipack_attn": True,
                "pad_to_sequence_len": True,
                "sequence_len": 2048,
                "micro_batch_size": 2,
                "sample_packing_group_size": 100000,
                "sample_packing_bin_size": 200,
            }
        )

        ds_wrapper_partial = functools.partial(
            get_dataset_wrapper,
            cfg.pretraining_dataset[0],
            tokenizer_huggyllama,
            cfg,
            cfg.pretraining_dataset[0]["type"] or "pretrain",
        )

        original_bsz = cfg.micro_batch_size
        train_dataset = wrap_streaming_dataset(
            dataset,
            tokenizer_huggyllama,
            cfg,
            ds_wrapper_partial,
        )

        trainer_loader = DataLoader(
            train_dataset,
            batch_size=1,
            collate_fn=None,
            drop_last=True,
        )
        idx = 0
        for data in trainer_loader:
            if idx > 3:
                break
            assert data["input_ids"].shape == torch.Size(
                [1, original_bsz * cfg.sequence_len]
            )
            assert data["position_ids"].shape == torch.Size(
                [1, original_bsz * cfg.sequence_len]
            )
            assert data["labels"].shape == torch.Size(
                [1, original_bsz * cfg.sequence_len]
            )
            assert "attention_mask" not in data
            # FIXME add back once we fix packing unpad/pad with attention mask
            # assert data["attention_mask"].shape == torch.Size(
            #     [1, original_bsz * cfg.sequence_len]
            # )
            idx += 1


================================================
FILE: tests/test_perplexity.py
================================================
"""unit tests for perplexity eval callback"""

from pytest import fixture
from transformers.models.auto.modeling_auto import AutoModelForCausalLM
from transformers.models.auto.tokenization_auto import AutoTokenizer

from axolotl.utils.callbacks.perplexity import Perplexity

MODEL_NAME = "HuggingFaceTB/SmolLM2-135M"


@fixture()
def metric(tokenizer):
    return Perplexity(tokenizer=tokenizer, max_seq_len=512)


@fixture()
def model():
    return AutoModelForCausalLM.from_pretrained(
        MODEL_NAME, trust_remote_code=True, dtype="float32"
    )


@fixture()
def tokenizer():
    tokenizer_ = AutoTokenizer.from_pretrained(MODEL_NAME, trust_remote_code=True)
    tokenizer_.add_special_tokens({"pad_token": "<|endoftext|>"})
    return tokenizer_


def test_perplexity_longer_than_stride(model, metric):
    # taken from https://huggingface.co/datasets/roneneldan/TinyStories
    sample_text = """
Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun. Beep was a healthy car because he always had good fuel. Good fuel made Beep happy and strong. One day, Beep was driving in the park when he saw a big tree. The tree had many leaves that were falling. Beep liked how the leaves fall and wanted to play with them. Beep drove under the tree and watched the leaves fall on him. He laughed and beeped his horn. Beep played with the falling leaves all day. When it was time to go home, Beep knew he needed more fuel. He went to the fuel place and got more healthy fuel. Now, Beep was ready to go fast and play again the next day. And Beep lived happily ever after.
One day, a little fish named Fin was swimming near the shore. He saw a big crab and wanted to be friends. "Hi, I am Fin. Do you want to play?" asked the little fish. The crab looked at Fin and said, "No, I don't want to play. I am cold and I don't feel fine." Fin felt sad but wanted to help the crab feel better. He swam away and thought of a plan. He remembered that the sun could make things warm. So, Fin swam to the top of the water and called to the sun, "Please, sun, help my new friend feel fine and not freeze!" The sun heard Fin's call and shone its warm light on the shore. The crab started to feel better and not so cold. He saw Fin and said, "Thank you, little fish, for making me feel fine. I don't feel like I will freeze now. Let's play together!" And so, Fin and the crab played and became good friends.
"""
    result = metric.compute(model, [sample_text])
    ppl = result["score"]
    assert round(ppl, 2) == 7.41


def test_perplexity_short(model, metric):
    # taken from https://huggingface.co/datasets/roneneldan/TinyStories
    sample_text = "Once upon a time, there was a little car named Beep. Beep loved to go fast and play in the sun."
    result = metric.compute(model, [sample_text])
    ppl = result["score"]
    assert round(ppl, 2) == 10.33


================================================
FILE: tests/test_prompt_tokenizers.py
================================================
"""Module for testing prompt tokenizers."""

import json
from pathlib import Path

from axolotl.prompt_strategies.alpaca_chat import NoSystemPrompter
from axolotl.prompt_strategies.alpaca_w_system import (
    InstructionWSystemPromptTokenizingStrategy,
    SystemDataPrompter,
)
from axolotl.prompt_strategies.llama2_chat import (
    Llama2ChatPrompter,
    LLama2ChatTokenizingStrategy,
)
from axolotl.prompt_strategies.orpo.chat_template import load
from axolotl.prompt_tokenizers import AlpacaPromptTokenizingStrategy
from axolotl.prompters import AlpacaPrompter, PromptStyle
from axolotl.utils.dict import DictDefault

from tests.hf_offline_utils import enable_hf_offline

test_data = {
    "multi_turn_sys": {
        "conversations": [
            {"from": "system", "value": "lorem"},
            {"from": "human", "value": "abc"},
            {"from": "gpt", "value": "ipsum"},
            {"from": "human", "value": "123"},
            {"from": "gpt", "value": "sit"},
        ]
    },
    "single_turn_sys": {
        "conversations": [
            {"from": "system", "value": "lorem"},
            {"from": "human", "value": "abc"},
            {"from": "gpt", "value": "ipsum"},
        ]
    },
    "single_turn_no_sys": {
        "conversations": [
            {"from": "human", "value": "abc"},
            {"from": "gpt", "value": "ipsum"},
        ]
    },
    "multi_turn_no_sys": {
        "conversations": [
            {"from": "human", "value": "abc"},
            {"from": "gpt", "value": "ipsum"},
            {"from": "human", "value": "123"},
            {"from": "gpt", "value": "sit"},
        ]
    },
}


class TestPromptTokenizationStrategies:
    """
    Test class for prompt tokenization strategies.
    """

    @enable_hf_offline
    def test_no_sys_prompt(self, tokenizer_huggyllama_w_special_tokens):
        """
        tests the interface between the user and assistant parts
        """
        prompter = NoSystemPrompter()

        strat = AlpacaPromptTokenizingStrategy(
            prompter,
            tokenizer_huggyllama_w_special_tokens,
            False,
            2048,
        )
        sample = {
            "instruction": "hello cruel. lorem ipsum dolor sit amet.",
            "output": "world!",
        }
        example = strat.tokenize_prompt(sample)
        world_idx = example["input_ids"].index(3186)
        assert example["labels"][world_idx] == 3186
        assert example["labels"][world_idx - 1] == -100

    @enable_hf_offline
    def test_alpaca(self, tokenizer_huggyllama_w_special_tokens):
        """
        tests the interface between the user and assistant parts
        """

        prompter = AlpacaPrompter()
        strat = AlpacaPromptTokenizingStrategy(
            prompter,
            tokenizer_huggyllama_w_special_tokens,
            False,
            2048,
        )
        sample = {"instruction": "hello!", "output": "Hi! How can I help?"}
        example = strat.tokenize_prompt(sample)
        world_idx = example["input_ids"].index(6324)
        assert example["labels"][world_idx] == 6324
        assert example["labels"][world_idx - 1] == -100


class TestInstructionWSystemPromptTokenizingStrategy:
    """
    Test class for prompt tokenization strategies with sys prompt from the dataset
    """

    @enable_hf_offline
    def test_system_alpaca(self, tokenizer_huggyllama_w_special_tokens):
        prompter = SystemDataPrompter(PromptStyle.CHAT.value)
        strat = InstructionWSystemPromptTokenizingStrategy(
            prompter,
            tokenizer_huggyllama_w_special_tokens,
            False,
            2048,
        )
        sample = {
            "system": "use cot",
            "instruction": "hello!",
            "output": "Hi! How can I help?",
        }
        example = strat.tokenize_prompt(sample)
        assert example["input_ids"][0:5] == [
            1,
            28962,
            1254,
            12665,
            29901,
        ]  # "<s>SYSTEM:"
        assert example["input_ids"][5:7] == [671, 20118]  # " use cot"
        assert example["input_ids"][8] == 11889  # USER


class Llama2ChatTokenizationTest:
    """
    Test class for prompt tokenization strategies with sys prompt from the dataset
    """

    @enable_hf_offline
    def test_llama2_chat_integration(self, tokenizer_llama2_7b):
        with open(
            Path(__file__).parent / "fixtures/conversation.json", encoding="utf-8"
        ) as fin:
            data = fin.read()
            conversation = json.loads(data)
        with open(
            Path(__file__).parent / "fixtures/conversation.tokenized_llama2chat.json",
            encoding="utf-8",
        ) as fin:
            data = fin.read()
            tokenized_conversation = json.loads(data)
        prompter = Llama2ChatPrompter()
        strat = LLama2ChatTokenizingStrategy(
            prompter,
            tokenizer_llama2_7b,
            False,
            4096,
        )
        example = strat.tokenize_prompt(conversation)
        for fields in ["input_ids", "attention_mask", "labels"]:
            # pytest assert equals

            assert len(example[fields]) == len(tokenized_conversation[fields])
            assert example[fields] == tokenized_conversation[fields]

    def compare_with_transformers_integration(self, tokenizer_llama2_7b):
        # this needs transformers >= v4.31.0
        from transformers.models.llama.tokenization_llama import B_SYS, E_SYS
        from transformers.pipelines.conversational import Conversation

        # from transformers.models.llama.tokenization_llama import DEFAULT_SYSTEM_PROMPT
        # broken as of 23/7/20
        # see https://github.com/huggingface/transformers/pull/24935

        DEFAULT_SYSTEM_PROMPT = """\
You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.

If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
        with open(
            Path(__file__).parent / "fixtures/conversation.json", encoding="utf-8"
        ) as fin:
            data = fin.read()
            conversation = json.loads(data)
        with open(
            Path(__file__).parent / "fixtures/conversation.tokenized_llama2chat.json",
            encoding="utf-8",
        ) as fin:
            data = fin.read()
            tokenized_conversation = json.loads(data)

        user_input = []
        answers = []
        for msg in conversation["conversations"]:
            if msg["from"] == "human":
                user_input.append(msg["value"])
            else:
                answers.append(msg["value"])
        hf_conf = Conversation(
            text=user_input[-1],
            past_user_inputs=[B_SYS + DEFAULT_SYSTEM_PROMPT + E_SYS + user_input[0]]
            + user_input[1:-1],
            generated_responses=answers,
        )

        hf_tokens = tokenizer_llama2_7b._build_conversation_input_ids(hf_conf)

        assert hf_tokens == tokenized_conversation["input_ids"][: len(hf_tokens)]


class OrpoTokenizationTest:
    """test case for the ORPO tokenization"""

    @enable_hf_offline
    def test_orpo_integration(
        self,
        tokenizer_mistral_7b_instruct_chatml,
        dataset_argilla_ultrafeedback_binarized_preferences_cleaned,
    ):
        ds = dataset_argilla_ultrafeedback_binarized_preferences_cleaned.select([0])
        strat = load(
            tokenizer_mistral_7b_instruct_chatml,
            DictDefault({"train_on_inputs": False}),
            DictDefault({"chat_template": "chatml"}),
        )
        res = strat.tokenize_prompt(ds[0])
        assert "rejected_input_ids" in res
        assert "rejected_labels" in res
        assert "input_ids" in res
        assert "labels" in res
        assert "prompt_attention_mask" in res

        assert len(res["rejected_input_ids"]) == len(res["rejected_labels"])
        assert len(res["input_ids"]) == len(res["labels"])
        assert len(res["input_ids"]) == len(res["prompt_attention_mask"])

        assert res["rejected_labels"][0] == -100
        assert res["rejected_input_ids"][-1] == res["rejected_labels"][-1]

        assert res["labels"][0] == -100
        assert res["input_ids"][-1] == res["labels"][-1]

        assert res["prompt_attention_mask"][0] == 1
        assert res["prompt_attention_mask"][-1] == 0


================================================
FILE: tests/test_prompters.py
================================================
"""Module testing prompters"""

import unittest

from axolotl.prompt_strategies.alpaca_w_system import SystemDataPrompter
from axolotl.prompters import (
    AlpacaPrompter,
    MultipleChoiceExplainPrompter,
    PromptStyle,
    UnpromptedPrompter,
)


class AlpacaPrompterTest(unittest.TestCase):
    """
    Test AlpacaPrompter
    """

    def test_prompt_style_w_none(self):
        prompter = AlpacaPrompter(prompt_style=None)
        res = next(prompter.build_prompt("tell me a joke"))
        # just testing that it uses instruct style
        assert "### Instruction:" in res

    def test_prompt_style_w_instruct(self):
        prompter = AlpacaPrompter(prompt_style=PromptStyle.INSTRUCT.value)
        res = next(
            prompter.build_prompt("tell me a joke about the following", "alpacas")
        )
        assert "Below is an instruction" in res
        assert "### Instruction:" in res
        assert "### Input:" in res
        assert "alpacas" in res
        assert "### Response:" in res
        assert "USER:" not in res
        assert "ASSISTANT:" not in res
        res = next(prompter.build_prompt("tell me a joke about the following"))
        assert "Below is an instruction" in res
        assert "### Instruction:" in res
        assert "### Input:" not in res
        assert "### Response:" in res
        assert "USER:" not in res
        assert "ASSISTANT:" not in res

    def test_prompt_style_w_phi(self):
        prompter = AlpacaPrompter(prompt_style=PromptStyle.PHI.value)
        res = next(prompter.build_prompt("tell me a joke about the following"))
        assert (
            """<|system|>
Below is an instruction that describes a task. Write a response that appropriately completes the request.<|end|>
<|user|>
tell me a joke about the following<|end|>
<|assistant|>
"""
            == res
        )

    def test_prompt_style_w_chat(self):
        prompter = AlpacaPrompter(prompt_style=PromptStyle.CHAT.value)
        res = next(
            prompter.build_prompt("tell me a joke about the following", "alpacas")
        )
        assert "Below is an instruction" in res
        assert "### Instruction:" not in res
        assert "### Input:" not in res
        assert "alpacas" in res
        assert "### Response:" not in res
        assert "USER:" in res
        assert "ASSISTANT:" in res
        res = next(prompter.build_prompt("tell me a joke about the following"))
        assert "Below is an instruction" in res
        assert "### Instruction:" not in res
        assert "### Input:" not in res
        assert "### Response:" not in res
        assert "USER:" in res
        assert "ASSISTANT:" in res

    def test_system_prompt(self):
        prompter = SystemDataPrompter(prompt_style=PromptStyle.CHAT.value)
        res = next(
            prompter.build_prompt_w_system(
                "use cot", "tell me a joke about the following", "alpacas"
            )
        )
        assert "use cot" in res
        assert res.startswith("SYSTEM:")
        assert "### Instruction:" not in res
        assert "### Input:" not in res
        assert "alpacas" in res
        assert "### Response:" not in res
        assert "USER:" in res
        assert "ASSISTANT:" in res


class UnpromptedPrompterTest(unittest.TestCase):
    """
    Test class for UnpromptedPrompter with no system prompts
    """

    def test_prompt_style_w_none(self):
        prompter = UnpromptedPrompter(prompt_style=None)
        res = next(prompter.build_prompt("tell me a joke"))
        assert "### Instruction:" in res
        assert "tell me a joke" in res
        assert res.startswith("###")

    def test_prompt_style_w_instruct(self):
        prompter = UnpromptedPrompter(prompt_style=PromptStyle.INSTRUCT.value)
        res = next(
            prompter.build_prompt("tell me a joke about the following", "alpacas")
        )
        assert "### Instruction:" in res
        assert "tell me a joke" in res
        assert res.startswith("###")

    def test_prompt_style_w_chat(self):
        prompter = UnpromptedPrompter(prompt_style=PromptStyle.CHAT.value)
        res = next(
            prompter.build_prompt("tell me a joke about the following", "alpacas")
        )
        assert "USER:" in res
        assert "tell me a joke" in res
        assert res.startswith("USER:")


class MultipleChoiceExplainPrompterTest(unittest.TestCase):
    """
    Test class for MultipleChoiceExplainPrompter
    """

    def test_prompt_style_w_chat(self):
        prompter = MultipleChoiceExplainPrompter(prompt_style=PromptStyle.CHAT.value)
        res = next(prompter.build_prompt("choose one", "- A\n- B\n- C", "C"))
        assert "USER:" in res
        assert "choose one" in res
        assert "Choose the answer that best answers the question." in res
        assert "- A\n- B\n- C" in res


================================================
FILE: tests/test_revision_parameter.py
================================================
"""Tests for revision_of_model being passed to tokenizer and processor loaders."""

from unittest.mock import MagicMock, patch

from transformers import PreTrainedTokenizerBase

from axolotl.utils.dict import DictDefault


class TestRevisionParameter:
    """Tests for revision_of_model being passed to tokenizer and processor loaders."""

    @patch("axolotl.loaders.tokenizer.load_model_config")
    @patch("axolotl.loaders.tokenizer.AutoTokenizer")
    @patch(
        "axolotl.loaders.patch_manager.PatchManager.apply_pre_tokenizer_load_patches"
    )
    def test_load_tokenizer_passes_revision(
        self, _mock_patches, mock_auto_tokenizer, _mock_load_config
    ):
        mock_tokenizer = MagicMock()
        mock_tokenizer.__class__.__name__ = "MockTokenizer"
        mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer

        cfg = DictDefault(
            {
                "tokenizer_config": "some-model",
                "revision_of_model": "abc123",
            }
        )
        from axolotl.loaders.tokenizer import load_tokenizer

        load_tokenizer(cfg)

        call_kwargs = mock_auto_tokenizer.from_pretrained.call_args
        assert call_kwargs.kwargs.get("revision") == "abc123"

    @patch("axolotl.loaders.tokenizer.load_model_config")
    @patch("axolotl.loaders.tokenizer.AutoTokenizer")
    @patch(
        "axolotl.loaders.patch_manager.PatchManager.apply_pre_tokenizer_load_patches"
    )
    def test_load_tokenizer_omits_revision_when_unset(
        self, _mock_patches, mock_auto_tokenizer, _mock_load_config
    ):
        mock_tokenizer = MagicMock()
        mock_tokenizer.__class__.__name__ = "MockTokenizer"
        mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer

        cfg = DictDefault(
            {
                "tokenizer_config": "some-model",
            }
        )
        from axolotl.loaders.tokenizer import load_tokenizer

        load_tokenizer(cfg)

        call_kwargs = mock_auto_tokenizer.from_pretrained.call_args
        assert "revision" not in call_kwargs.kwargs

    @patch("axolotl.loaders.tokenizer.AutoTokenizer")
    @patch("axolotl.loaders.tokenizer.is_local_main_process", return_value=True)
    @patch("axolotl.loaders.tokenizer.barrier")
    def test_modify_tokenizer_files_passes_revision(
        self, _mock_barrier, _mock_main, mock_auto_tokenizer, temp_dir
    ):
        mock_tokenizer = MagicMock()
        mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer

        from axolotl.loaders.tokenizer import modify_tokenizer_files

        modify_tokenizer_files("some-model", {}, output_dir=temp_dir, revision="abc123")

        call_kwargs = mock_auto_tokenizer.from_pretrained.call_args
        assert call_kwargs.kwargs.get("revision") == "abc123"

    @patch("axolotl.loaders.tokenizer.AutoTokenizer")
    @patch("axolotl.loaders.tokenizer.is_local_main_process", return_value=True)
    @patch("axolotl.loaders.tokenizer.barrier")
    def test_modify_tokenizer_files_defaults_revision_to_main(
        self, _mock_barrier, _mock_main, mock_auto_tokenizer, temp_dir
    ):
        mock_tokenizer = MagicMock()
        mock_auto_tokenizer.from_pretrained.return_value = mock_tokenizer

        from axolotl.loaders.tokenizer import modify_tokenizer_files

        modify_tokenizer_files("some-model", {}, output_dir=temp_dir)

        call_kwargs = mock_auto_tokenizer.from_pretrained.call_args
        assert call_kwargs.kwargs.get("revision") == "main"

    @patch("axolotl.loaders.processor.AutoProcessor")
    def test_load_processor_passes_revision(self, mock_auto_processor):
        mock_processor = MagicMock()
        mock_processor.size = {}
        mock_auto_processor.from_pretrained.return_value = mock_processor

        cfg = DictDefault(
            {
                "processor_config": "some-model",
                "revision_of_model": "abc123",
                "trust_remote_code": False,
            }
        )
        tokenizer = MagicMock(spec=PreTrainedTokenizerBase)

        from axolotl.loaders.processor import load_processor

        load_processor(cfg, tokenizer)

        call_kwargs = mock_auto_processor.from_pretrained.call_args
        assert call_kwargs.kwargs.get("revision") == "abc123"

    @patch("axolotl.loaders.processor.AutoProcessor")
    def test_load_processor_omits_revision_when_unset(self, mock_auto_processor):
        mock_processor = MagicMock()
        mock_processor.size = {}
        mock_auto_processor.from_pretrained.return_value = mock_processor

        cfg = DictDefault(
            {
                "processor_config": "some-model",
                "trust_remote_code": False,
            }
        )
        tokenizer = MagicMock(spec=PreTrainedTokenizerBase)

        from axolotl.loaders.processor import load_processor

        load_processor(cfg, tokenizer)

        call_kwargs = mock_auto_processor.from_pretrained.call_args
        assert "revision" not in call_kwargs.kwargs


================================================
FILE: tests/test_save_deduplicated.py
================================================
"""Tests to verify that deduplication runs before dataset saving during preprocessing.

This addresses GitHub issue #2719: Save De-duplicated Set During Pre-processing.
"""

from unittest.mock import MagicMock, patch

from datasets import Dataset

from axolotl.utils.dict import DictDefault


class TestSFTSaveDeduplicatedBeforeSave:
    """Verify that in SFT data loading, deduplication occurs before saving."""

    @patch("axolotl.utils.data.sft.save_preprocessed_dataset")
    @patch("axolotl.utils.data.sft.generate_dataset_hash_from_config")
    @patch("axolotl.utils.data.sft.deduplicate_and_log_datasets")
    @patch("axolotl.utils.data.sft.merge_datasets")
    @patch("axolotl.utils.data.sft._load_and_process_single_dataset")
    @patch("axolotl.utils.data.sft.datasets_with_name_generator")
    def test_dedup_called_before_save_sft(
        self,
        mock_datasets_gen,
        mock_load_single,
        mock_merge,
        mock_dedup,
        mock_gen_hash,
        mock_save,
    ):
        """Deduplication should be called before save_preprocessed_dataset in SFT."""
        from axolotl.utils.data.sft import _load_raw_datasets

        # Set up mock data
        dataset = Dataset.from_dict({"text": ["a", "b", "a"], "label": [1, 2, 1]})
        deduped_dataset = Dataset.from_dict({"text": ["a", "b"], "label": [1, 2]})

        mock_datasets_gen.return_value = [
            DictDefault({"path": "test", "type": "alpaca"})
        ]
        mock_load_single.return_value = (dataset, None)
        mock_merge.return_value = dataset
        mock_dedup.return_value = (deduped_dataset, None)
        mock_gen_hash.return_value = "testhash"

        cfg = DictDefault(
            {
                "skip_prepare_dataset": False,
                "dataset_exact_deduplication": True,
                "sequence_len": 1024,
                "eval_sequence_len": None,
                "sample_packing": False,
                "is_preprocess": False,
                "seed": 42,
                "datasets": [{"path": "test", "type": "alpaca"}],
            }
        )

        tokenizer = MagicMock()
        tokenizer.name_or_path = "test-tokenizer"

        # Track call order
        call_order = []
        mock_dedup.side_effect = lambda **kwargs: (
            call_order.append("dedup") or (deduped_dataset, None)
        )
        mock_save.side_effect = lambda *args, **kwargs: call_order.append("save")

        _load_raw_datasets(
            cfg=cfg,
            datasets_configs=cfg.datasets,
            tokenizer=tokenizer,
            split="train",
        )

        # Verify dedup was called
        assert "dedup" in call_order, "Deduplication should have been called"
        # Verify save was called
        assert "save" in call_order, "Save should have been called"
        # Verify dedup happened before save
        assert call_order.index("dedup") < call_order.index("save"), (
            "Deduplication must occur before saving the dataset"
        )

    @patch("axolotl.utils.data.sft.save_preprocessed_dataset")
    @patch("axolotl.utils.data.sft.generate_dataset_hash_from_config")
    @patch("axolotl.utils.data.sft.merge_datasets")
    @patch("axolotl.utils.data.sft._load_and_process_single_dataset")
    @patch("axolotl.utils.data.sft.datasets_with_name_generator")
    def test_no_dedup_when_disabled_sft(
        self,
        mock_datasets_gen,
        mock_load_single,
        mock_merge,
        mock_gen_hash,
        mock_save,
    ):
        """Deduplication should not be called when dataset_exact_deduplication is False."""
        from axolotl.utils.data.sft import _load_raw_datasets

        dataset = Dataset.from_dict({"text": ["a", "b", "a"], "label": [1, 2, 1]})

        mock_datasets_gen.return_value = [
            DictDefault({"path": "test", "type": "alpaca"})
        ]
        mock_load_single.return_value = (dataset, None)
        mock_merge.return_value = dataset
        mock_gen_hash.return_value = "testhash"

        cfg = DictDefault(
            {
                "skip_prepare_dataset": False,
                "dataset_exact_deduplication": False,
                "sequence_len": 1024,
                "eval_sequence_len": None,
                "sample_packing": False,
                "is_preprocess": False,
                "seed": 42,
                "datasets": [{"path": "test", "type": "alpaca"}],
            }
        )

        tokenizer = MagicMock()
        tokenizer.name_or_path = "test-tokenizer"

        with patch("axolotl.utils.data.sft.deduplicate_and_log_datasets") as mock_dedup:
            _load_raw_datasets(
                cfg=cfg,
                datasets_configs=cfg.datasets,
                tokenizer=tokenizer,
                split="train",
            )
            mock_dedup.assert_not_called()


class TestRLSaveDeduplicatedBeforeSave:
    """Verify that in RL data loading, deduplication occurs before saving."""

    @patch.object(Dataset, "filter", lambda self, *args, **kwargs: self)
    @patch("axolotl.utils.data.rl.save_preprocessed_dataset")
    @patch("axolotl.utils.data.rl.generate_dataset_hash_from_config")
    @patch("axolotl.utils.data.rl.deduplicate_and_log_datasets")
    @patch("axolotl.utils.data.rl.merge_datasets")
    @patch("axolotl.utils.data.rl.load_dataset_with_config")
    @patch("axolotl.utils.data.rl.datasets_with_name_generator")
    @patch("axolotl.utils.data.rl.load_tokenizer")
    def test_dedup_called_before_save_rl(
        self,
        mock_load_tokenizer,
        mock_datasets_gen,
        mock_load_dataset,
        mock_merge,
        mock_dedup,
        mock_gen_hash,
        mock_save,
    ):
        """Deduplication should be called before save_preprocessed_dataset in RL."""
        from axolotl.utils.data.rl import _load_split

        dataset = Dataset.from_dict(
            {
                "prompt": ["hi", "bye", "hi"],
                "chosen": ["a", "b", "a"],
                "rejected": ["c", "d", "c"],
            }
        )
        deduped_dataset = Dataset.from_dict(
            {
                "prompt": ["hi", "bye"],
                "chosen": ["a", "b"],
                "rejected": ["c", "d"],
            }
        )

        mock_datasets_gen.return_value = [DictDefault({"path": "test", "type": None})]
        mock_load_dataset.return_value = dataset
        mock_merge.return_value = dataset
        mock_dedup.return_value = (deduped_dataset, None)
        mock_gen_hash.return_value = "testhash"

        tokenizer = MagicMock()
        tokenizer.name_or_path = "test-tokenizer"
        mock_load_tokenizer.return_value = tokenizer

        cfg = DictDefault(
            {
                "skip_prepare_dataset": False,
                "dataset_exact_deduplication": True,
                "sequence_len": 1024,
                "rl": "dpo",
                "datasets": [{"path": "test", "type": None}],
                "hf_use_auth_token": False,
                "dataset_num_proc": 1,
                "is_preprocess": False,
            }
        )

        call_order = []
        mock_dedup.side_effect = lambda **kwargs: (
            call_order.append("dedup") or (deduped_dataset, None)
        )
        mock_save.side_effect = lambda *args, **kwargs: call_order.append("save")

        _load_split(cfg, split="train")

        assert "dedup" in call_order, "Deduplication should have been called"
        assert "save" in call_order, "Save should have been called"
        assert call_order.index("dedup") < call_order.index("save"), (
            "Deduplication must occur before saving the dataset"
        )


================================================
FILE: tests/test_schedulers.py
================================================
"""
test module for the axolotl.utis.data module
"""

import unittest

import torch
from torch.optim import SGD

from axolotl.utils.schedulers import get_cosine_schedule_with_warmup_decay_constant


class TestCosineConstantLr(unittest.TestCase):
    """
    test class for encode pretraining and md5 helper
    """

    def setUp(self):
        self.train_steps = 1000
        self.warmup_steps = 10
        self.min_lr_ratio = 0.1
        self.constant_lr_ratio = 0.8
        self._lr = 0.01
        self.optimizer = SGD([torch.tensor(1)], lr=self._lr)
        self.lr_scheduler = get_cosine_schedule_with_warmup_decay_constant(
            self.optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.train_steps,
            min_lr_ratio=self.min_lr_ratio,
            constant_lr_ratio=self.constant_lr_ratio,
        )

    def test_schedulers(self):
        self.assertEqual(self.lr_scheduler.get_last_lr()[0], 0)
        for _ in range(self.warmup_steps):
            self.optimizer.step()
            self.lr_scheduler.step()
        self.assertEqual(self.lr_scheduler.get_last_lr()[0], self._lr)
        constant_step = int(self.train_steps * self.constant_lr_ratio)
        remaining_step = self.train_steps - constant_step
        for _ in range(constant_step):
            self.optimizer.step()
            self.lr_scheduler.step()
        self.assertEqual(
            self.lr_scheduler.get_last_lr()[0], self._lr * self.min_lr_ratio
        )
        for _ in range(remaining_step):
            self.optimizer.step()
            self.lr_scheduler.step()
        self.assertEqual(
            self.lr_scheduler.get_last_lr()[0], self._lr * self.min_lr_ratio
        )


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/test_streaming.py
================================================
"""Test streaming configuration and data loading functionality."""

import unittest
from unittest.mock import Mock, patch

from datasets import IterableDataset

from axolotl.utils.config import validate_config
from axolotl.utils.data.sft import (
    _prepare_streaming_dataset,
    prepare_datasets,
)
from axolotl.utils.dict import DictDefault


class TestStreamingConfig(unittest.TestCase):
    """Test streaming configuration and deprecation handling."""

    def test_streaming_multipack_buffer_size_deprecation(self):
        """Test that pretrain_multipack_buffer_size is properly deprecated."""
        # Test with old config name
        cfg_old = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "pretrain_multipack_buffer_size": 5000,
                "datasets": [{"path": "test/dataset", "type": "alpaca"}],
                "sequence_len": 256,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "learning_rate": 0.0001,
            }
        )

        with self.assertLogs("axolotl.utils.schemas.validation", level="WARNING") as cm:
            validated_cfg = validate_config(cfg_old)
            self.assertIn("pretrain_multipack_buffer_size` is deprecated", cm.output[0])

        self.assertEqual(validated_cfg.streaming_multipack_buffer_size, 5000)
        self.assertIsNone(
            getattr(validated_cfg, "pretrain_multipack_buffer_size", None)
        )

    def test_streaming_multipack_buffer_size_new(self):
        """Test that new streaming_multipack_buffer_size works correctly."""
        cfg_new = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "streaming_multipack_buffer_size": 7000,
                "datasets": [{"path": "test/dataset", "type": "alpaca"}],
                "sequence_len": 256,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "learning_rate": 0.0001,
            }
        )

        validated_cfg = validate_config(cfg_new)
        self.assertEqual(validated_cfg.streaming_multipack_buffer_size, 7000)

    def test_both_buffer_sizes_raises_error(self):
        """Test that having both old and new buffer size configs raises an error."""
        cfg_both = DictDefault(
            {
                "base_model": "HuggingFaceTB/SmolLM2-135M",
                "pretrain_multipack_buffer_size": 5000,
                "streaming_multipack_buffer_size": 7000,
                "datasets": [{"path": "test/dataset", "type": "alpaca"}],
                "sequence_len": 256,
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "learning_rate": 0.0001,
            }
        )

        with self.assertRaises(ValueError) as cm:
            validate_config(cfg_both)
        self.assertIn("both are set", str(cm.exception))


class TestStreamingDatasetPreparation(unittest.TestCase):
    """Test dataset preparation with streaming configuration."""

    def setUp(self):
        self.tokenizer = Mock()
        self.tokenizer.pad_token_id = 0
        self.tokenizer.eos_token_id = 1

    @patch("axolotl.utils.data.sft._prepare_streaming_dataset")
    def test_prepare_datasets_with_streaming_true(self, mock_prepare_streaming):
        """Test that streaming=True triggers streaming dataset preparation."""
        cfg = DictDefault(
            {
                "streaming": True,
                "datasets": [{"path": "test/dataset", "type": "alpaca"}],
            }
        )

        mock_prepare_streaming.return_value = (Mock(), None, 100, [])

        prepare_datasets(cfg, self.tokenizer)

        mock_prepare_streaming.assert_called_once_with(cfg, self.tokenizer, None)

    @patch("axolotl.utils.data.sft._prepare_streaming_dataset")
    def test_prepare_datasets_with_pretraining_dataset(self, mock_prepare_streaming):
        """Test that pretraining_dataset triggers streaming dataset preparation."""
        cfg = DictDefault(
            {
                "pretraining_dataset": "test/dataset",
            }
        )

        mock_prepare_streaming.return_value = (Mock(), None, 100, [])

        prepare_datasets(cfg, self.tokenizer)

        mock_prepare_streaming.assert_called_once_with(cfg, self.tokenizer, None)

    @patch("axolotl.utils.data.sft._prepare_standard_dataset")
    def test_prepare_datasets_without_streaming(self, mock_prepare_standard):
        """Test that without streaming, standard dataset preparation is used."""
        cfg = DictDefault(
            {
                "datasets": [{"path": "test/dataset", "type": "alpaca"}],
            }
        )

        mock_prepare_standard.return_value = (Mock(), None, 100, [])

        prepare_datasets(cfg, self.tokenizer)

        mock_prepare_standard.assert_called_once_with(cfg, self.tokenizer, None)


class TestStreamingWithSamplePacking(unittest.TestCase):
    """Test streaming dataset preparation with sample packing."""

    def setUp(self):
        self.tokenizer = Mock()
        self.tokenizer.pad_token_id = 0
        self.tokenizer.eos_token_id = 1

    @patch("axolotl.utils.data.sft._load_streaming_dataset")
    def test_streaming_sft_with_sample_packing_sets_split(self, mock_load_streaming):
        """Test that streaming SFT with sample_packing sets default split."""
        cfg = DictDefault(
            {
                "streaming": True,
                "sample_packing": True,
                "datasets": [{"path": "test/dataset", "type": "alpaca"}],
                "sequence_len": 256,
                "micro_batch_size": 1,
            }
        )

        mock_load_streaming.return_value = Mock(spec=IterableDataset)

        with patch("axolotl.utils.data.sft._load_and_prepare_datasets"):
            _prepare_streaming_dataset(cfg, self.tokenizer, None)

            # Check that the dataset config has split set to 'train'
            call_args = mock_load_streaming.call_args
            dataset_config = call_args[0][0]
            self.assertEqual(dataset_config.split, "train")

    def test_multipack_attn_forced_true_for_sft(self):
        """Test that multipack_attn is forced to True for SFT with sample packing."""
        from axolotl.utils.data.streaming import wrap_streaming_dataset

        cfg = DictDefault(
            {
                "sample_packing": True,
                "pretrain_multipack_attn": False,  # Should be overridden for SFT
                "pretraining_dataset": None,  # This makes it SFT
                "sequence_len": 256,
                "micro_batch_size": 1,
                "streaming_multipack_buffer_size": 1000,
                "seed": 42,
            }
        )

        mock_dataset = Mock()
        mock_dataset.features = None  # For streaming datasets
        mock_dataset.__iter__ = Mock(return_value=iter([]))  # Empty iterator
        mock_dataset.map = Mock(return_value=mock_dataset)
        mock_ds_wrapper = Mock()

        with patch(
            "axolotl.utils.data.streaming.PretrainingBatchSamplerDataCollatorForSeq2Seq"
        ) as mock_collator:
            with patch("axolotl.utils.data.streaming.encode_packed_streaming"):
                wrap_streaming_dataset(
                    mock_dataset, self.tokenizer, cfg, mock_ds_wrapper
                )

                # Check that multipack_attn=True was used in the collator
                mock_collator.assert_called_once()
                call_kwargs = mock_collator.call_args[1]
                self.assertTrue(call_kwargs["multipack_attn"])

    def test_multipack_attn_respects_config_for_pretraining(self):
        """Test that multipack_attn respects config for pretraining datasets."""
        from axolotl.utils.data.streaming import wrap_streaming_dataset

        cfg = DictDefault(
            {
                "sample_packing": True,
                "pretrain_multipack_attn": False,  # Should be respected for pretraining
                "pretraining_dataset": "test/dataset",  # This makes it pretraining
                "sequence_len": 256,
                "micro_batch_size": 1,
                "streaming_multipack_buffer_size": 1000,
                "seed": 42,
            }
        )

        mock_dataset = Mock()
        mock_dataset.features = None  # For streaming datasets
        mock_dataset.__iter__ = Mock(return_value=iter([]))  # Empty iterator
        mock_dataset.map = Mock(return_value=mock_dataset)
        mock_ds_wrapper = Mock()

        with patch(
            "axolotl.utils.data.streaming.PretrainingBatchSamplerDataCollatorForSeq2Seq"
        ) as mock_collator:
            with patch("axolotl.utils.data.streaming.encode_packed_streaming"):
                wrap_streaming_dataset(
                    mock_dataset, self.tokenizer, cfg, mock_ds_wrapper
                )

                # Check that multipack_attn=False was used (respecting config)
                mock_collator.assert_called_once()
                call_kwargs = mock_collator.call_args[1]
                self.assertFalse(call_kwargs["multipack_attn"])


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/test_tensor_parallel_batch_size.py
================================================
"""Tests for batch_size calculation with tensor parallelism."""

from unittest.mock import patch

import addict
import pytest

from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault


@pytest.fixture(name="tp_base_cfg")
def fixture_tp_base_cfg(min_base_cfg):
    return (
        DictDefault(
            micro_batch_size=2,
            gradient_accumulation_steps=4,
            sequence_len=2048,
            num_epochs=1,
        )
        | min_base_cfg
    )


class TestTensorParallelBatchSize:
    """Verify batch_size scales by effective dp world_size when using tensor parallelism."""

    @pytest.mark.parametrize(
        "world_size, tensor_parallel_size, expected_batch_size",
        [
            (4, 1, 32),  # no TP: 2*4*4 = 32
            (4, 2, 16),  # TP=2: 2*4*(4//2) = 16
            (4, 4, 8),  # TP=4: 2*4*(4//4) = 8
            (2, 2, 8),  # TP=ws: 2*4*(2//2) = 8 (no scaling)
        ],
    )
    def test_batch_size_with_tensor_parallelism(
        self,
        tp_base_cfg,
        monkeypatch,
        world_size,
        tensor_parallel_size,
        expected_batch_size,
    ):
        monkeypatch.setenv("WORLD_SIZE", str(world_size))
        tp_base_cfg["tensor_parallel_size"] = tensor_parallel_size
        cfg = validate_config(tp_base_cfg)
        # Mock load_model_config to avoid downloading the model and to bypass
        # the tie_word_embeddings validation that blocks TP > 1.
        with patch(
            "axolotl.utils.config.load_model_config",
            return_value=addict.Dict({"model_type": "llama"}),
        ):
            normalize_config(cfg)
        assert cfg.batch_size == expected_batch_size


================================================
FILE: tests/test_tokenizers.py
================================================
"""
Test cases for the tokenizer loading
"""

import unittest

import pytest

from axolotl.loaders import load_tokenizer
from axolotl.utils.dict import DictDefault

from tests.hf_offline_utils import enable_hf_offline


class TestTokenizers:
    """
    test class for the load_tokenizer fn
    """

    @pytest.mark.skip("LlamaTokenizer no longer has a Fast/Slow tokenizer")
    @enable_hf_offline
    def test_default_use_fast(self):
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
            }
        )
        tokenizer = load_tokenizer(cfg)
        assert "Fast" in tokenizer.__class__.__name__

    @pytest.mark.skip("LlamaTokenizer no longer has a Fast/Slow tokenizer")
    @enable_hf_offline
    def test_dont_use_fast(self):
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "tokenizer_use_fast": False,
            }
        )
        tokenizer = load_tokenizer(cfg)
        assert "Fast" not in tokenizer.__class__.__name__

    @enable_hf_offline
    def test_special_tokens_modules_to_save(self):
        # setting special_tokens to new token
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "adapter": "lora",
                "special_tokens": {"bos_token": "[INST]"},
            }
        )
        with pytest.raises(
            ValueError,
            match=r".*Please set lora_modules_to_save*",
        ):
            load_tokenizer(cfg)

        # setting special_tokens but not changing from default
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "adapter": "lora",
                "special_tokens": {"bos_token": "<s>"},
            }
        )
        load_tokenizer(cfg)

        # non-adapter setting special_tokens
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "special_tokens": {"bos_token": "[INST]"},
            }
        )
        load_tokenizer(cfg)

    @enable_hf_offline
    def test_add_additional_special_tokens(self):
        cfg = DictDefault(
            {
                "tokenizer_config": "huggyllama/llama-7b",
                "special_tokens": {"additional_special_tokens": ["<|im_start|>"]},
            }
        )
        tokenizer = load_tokenizer(cfg)
        assert "LlamaTokenizer" in tokenizer.__class__.__name__
        assert tokenizer("<|im_start|>user")["input_ids"] == [1, 32000, 1792]
        assert len(tokenizer) == 32001

        # ensure reloading the tokenizer again from cfg results in same vocab length
        tokenizer = load_tokenizer(cfg)
        assert len(tokenizer) == 32001

    @enable_hf_offline
    def test_added_tokens_overrides(self, temp_dir):
        cfg = DictDefault(
            {
                # use with tokenizer that has reserved_tokens in added_tokens
                "tokenizer_config": "NousResearch/Llama-3.2-1B",
                "added_tokens_overrides": {
                    128041: "RANDOM_OVERRIDE_1",
                    128042: "RANDOM_OVERRIDE_2",
                },
                "output_dir": temp_dir,
            }
        )

        tokenizer = load_tokenizer(cfg)
        assert tokenizer.encode("RANDOM_OVERRIDE_1", add_special_tokens=False) == [
            128041
        ]
        assert tokenizer.encode("RANDOM_OVERRIDE_2", add_special_tokens=False) == [
            128042
        ]
        assert (
            tokenizer.decode([128041, 128042]) == "RANDOM_OVERRIDE_1RANDOM_OVERRIDE_2"
        )

    @pytest.mark.skip("FIXME slow test sdist py3.11 + torch2.8.0")
    @enable_hf_offline
    def test_added_tokens_overrides_gemma3(self, temp_dir):
        cfg = DictDefault(
            {
                # use with tokenizer that has reserved_tokens in added_tokens
                "tokenizer_config": "mlx-community/gemma-3-4b-it-8bit",
                "added_tokens_overrides": {
                    256001: "RANDOM_OVERRIDE_1",
                    256002: "RANDOM_OVERRIDE_2",
                },
                "output_dir": temp_dir,
            }
        )

        tokenizer = load_tokenizer(cfg)
        assert tokenizer.encode("RANDOM_OVERRIDE_1", add_special_tokens=False) == [
            256001
        ]
        assert tokenizer.encode("RANDOM_OVERRIDE_2", add_special_tokens=False) == [
            256002
        ]
        assert (
            tokenizer.decode([256001, 256002]) == "RANDOM_OVERRIDE_1RANDOM_OVERRIDE_2"
        )

    @enable_hf_offline
    def test_added_tokens_overrides_with_toolargeid(self, temp_dir):
        cfg = DictDefault(
            {
                # use with tokenizer that has reserved_tokens in added_tokens
                "tokenizer_config": "HuggingFaceTB/SmolLM2-135M",
                "added_tokens_overrides": {1000000: "BROKEN_RANDOM_OVERRIDE_1"},
                "output_dir": temp_dir,
            }
        )

        with pytest.raises(
            ValueError, match=r".*Token ID 1000000 not found in added_tokens.*"
        ):
            load_tokenizer(cfg)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/test_train.py
================================================
"""Test for batch size calculation for multi-gpu training."""

import pytest

from axolotl.utils.config import normalize_config, validate_config
from axolotl.utils.dict import DictDefault


@pytest.fixture(name="train_base_cfg")
def fixture_train_base_cfg(min_base_cfg):
    return (
        DictDefault(
            micro_batch_size=2,
            gradient_accumulation_steps=4,
            sequence_len=2048,
            sample_packing=True,
            num_epochs=1,
        )
        | min_base_cfg
    )


class TestTrain:
    """test class for train related tests"""

    @pytest.mark.parametrize(
        "world_size, expected_batch_size",
        [
            (1, 8),
            (4, 32),
        ],
    )
    def test_batch_size_ddp(
        self, train_base_cfg, monkeypatch, world_size, expected_batch_size
    ):
        monkeypatch.setenv("WORLD_SIZE", str(world_size))
        cfg = validate_config(train_base_cfg)
        normalize_config(cfg)
        assert cfg.batch_size == expected_batch_size


================================================
FILE: tests/test_triton_kernels.py
================================================
# Copyright 2026 Axolotl AI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.

"""Unit tests for Triton kernels: entropy_from_logits and selective_log_softmax.

Adapted from harness/test_entropy.py and harness/test_selective_logsoftmax.py
into proper pytest tests, plus new OOB index safety tests.
"""

import math

import pytest
import torch
import torch.nn.functional as F

pytestmark = pytest.mark.skipif(
    not torch.cuda.is_available(), reason="CUDA required for Triton kernels"
)


# ---------------------------------------------------------------------------
# Reference implementations
# ---------------------------------------------------------------------------


def _ref_entropy(logits):
    """Reference entropy via log_softmax (numerically stable)."""
    logp = F.log_softmax(logits.float(), dim=-1)
    return -(logp.exp() * logp).sum(dim=-1)


def _ref_selective_log_softmax(logits, index):
    """Reference selective log softmax via PyTorch gather."""
    squeeze = index.ndim == logits.ndim - 1
    if squeeze:
        index = index.unsqueeze(-1)
    log_probs = F.log_softmax(logits.float(), dim=-1)
    result = torch.gather(log_probs, dim=-1, index=index)
    if squeeze:
        result = result.squeeze(-1)
    return result


# ---------------------------------------------------------------------------
# entropy_from_logits
# ---------------------------------------------------------------------------


class TestEntropyFromLogits:
    @pytest.mark.parametrize(
        "B,L",
        [
            (1, 128),
            (1, 2048),
            (4, 512),
            (8, 256),
            (1, 1),
        ],
    )
    def test_correctness_various_shapes(self, B, L):
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        V = 1024
        torch.manual_seed(42)
        logits = torch.randn(B, L, V, device="cuda", dtype=torch.float32)
        result = entropy_from_logits(logits)
        expected = _ref_entropy(logits)
        assert result.shape == (B, L)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_2d_input(self):
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        logits = torch.randn(16, 256, device="cuda", dtype=torch.float32)
        result = entropy_from_logits(logits)
        expected = _ref_entropy(logits)
        assert result.shape == (16,)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_large_vocab(self):
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        V = 32000
        logits = torch.randn(2, V, device="cuda", dtype=torch.float32)
        result = entropy_from_logits(logits)
        expected = _ref_entropy(logits)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_uniform_distribution(self):
        """Uniform logits -> entropy = log(V)."""
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        V = 1024
        logits = torch.zeros(2, V, device="cuda", dtype=torch.float32)
        result = entropy_from_logits(logits)
        expected_val = math.log(V)
        torch.testing.assert_close(
            result,
            torch.full((2,), expected_val, device="cuda", dtype=torch.float32),
            atol=1e-4,
            rtol=1e-4,
        )

    def test_peaked_distribution(self):
        """One-hot-like logits -> entropy near 0."""
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        logits = torch.full((2, 128), -100.0, device="cuda", dtype=torch.float32)
        logits[:, 0] = 100.0
        result = entropy_from_logits(logits)
        assert (result < 1e-3).all()

    def test_bfloat16(self):
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        logits = torch.randn(4, 256, device="cuda", dtype=torch.bfloat16)
        result = entropy_from_logits(logits)
        expected = _ref_entropy(logits.float())
        assert result.dtype == torch.bfloat16
        torch.testing.assert_close(result.float(), expected, atol=5e-2, rtol=5e-2)

    def test_float16(self):
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        logits = torch.randn(4, 256, device="cuda", dtype=torch.float16)
        result = entropy_from_logits(logits)
        expected = _ref_entropy(logits.float())
        assert result.dtype == torch.float16
        torch.testing.assert_close(result.float(), expected, atol=5e-2, rtol=5e-2)

    def test_non_contiguous_3d_transpose(self):
        """Non-contiguous 3D tensor via transpose(0,1)."""
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        V = 256
        raw = torch.randn(32, 4, V, device="cuda", dtype=torch.float32)
        logits = raw.transpose(0, 1)  # (4, 32, V) non-contiguous
        assert not logits.is_contiguous()
        result = entropy_from_logits(logits)
        expected = _ref_entropy(logits)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_non_contiguous_3d_slice(self):
        """Non-contiguous 3D tensor via batch slicing."""
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        V = 256
        raw = torch.randn(8, 32, V, device="cuda", dtype=torch.float32)
        logits = raw[::2]  # (4, 32, V) non-contiguous
        assert not logits.is_contiguous()
        result = entropy_from_logits(logits)
        expected = _ref_entropy(logits)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_many_rows_beyond_max_grid(self):
        """More rows than MAX_GRID (8192) to test chunked dispatch."""
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        logits = torch.randn(10000, 128, device="cuda", dtype=torch.float32)
        result = entropy_from_logits(logits)
        expected = _ref_entropy(logits)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_entropy_non_negative(self):
        from axolotl.monkeypatch.trainer.utils import entropy_from_logits

        logits = torch.randn(32, 512, device="cuda", dtype=torch.float32)
        result = entropy_from_logits(logits)
        assert (result >= -1e-5).all(), f"Negative entropy: {result.min()}"


# ---------------------------------------------------------------------------
# selective_log_softmax — forward correctness
# ---------------------------------------------------------------------------


class TestSelectiveLogSoftmax:
    @pytest.mark.parametrize(
        "B,L,K",
        [
            (1, 128, 1),
            (4, 512, 1),
            (8, 256, 1),
            (4, 256, 4),
            (4, 256, 7),
            (15, 129, 1),  # non-power-of-2
        ],
    )
    def test_correctness_various_shapes(self, B, L, K):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 1024
        torch.manual_seed(42)
        logits = torch.randn(B, L, V, device="cuda", dtype=torch.float32)
        if K == 1:
            index = torch.randint(0, V, (B, L), device="cuda")
        else:
            index = torch.randint(0, V, (B, L, K), device="cuda")
        result = selective_log_softmax(logits, index)
        expected = _ref_selective_log_softmax(logits, index)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_squeezed_index(self):
        """Index with ndim == logits.ndim - 1 triggers squeeze path."""
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 256
        logits = torch.randn(8, V, device="cuda", dtype=torch.float32)
        index = torch.randint(0, V, (8,), device="cuda")
        result = selective_log_softmax(logits, index)
        expected = _ref_selective_log_softmax(logits, index)
        assert result.shape == (8,)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_large_vocab(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 32000
        logits = torch.randn(2, V, device="cuda", dtype=torch.float32)
        index = torch.randint(0, V, (2, 1), device="cuda")
        result = selective_log_softmax(logits, index)
        expected = _ref_selective_log_softmax(logits, index)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_bfloat16(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 1024
        torch.manual_seed(42)
        logits = torch.randn(4, 128, V, device="cuda", dtype=torch.bfloat16)
        index = torch.randint(0, V, (4, 128), device="cuda")
        result = selective_log_softmax(logits, index)
        expected = _ref_selective_log_softmax(logits.float(), index)
        assert result.dtype == torch.bfloat16
        torch.testing.assert_close(result.float(), expected, atol=0.1, rtol=0.1)

    def test_fp32_tight_tolerance(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 1024
        torch.manual_seed(42)
        logits = torch.randn(2, 256, V, device="cuda", dtype=torch.float32)
        index = torch.randint(0, V, (2, 256), device="cuda")
        result = selective_log_softmax(logits, index)
        expected = _ref_selective_log_softmax(logits, index)
        torch.testing.assert_close(result, expected, atol=1e-5, rtol=1e-5)

    def test_all_same_index(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 128
        logits = torch.randn(8, V, device="cuda", dtype=torch.float32)
        index = torch.zeros(8, 1, device="cuda", dtype=torch.long)
        result = selective_log_softmax(logits, index)
        expected = _ref_selective_log_softmax(logits, index)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_last_index(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 128
        logits = torch.randn(8, V, device="cuda", dtype=torch.float32)
        index = torch.full((8, 1), V - 1, device="cuda", dtype=torch.long)
        result = selective_log_softmax(logits, index)
        expected = _ref_selective_log_softmax(logits, index)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)

    def test_output_always_nonpositive(self):
        """Log softmax values should always be <= 0."""
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 256
        logits = torch.randn(32, V, device="cuda", dtype=torch.float32)
        index = torch.randint(0, V, (32, 1), device="cuda")
        result = selective_log_softmax(logits, index)
        assert (result <= 1e-5).all(), f"Positive log-prob: {result.max()}"

    def test_many_rows_beyond_max_grid(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 128
        logits = torch.randn(10000, V, device="cuda", dtype=torch.float32)
        index = torch.randint(0, V, (10000, 1), device="cuda")
        result = selective_log_softmax(logits, index)
        expected = _ref_selective_log_softmax(logits, index)
        torch.testing.assert_close(result, expected, atol=1e-4, rtol=1e-4)


# ---------------------------------------------------------------------------
# selective_log_softmax — backward / gradient correctness
# ---------------------------------------------------------------------------


class TestSelectiveLogSoftmaxBackward:
    @pytest.mark.parametrize(
        "B,L,V,K",
        [
            (2, 16, 64, 1),
            (2, 16, 64, 4),
            (1, 8, 128, 1),
            (2, 8, 128, 7),
        ],
    )
    def test_gradient_matches_reference(self, B, L, V, K):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        torch.manual_seed(42)
        logits_ref = torch.randn(
            B, L, V, device="cuda", dtype=torch.float32, requires_grad=True
        )
        logits_tri = logits_ref.detach().clone().requires_grad_(True)

        if K == 1:
            index = torch.randint(0, V, (B, L), device="cuda")
        else:
            index = torch.randint(0, V, (B, L, K), device="cuda")

        ref_out = _ref_selective_log_softmax(logits_ref, index)
        tri_out = selective_log_softmax(logits_tri, index)

        ref_out.sum().backward()
        tri_out.sum().backward()

        torch.testing.assert_close(
            logits_tri.grad, logits_ref.grad, atol=1e-5, rtol=1e-5
        )

    def test_gradient_bfloat16_full_vocab(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 4096
        torch.manual_seed(42)
        logits_ref = torch.randn(
            2, 64, V, device="cuda", dtype=torch.bfloat16, requires_grad=True
        )
        logits_tri = logits_ref.detach().clone().requires_grad_(True)
        index = torch.randint(0, V, (2, 64), device="cuda")

        _ref_selective_log_softmax(logits_ref, index).sum().backward()
        selective_log_softmax(logits_tri, index).sum().backward()

        torch.testing.assert_close(
            logits_tri.grad.float(), logits_ref.grad.float(), atol=0.1, rtol=0.1
        )

    def test_gradient_k1_squeezed(self):
        """Gradient with squeezed (1D) index."""
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 256
        logits = torch.randn(
            8, V, device="cuda", dtype=torch.float32, requires_grad=True
        )
        index = torch.randint(0, V, (8,), device="cuda")

        result = selective_log_softmax(logits, index)
        result.sum().backward()
        triton_grad = logits.grad.clone()

        logits.grad = None
        ref = torch.gather(
            F.log_softmax(logits, dim=-1), dim=-1, index=index.unsqueeze(-1)
        ).squeeze(-1)
        ref.sum().backward()

        torch.testing.assert_close(triton_grad, logits.grad, atol=1e-4, rtol=1e-4)


# ---------------------------------------------------------------------------
# selective_log_softmax — out-of-bounds index safety
# ---------------------------------------------------------------------------


class TestSelectiveLogSoftmaxOOBSafety:
    """Verify that out-of-range indices don't crash or corrupt valid results."""

    def test_negative_indices_no_crash(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 128
        logits = torch.randn(4, V, device="cuda", dtype=torch.float32)
        index = torch.tensor(
            [[-1], [0], [V - 1], [-5]], device="cuda", dtype=torch.long
        )
        result = selective_log_softmax(logits, index)
        assert result.shape == (4, 1)
        # Valid rows should be finite and match reference
        valid_idx = torch.tensor([[0], [V - 1]], device="cuda", dtype=torch.long)
        valid_logits = logits[1:3]
        expected = _ref_selective_log_softmax(valid_logits, valid_idx)
        torch.testing.assert_close(result[1:3], expected, atol=1e-4, rtol=1e-4)

    def test_index_exceeds_vocab_no_crash(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 128
        logits = torch.randn(4, V, device="cuda", dtype=torch.float32)
        index = torch.tensor(
            [[0], [V], [V + 100], [V - 1]], device="cuda", dtype=torch.long
        )
        result = selective_log_softmax(logits, index)
        assert result.shape == (4, 1)
        # Valid rows (0 and 3) should match reference
        for row_idx, idx_val in [(0, 0), (3, V - 1)]:
            ref = _ref_selective_log_softmax(
                logits[row_idx : row_idx + 1],
                torch.tensor([[idx_val]], device="cuda", dtype=torch.long),
            )
            torch.testing.assert_close(
                result[row_idx : row_idx + 1], ref, atol=1e-4, rtol=1e-4
            )

    def test_mixed_valid_invalid_multi_index(self):
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 256
        K = 3
        logits = torch.randn(4, V, device="cuda", dtype=torch.float32)
        index = torch.tensor(
            [
                [0, 10, -1],  # last invalid
                [V, 5, 100],  # first invalid
                [50, 60, 70],  # all valid
                [-1, V + 1, -100],  # all invalid
            ],
            device="cuda",
            dtype=torch.long,
        )
        result = selective_log_softmax(logits, index)
        assert result.shape == (4, K)
        # Row 2 (all valid) must match reference exactly
        valid_index = torch.tensor([[50, 60, 70]], device="cuda", dtype=torch.long)
        expected = _ref_selective_log_softmax(logits[2:3], valid_index)
        torch.testing.assert_close(result[2:3], expected, atol=1e-4, rtol=1e-4)

    def test_oob_backward_no_crash(self):
        """Backward with OOB indices should not crash and grads should be finite."""
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 128
        logits = torch.randn(
            4, V, device="cuda", dtype=torch.float32, requires_grad=True
        )
        index = torch.tensor(
            [[-1], [0], [V + 10], [V - 1]], device="cuda", dtype=torch.long
        )
        result = selective_log_softmax(logits, index)
        result.sum().backward()
        assert logits.grad is not None
        assert torch.isfinite(logits.grad).all()

    def test_oob_backward_valid_rows_correct(self):
        """Gradients for valid-index rows should match reference even when other rows have OOB."""
        from axolotl.monkeypatch.trainer.utils import selective_log_softmax

        V = 128
        logits = torch.randn(
            4, V, device="cuda", dtype=torch.float32, requires_grad=True
        )
        # Row 0: invalid, Row 1: valid, Row 2: invalid, Row 3: valid
        index = torch.tensor(
            [[-1], [42], [V + 5], [100]], device="cuda", dtype=torch.long
        )
        result = selective_log_softmax(logits, index)
        result.sum().backward()

        # Compute reference gradient for valid rows only
        logits_ref = logits.detach().clone().requires_grad_(True)
        valid_rows = [1, 3]
        valid_indices = [42, 100]
        for r, idx in zip(valid_rows, valid_indices, strict=True):
            ref_lp = F.log_softmax(logits_ref[r : r + 1], dim=-1)
            ref_val = ref_lp[0, idx]
            ref_val.backward(retain_graph=True)

        for r in valid_rows:
            torch.testing.assert_close(
                logits.grad[r], logits_ref.grad[r], atol=1e-4, rtol=1e-4
            )


================================================
FILE: tests/test_utils_tee.py
================================================
import os
import tempfile


def _dummy_cfg(output_dir: str, append: bool = False):
    # Minimal object with attributes used by prepare_debug_log
    class Cfg:
        def __init__(self, out, append):
            self.output_dir = out
            self._append = append

        def get(self, key, default=None):
            if key in {"resume_from_checkpoint", "auto_resume_from_checkpoints"}:
                return self._append
            return default

    return Cfg(output_dir, append)


def read(path: str) -> str:
    with open(path, "r", encoding="utf-8") as f:
        return f.read()


def test_file_only_stream_writes_after_prepare(monkeypatch):
    from axolotl.utils import tee

    with tempfile.TemporaryDirectory() as td:
        # Avoid stdout tee in this test
        monkeypatch.setenv("AXOLOTL_TEE_STDOUT", "0")
        cfg = _dummy_cfg(td, append=False)

        # before prepare: writing to file_only_stream creates no file
        tee.file_only_stream.write("before\n")
        tee.file_only_stream.flush()
        assert not os.path.exists(os.path.join(td, "debug.log"))

        # prepare and write
        path = tee.prepare_debug_log(cfg)
        assert os.path.basename(path) == "debug.log"
        tee.file_only_stream.write("hello\n")
        tee.file_only_stream.flush()

        content = read(path)
        assert "hello" in content

        tee.close_debug_log()


def test_stdout_is_mirrored_after_prepare(capsys, monkeypatch):
    from axolotl.utils import tee

    with tempfile.TemporaryDirectory() as td:
        cfg = _dummy_cfg(td, append=False)
        try:
            # Install tee while capture is disabled so stdout tee wraps real stdout.
            with capsys.disabled():
                monkeypatch.setenv("AXOLOTL_TEE_STDOUT", "1")
                path = tee.prepare_debug_log(cfg)
                import sys

                print("printed-line")
                sys.stdout.flush()

            # Now verify file contains the line
            content = read(path)
            assert "printed-line" in content
        finally:
            tee.close_debug_log()


def test_truncate_vs_append_behavior(monkeypatch):
    from axolotl.utils import tee

    with tempfile.TemporaryDirectory() as td:
        # Avoid stdout tee in this test
        monkeypatch.setenv("AXOLOTL_TEE_STDOUT", "0")
        # First run creates file with A
        cfg = _dummy_cfg(td, append=False)
        _ = tee.prepare_debug_log(cfg)
        try:
            tee.file_only_stream.write("A\n")
            tee.file_only_stream.flush()
        finally:
            tee.close_debug_log()

        # Second run with append=False truncates
        cfg2 = _dummy_cfg(td, append=False)
        path2 = tee.prepare_debug_log(cfg2)
        try:
            tee.file_only_stream.write("B\n")
            tee.file_only_stream.flush()
            content = read(path2)
            assert "A\n" not in content and "B\n" in content
        finally:
            tee.close_debug_log()

        # Third run with append=True preserves existing
        cfg3 = _dummy_cfg(td, append=True)
        path3 = tee.prepare_debug_log(cfg3)
        try:
            tee.file_only_stream.write("C\n")
            tee.file_only_stream.flush()
            content = read(path3)
            assert "B\n" in content and "C\n" in content
        finally:
            tee.close_debug_log()


================================================
FILE: tests/test_validation_dataset.py
================================================
"""Module for testing the validation module for the dataset config"""

import warnings
from typing import Optional

import pytest

from axolotl.utils.config import validate_config
from axolotl.utils.dict import DictDefault
from axolotl.utils.schemas.datasets import ChatTemplate

warnings.filterwarnings("error")


@pytest.fixture(name="minimal_cfg")
def fixture_cfg():
    return DictDefault(
        {
            "base_model": "TinyLlama/TinyLlama-1.1B-Chat-v0.6",
            "learning_rate": 0.000001,
            "micro_batch_size": 1,
            "gradient_accumulation_steps": 1,
        }
    )


class BaseValidation:
    """
    Base validation module to setup the log capture
    """

    _caplog: Optional[pytest.LogCaptureFixture] = None

    @pytest.fixture(autouse=True)
    def inject_fixtures(self, caplog):
        self._caplog = caplog


class TestValidationCheckDatasetConfig(BaseValidation):
    """
    Test the validation for the dataset config to ensure no correct parameters are dropped
    """

    def test_dataset_config_no_drop_param(self, minimal_cfg):
        cfg = DictDefault(
            minimal_cfg
            | {
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                        "shards": 10,
                    }
                ]
            }
        )

        checked_cfg = validate_config(cfg)

        def _check_config():
            assert checked_cfg.datasets[0].path == cfg.datasets[0].path
            assert checked_cfg.datasets[0].type == cfg.datasets[0].type
            assert checked_cfg.datasets[0].shards == cfg.datasets[0].shards

        _check_config()

        checked_cfg = validate_config(
            cfg,
            capabilities={
                "bf16": "false",
                "tf32": "false",
                "n_gpu": 1,
                "compute_capability": "8.0",
            },
            env_capabilities={
                "torch_version": "2.6.0",
            },
        )

        _check_config()

    def test_dataset_default_chat_template_no_drop_param(self, minimal_cfg):
        cfg = DictDefault(
            minimal_cfg
            | {
                "datasets": [
                    {
                        "path": "LDJnr/Puffin",
                        "type": "chat_template",
                        "field_messages": "conversations",
                        "shards": 10,
                        "message_field_role": "from",
                        "message_field_content": "value",
                    }
                ],
            }
        )

        checked_cfg = validate_config(cfg)

        def _check_config():
            assert checked_cfg.datasets[0].path == cfg.datasets[0].path
            assert checked_cfg.datasets[0].type == cfg.datasets[0].type
            assert checked_cfg.chat_template is None
            assert (
                checked_cfg.datasets[0].chat_template == ChatTemplate.tokenizer_default
            )
            assert (
                checked_cfg.datasets[0].field_messages == cfg.datasets[0].field_messages
            )
            assert checked_cfg.datasets[0].shards == cfg.datasets[0].shards
            assert (
                checked_cfg.datasets[0].message_field_role
                == cfg.datasets[0].message_field_role
            )
            assert (
                checked_cfg.datasets[0].message_field_content
                == cfg.datasets[0].message_field_content
            )

        _check_config()

        checked_cfg = validate_config(
            cfg,
            capabilities={
                "bf16": "false",
                "n_gpu": 1,
                "compute_capability": "8.0",
            },
            env_capabilities={
                "torch_version": "2.6.0",
            },
        )

        _check_config()

    def test_dataset_partial_default_chat_template_no_drop_param(self, minimal_cfg):
        cfg = DictDefault(
            minimal_cfg
            | {
                "chat_template": "chatml",
                "datasets": [
                    {
                        "path": "LDJnr/Puffin",
                        "type": "chat_template",
                        "field_messages": "conversations",
                        "shards": 10,
                        "message_field_role": "from",
                        "message_field_content": "value",
                    }
                ],
            }
        )

        checked_cfg = validate_config(cfg)

        def _check_config():
            assert checked_cfg.datasets[0].path == cfg.datasets[0].path
            assert checked_cfg.datasets[0].type == cfg.datasets[0].type
            assert checked_cfg.chat_template == ChatTemplate.chatml
            assert (
                checked_cfg.datasets[0].chat_template == ChatTemplate.tokenizer_default
            )
            assert (
                checked_cfg.datasets[0].field_messages == cfg.datasets[0].field_messages
            )
            assert checked_cfg.datasets[0].shards == cfg.datasets[0].shards
            assert (
                checked_cfg.datasets[0].message_field_role
                == cfg.datasets[0].message_field_role
            )
            assert (
                checked_cfg.datasets[0].message_field_content
                == cfg.datasets[0].message_field_content
            )

        _check_config()

        checked_cfg = validate_config(
            cfg,
            capabilities={
                "bf16": "false",
                "n_gpu": 1,
                "compute_capability": "8.0",
            },
            env_capabilities={
                "torch_version": "2.6.0",
            },
        )

        _check_config()

    def test_dataset_chatml_chat_template_no_drop_param(self, minimal_cfg):
        cfg = DictDefault(
            minimal_cfg
            | {
                "chat_template": "chatml",
                "datasets": [
                    {
                        "path": "LDJnr/Puffin",
                        "type": "chat_template",
                        "chat_template": "gemma",
                        "field_messages": "conversations",
                        "shards": 10,
                        "message_field_role": "from",
                        "message_field_content": "value",
                    }
                ],
            }
        )

        checked_cfg = validate_config(cfg)

        def _check_config():
            assert checked_cfg.datasets[0].path == cfg.datasets[0].path
            assert checked_cfg.datasets[0].type == cfg.datasets[0].type
            assert checked_cfg.chat_template == cfg.chat_template
            assert (
                checked_cfg.datasets[0].chat_template == cfg.datasets[0].chat_template
            )
            assert (
                checked_cfg.datasets[0].field_messages == cfg.datasets[0].field_messages
            )
            assert checked_cfg.datasets[0].shards == cfg.datasets[0].shards
            assert (
                checked_cfg.datasets[0].message_field_role
                == cfg.datasets[0].message_field_role
            )
            assert (
                checked_cfg.datasets[0].message_field_content
                == cfg.datasets[0].message_field_content
            )

        _check_config()

        checked_cfg = validate_config(
            cfg,
            capabilities={
                "bf16": "false",
                "n_gpu": 1,
                "compute_capability": "8.0",
            },
            env_capabilities={
                "torch_version": "2.6.0",
            },
        )

        _check_config()

    def test_dataset_sharegpt_deprecation(self, minimal_cfg):
        cfg = DictDefault(
            minimal_cfg
            | {
                "chat_template": "chatml",
                "datasets": [
                    {
                        "path": "LDJnr/Puffin",
                        "type": "sharegpt",
                        "conversation": "chatml",
                    }
                ],
            }
        )

        # Check sharegpt deprecation is raised
        with pytest.raises(ValueError, match=r".*type: sharegpt.*` is deprecated.*"):
            validate_config(cfg)

        # Check that deprecation is not thrown for non-str type
        cfg = DictDefault(
            minimal_cfg
            | {
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": {
                            "field_instruction": "instruction",
                            "field_output": "output",
                            "field_system": "system",
                            "format": "<|user|> {instruction} {input} <|model|>",
                            "no_input_format": "<|user|> {instruction} <|model|>",
                            "system_prompt": "",
                        },
                    }
                ],
            }
        )

        validate_config(cfg)

        # Check that deprecation is not thrown for non-sharegpt type
        cfg = DictDefault(
            minimal_cfg
            | {
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    }
                ],
            }
        )

        validate_config(cfg)

    def test_message_property_mappings(self, minimal_cfg):
        cfg = DictDefault(
            minimal_cfg
            | {
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                        "message_property_mappings": {
                            "role": "role",
                            "content": "content",
                        },
                    }
                ],
            }
        )

        validate_config(cfg)


class TestOptimizerValidation(BaseValidation):
    """
    Test muon optimizer validation
    """

    def test_muon_deepspeed(self, minimal_cfg):
        cfg = DictDefault(
            minimal_cfg
            | {
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    }
                ],
                "optimizer": "muon",
                "deepspeed": "deepspeed_configs/zero3.json",
            }
        )

        with pytest.raises(ValueError, match=r".*is currently incompatible with*"):
            validate_config(cfg)

    def test_muon_fsdp(self, minimal_cfg):
        cfg = DictDefault(
            minimal_cfg
            | {
                "datasets": [
                    {
                        "path": "mhenrichsen/alpaca_2k_test",
                        "type": "alpaca",
                    }
                ],
                "optimizer": "muon",
                "fsdp": ["full_shard"],
                "fsdp_config": {
                    "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                },
            }
        )

        with pytest.raises(ValueError, match=r".*only compatible with FSDP2.*"):
            validate_config(cfg)


================================================
FILE: tests/utils/callbacks/test_dynamic_checkpoint.py
================================================
"""Unit tests for dynamic checkpoint callback"""

import tempfile
from pathlib import Path
from unittest.mock import MagicMock, Mock, patch

from axolotl.utils.callbacks.dynamic_checkpoint import (
    DEFAULT_TRIGGER_FILENAME,
    DynamicCheckpointCallback,
)
from axolotl.utils.dict import DictDefault


class TestDynamicCheckpointCallbackInit:
    """Test callback initialization"""

    def test_callback_disabled_by_default(self):
        """Test that callback is disabled when config.enabled=False"""
        with tempfile.TemporaryDirectory() as tmpdir:
            cfg = DictDefault(
                {
                    "dynamic_checkpoint": {"enabled": False},
                    "output_dir": tmpdir,
                }
            )
            callback = DynamicCheckpointCallback(cfg)
            assert callback.enabled is False

    def test_callback_disabled_when_none(self):
        """Test that callback is disabled when dynamic_checkpoint is None"""
        with tempfile.TemporaryDirectory() as tmpdir:
            cfg = DictDefault(
                {
                    "dynamic_checkpoint": None,
                    "output_dir": tmpdir,
                }
            )
            callback = DynamicCheckpointCallback(cfg)
            assert callback.enabled is False

    def test_callback_enabled_when_configured(self):
        """Test that callback is enabled when config.enabled=True"""
        with tempfile.TemporaryDirectory() as tmpdir:
            cfg = DictDefault(
                {
                    "dynamic_checkpoint": {"enabled": True, "check_interval": 10},
                    "output_dir": tmpdir,
                }
            )
            callback = DynamicCheckpointCallback(cfg)
            assert callback.enabled is True
            assert callback.check_interval == 10

    def test_default_trigger_filename(self):
        """Test that default trigger filename is used"""
        with tempfile.TemporaryDirectory() as tmpdir:
            cfg = DictDefault(
                {
                    "dynamic_checkpoint": {"enabled": True, "check_interval": 10},
                    "output_dir": tmpdir,
                }
            )
            callback = DynamicCheckpointCallback(cfg)
            assert callback.trigger_filename == DEFAULT_TRIGGER_FILENAME

    def test_check_interval_default(self):
        """Test default check interval"""
        with tempfile.TemporaryDirectory() as tmpdir:
            cfg = DictDefault(
                {
                    "dynamic_checkpoint": {"enabled": True},
                    "output_dir": tmpdir,
                }
            )
            callback = DynamicCheckpointCallback(cfg)
            assert callback.check_interval == 100  # Default from schema


class TestDynamicCheckpointFileDetection:
    """Test file-based checkpoint triggering"""

    def test_trigger_file_detected_and_deleted(self):
        """Test that trigger file is detected and deleted"""
        with tempfile.TemporaryDirectory() as tmpdir:
            cfg = DictDefault(
                {
                    "dynamic_checkpoint": {"enabled": True, "check_interval": 1},
                    "output_dir": tmpdir,
                }
            )
            callback = DynamicCheckpointCallback(cfg)

            trigger_file = Path(tmpdir) / DEFAULT_TRIGGER_FILENAME
            trigger_file.touch()
            assert trigger_file.exists()

            args = Mock(output_dir=tmpdir)
            state = Mock(global_step=1)
            control = Mock(should_save=False)

            with patch(
                "axolotl.utils.callbacks.dynamic_checkpoint.is_main_process",
                return_value=True,
            ):
                with patch(
                    "axolotl.utils.callbacks.dynamic_checkpoint.is_distributed",
                    return_value=False,
                ):
                    result = callback.on_step_end(args, state, control)

            assert not trigger_file.exists()
            assert result.should_save is True

    def test_check_interval_honored(self):
        """Test that file is only checked at check_interval steps"""
        with tempfile.TemporaryDirectory() as tmpdir:
            cfg = DictDefault(
                {
                    "dynamic_checkpoint": {"enabled": True, "check_interval": 10},
                    "output_dir": tmpdir,
                }
            )
            callback = DynamicCheckpointCallback(cfg)

            args = Mock(output_dir=tmpdir)
            control = Mock(should_save=False)

            trigger_file = Path(tmpdir) / DEFAULT_TRIGGER_FILENAME
            trigger_file.touch()

            with patch(
                "axolotl.utils.callbacks.dynamic_checkpoint.is_main_process",
                return_value=True,
            ):
                with patch(
                    "axolotl.utils.callbacks.dynamic_checkpoint.is_distributed",
                    return_value=False,
                ):
                    # Step 5 - shouldn't check (not divisible by 10)
                    state = Mock(global_step=5)
                    result = callback.on_step_end(args, state, control)
                    assert trigger_file.exists()  # Still there
                    assert result.should_save is False

                    # Step 10 - should check
                    state = Mock(global_step=10)
                    result = callback.on_step_end(args, state, control)
                    assert not trigger_file.exists()  # Deleted
                    assert result.should_save is True

    def test_no_file_no_trigger(self):
        """Test that no trigger occurs when file doesn't exist"""
        with tempfile.TemporaryDirectory() as tmpdir:
            cfg = DictDefault(
                {
                    "dynamic_checkpoint": {"enabled": True, "check_interval": 1},
                    "output_dir": tmpdir,
                }
            )
            callback = DynamicCheckpointCallback(cfg)

            args = Mock(output_dir=tmpdir)
            state = Mock(global_step=1)
            control = Mock(should_save=False)

            with patch(
                "axolotl.utils.callbacks.dynamic_checkpoint.is_main_process",
                return_value=True,
            ):
                with patch(
                    "axolotl.utils.callbacks.dynamic_checkpoint.is_distributed",
                    return_value=False,
                ):
                    result = callback.on_step_end(args, state, control)

            assert result.should_save is False

    def test_file_deletion_error_handling(self):
        """Test that file deletion errors are handled gracefully"""
        with tempfile.TemporaryDirectory() as tmpdir:
            cfg = DictDefault(
                {
                    "dynamic_checkpoint": {"enabled": True, "check_interval": 1},
                    "output_dir": tmpdir,
                }
            )
            callback = DynamicCheckpointCallback(cfg)

            trigger_file = Path(tmpdir) / DEFAULT_TRIGGER_FILENAME
            trigger_file.touch()

            args = Mock(output_dir=tmpdir)
            state = Mock(global_step=1)
            control = Mock(should_save=False)

            with patch(
                "axolotl.utils.callbacks.dynamic_checkpoint.is_main_process",
                return_value=True,
            ):
                with patch(
                    "axolotl.utils.callbacks.dynamic_checkpoint.is_distributed",
                    return_value=False,
                ):
                    with patch.object(
                        Path, "unlink", side_effect=OSError("Permission denied")
                    ):
                        result = callback.on_step_end(args, state, control)

            assert result.should_save is True


class TestDynamicCheckpointMultiGPU:
    """Test multi-GPU synchronization"""

    def test_only_rank_0_checks_file(self):
        """Test that only rank 0 checks filesystem in multi-GPU setup"""
        with tempfile.TemporaryDirectory() as tmpdir:
            cfg = DictDefault(
                {
                    "dynamic_checkpoint": {"enabled": True, "check_interval": 1},
                    "output_dir": tmpdir,
                }
            )
            callback = DynamicCheckpointCallback(cfg)

            trigger_file = Path(tmpdir) / DEFAULT_TRIGGER_FILENAME
            trigger_file.touch()

            args = Mock(output_dir=tmpdir)
            state = Mock(global_step=1)
            control = Mock(should_save=False)

            # Rank 1 (not main process) - shouldn't check file
            with patch(
                "axolotl.utils.callbacks.dynamic_checkpoint.is_main_process",
                return_value=False,
            ):
                with patch(
                    "axolotl.utils.callbacks.dynamic_checkpoint.is_distributed",
                    return_value=True,
                ):
                    with patch("torch.distributed.broadcast") as mock_broadcast:
                        with patch(
                            "axolotl.utils.callbacks.dynamic_checkpoint.barrier"
                        ):
                            mock_tensor = MagicMock()
                            mock_tensor.item.return_value = 0
                            with patch("torch.tensor", return_value=mock_tensor):
                                callback.on_step_end(args, state, control)

            assert trigger_file.exists()
            # Broadcast should have been called
            assert mock_broadcast.called

    def test_broadcast_synchronization(self):
        """Test that trigger decision is broadcasted to all ranks"""
        with tempfile.TemporaryDirectory() as tmpdir:
            cfg = DictDefault(
                {
                    "dynamic_checkpoint": {"enabled": True, "check_interval": 1},
                    "output_dir": tmpdir,
                }
            )
            callback = DynamicCheckpointCallback(cfg)

            trigger_file = Path(tmpdir) / DEFAULT_TRIGGER_FILENAME
            trigger_file.touch()

            args = Mock(output_dir=tmpdir)
            state = Mock(global_step=1)
            control = Mock(should_save=False)

            # Rank 0 detects file
            with patch(
                "axolotl.utils.callbacks.dynamic_checkpoint.is_main_process",
                return_value=True,
            ):
                with patch(
                    "axolotl.utils.callbacks.dynamic_checkpoint.is_distributed",
                    return_value=True,
                ):
                    with patch("torch.distributed.broadcast") as mock_broadcast:
                        with patch(
                            "axolotl.utils.callbacks.dynamic_checkpoint.barrier"
                        ) as mock_barrier:
                            mock_tensor = MagicMock()
                            mock_tensor.item.return_value = 1
                            with patch("torch.tensor", return_value=mock_tensor):
                                with patch("torch.cuda.current_device", return_value=0):
                                    result = callback.on_step_end(args, state, control)

            assert mock_broadcast.called
            assert mock_barrier.called
            # All ranks should trigger
            assert result.should_save is True


class TestDynamicCheckpointSignalHandling:
    """Test signal-based checkpoint triggering"""

    def test_signal_trigger_via_callback(self):
        """Test that signal flag triggers checkpoint save"""
        with tempfile.TemporaryDirectory() as tmpdir:
            cfg = DictDefault(
                {
                    "dynamic_checkpoint": {
                        "enabled": True,
                        "check_interval": 1,
                        "enable_signal": True,
                    },
                    "output_dir": tmpdir,
                }
            )

            with patch("signal.signal"):
                with patch(
                    "axolotl.utils.callbacks.dynamic_checkpoint.is_main_process",
                    return_value=True,
                ):
                    with patch(
                        "axolotl.utils.callbacks.dynamic_checkpoint.hasattr",
                        return_value=True,
                    ):
                        callback = DynamicCheckpointCallback(cfg)

            callback.should_save_checkpoint = True

            args = Mock(output_dir=tmpdir)
            state = Mock(global_step=1)
            control = Mock(should_save=False)

            with patch(
                "axolotl.utils.callbacks.dynamic_checkpoint.is_main_process",
                return_value=True,
            ):
                with patch(
                    "axolotl.utils.callbacks.dynamic_checkpoint.is_distributed",
                    return_value=False,
                ):
                    result = callback.on_step_end(args, state, control)

            assert result.should_save is True
            assert callback.should_save_checkpoint is False

    def test_signal_not_registered_when_disabled(self):
        """Test that signal handler is not registered when disabled"""
        with tempfile.TemporaryDirectory() as tmpdir:
            cfg = DictDefault(
                {
                    "dynamic_checkpoint": {
                        "enabled": True,
                        "check_interval": 10,
                        "enable_signal": False,
                    },
                    "output_dir": tmpdir,
                }
            )

            with patch("signal.signal") as mock_signal_register:
                _ = DynamicCheckpointCallback(cfg)

            assert not mock_signal_register.called


class TestDynamicCheckpointDisabled:
    """Test behavior when callback is disabled"""

    def test_disabled_callback_does_nothing(self):
        """Test that disabled callback doesn't check or trigger"""
        with tempfile.TemporaryDirectory() as tmpdir:
            cfg = DictDefault(
                {
                    "dynamic_checkpoint": {"enabled": False},
                    "output_dir": tmpdir,
                }
            )
            callback = DynamicCheckpointCallback(cfg)

            trigger_file = Path(tmpdir) / DEFAULT_TRIGGER_FILENAME
            trigger_file.touch()

            args = Mock(output_dir=tmpdir)
            state = Mock(global_step=1)
            control = Mock(should_save=False)

            result = callback.on_step_end(args, state, control)

            assert trigger_file.exists()
            assert result.should_save is False


================================================
FILE: tests/utils/data/test_utils.py
================================================
"""
Unit tests for data utility functions
"""

import unittest
from unittest.mock import MagicMock

from datasets import Dataset

from axolotl.utils.data.utils import handle_long_seq_in_dataset
from axolotl.utils.dict import DictDefault


class TestHandleLongSeqInDataset(unittest.TestCase):
    """
    Test class for handle_long_seq_in_dataset function
    """

    def test_drop_strategy_removes_long_sequences(self):
        """Test that 'drop' strategy removes sequences longer than sequence_len"""
        # Create dataset with mixed length sequences
        dataset = Dataset.from_dict(
            {
                "input_ids": [
                    [1, 2, 3],  # length 3 - keep
                    [1, 2, 3, 4, 5],  # length 5 - keep
                    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],  # length 11 - drop
                    [1, 2],  # length 2 - keep
                ]
            }
        )

        cfg = DictDefault(
            {
                "excess_length_strategy": "drop",
                "min_sample_len": 2,
                "dataset_num_proc": None,
                "is_preprocess": False,
            }
        )

        result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

        # Should have dropped the sequence with length 11
        self.assertEqual(len(result), 3)
        self.assertEqual(len(result[0]["input_ids"]), 3)
        self.assertEqual(len(result[1]["input_ids"]), 5)
        self.assertEqual(len(result[2]["input_ids"]), 2)

    def test_drop_strategy_is_default(self):
        """Test that 'drop' is the default strategy when not specified"""
        dataset = Dataset.from_dict(
            {
                "input_ids": [
                    [1, 2, 3],
                    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],  # length 11 - should drop
                ]
            }
        )

        cfg = DictDefault(
            {
                "min_sample_len": 2,
                "dataset_num_proc": None,
                "is_preprocess": False,
            }
        )

        result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

        # Should have dropped the long sequence
        self.assertEqual(len(result), 1)

    def test_truncate_strategy_truncates_long_sequences(self):
        """Test that 'truncate' strategy truncates sequences to sequence_len"""
        dataset = Dataset.from_dict(
            {
                "input_ids": [
                    [1, 2, 3],  # length 3 - keep as is
                    [
                        1,
                        2,
                        3,
                        4,
                        5,
                        6,
                        7,
                        8,
                        9,
                        10,
                        11,
                        12,
                    ],  # length 12 - truncate to 10
                ]
            }
        )

        cfg = DictDefault(
            {
                "excess_length_strategy": "truncate",
                "min_sample_len": 2,
                "dataset_num_proc": None,
                "is_preprocess": False,
            }
        )

        result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

        # Should have 2 samples
        self.assertEqual(len(result), 2)
        # First sample unchanged
        self.assertEqual(len(result[0]["input_ids"]), 3)
        # Second sample truncated to 10
        self.assertEqual(len(result[1]["input_ids"]), 10)
        self.assertEqual(result[1]["input_ids"], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

    def test_truncate_strategy_truncates_all_auxiliary_fields(self):
        """Test that truncation applies to all auxiliary fields consistently"""
        dataset = Dataset.from_dict(
            {
                "input_ids": [
                    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                ],
                "attention_mask": [
                    [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
                ],
                "labels": [
                    [-100, -100, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                ],
                "position_ids": [
                    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],
                ],
            }
        )

        cfg = DictDefault(
            {
                "excess_length_strategy": "truncate",
                "min_sample_len": 2,
                "dataset_num_proc": None,
                "is_preprocess": False,
            }
        )

        result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

        # All fields should be truncated to 10
        self.assertEqual(len(result[0]["input_ids"]), 10)
        self.assertEqual(len(result[0]["attention_mask"]), 10)
        self.assertEqual(len(result[0]["labels"]), 10)
        self.assertEqual(len(result[0]["position_ids"]), 10)

        # Verify content is correct
        self.assertEqual(result[0]["input_ids"], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        self.assertEqual(result[0]["attention_mask"], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1])
        self.assertEqual(result[0]["labels"], [-100, -100, 3, 4, 5, 6, 7, 8, 9, 10])
        self.assertEqual(result[0]["position_ids"], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

    def test_raise_strategy_raises_on_long_sequences(self):
        """Test that 'raise' strategy raises ValueError when encountering long sequences"""
        dataset = Dataset.from_dict(
            {
                "input_ids": [
                    [1, 2, 3],
                    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],  # length 11 - should raise
                ]
            }
        )

        cfg = DictDefault(
            {
                "excess_length_strategy": "raise",
                "min_sample_len": 2,
                "dataset_num_proc": None,
                "is_preprocess": False,
            }
        )

        with self.assertRaises(ValueError):
            handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

    def test_min_sequence_len_filters_short_sequences(self):
        """Test that sequences shorter than min_sample_len are filtered out"""
        dataset = Dataset.from_dict(
            {
                "input_ids": [
                    [1],  # length 1 - drop (< min_sample_len=3)
                    [1, 2],  # length 2 - drop
                    [1, 2, 3],  # length 3 - keep
                    [1, 2, 3, 4, 5],  # length 5 - keep
                ]
            }
        )

        cfg = DictDefault(
            {
                "excess_length_strategy": "drop",
                "min_sample_len": 3,
                "dataset_num_proc": None,
                "is_preprocess": False,
            }
        )

        result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

        # Should only keep sequences with length >= 3
        self.assertEqual(len(result), 2)
        self.assertEqual(len(result[0]["input_ids"]), 3)
        self.assertEqual(len(result[1]["input_ids"]), 5)

    def test_dataset_without_input_ids_column(self):
        """Test that datasets without 'input_ids' column are returned unchanged"""
        dataset = Dataset.from_dict(
            {
                "chosen": [1, 2, 3],
                "rejected": [4, 5, 6],
            }
        )

        cfg = DictDefault(
            {
                "excess_length_strategy": "drop",
                "min_sample_len": 2,
            }
        )

        result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

        # Dataset should be unchanged
        self.assertEqual(len(result), len(dataset))
        self.assertListEqual(list(result.column_names), ["chosen", "rejected"])

    def test_truncate_filters_short_before_truncating(self):
        """Test that truncate strategy filters short sequences before truncating long ones

        This is important for efficiency - we should not waste time truncating
        sequences that will be filtered out anyway.
        """
        dataset = Dataset.from_dict(
            {
                "input_ids": [
                    [1],  # length 1 - filter out first
                    [1, 2, 3],  # length 3 - keep, no truncation needed
                    [
                        1,
                        2,
                        3,
                        4,
                        5,
                        6,
                        7,
                        8,
                        9,
                        10,
                        11,
                        12,
                    ],  # length 12 - keep and truncate
                ]
            }
        )

        cfg = DictDefault(
            {
                "excess_length_strategy": "truncate",
                "min_sample_len": 2,
                "dataset_num_proc": None,
                "is_preprocess": False,
            }
        )

        result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

        # Should have filtered out the first (short) sequence
        self.assertEqual(len(result), 2)
        # Second sample unchanged
        self.assertEqual(len(result[0]["input_ids"]), 3)
        # Third sample truncated to 10
        self.assertEqual(len(result[1]["input_ids"]), 10)

    def test_case_insensitive_strategy(self):
        """Test that excess_length_strategy is case-insensitive"""
        dataset = Dataset.from_dict(
            {
                "input_ids": [
                    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                ]
            }
        )

        cfg = DictDefault(
            {
                "excess_length_strategy": "TRUNCATE",  # uppercase
                "min_sample_len": 2,
                "dataset_num_proc": None,
                "is_preprocess": False,
            }
        )

        result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

        # Should still truncate
        self.assertEqual(len(result[0]["input_ids"]), 10)

    def test_raise_strategy_silently_drops_short_sequences(self):
        """Test that 'raise' strategy drops short sequences without raising"""
        dataset = Dataset.from_dict(
            {
                "input_ids": [
                    [1],  # length 1 - too short, should be dropped silently
                    [1, 2, 3, 4, 5],  # length 5 - keep
                ]
            }
        )

        cfg = DictDefault(
            {
                "excess_length_strategy": "raise",
                "min_sample_len": 3,
                "dataset_num_proc": None,
                "is_preprocess": False,
            }
        )

        # Should NOT raise, just silently drop the short sequence
        result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

        self.assertEqual(len(result), 1)
        self.assertEqual(len(result[0]["input_ids"]), 5)

    def test_drop_boundary_sequence_equal_to_sequence_len(self):
        """Test that drop strategy keeps sequences with length exactly equal to sequence_len"""
        dataset = Dataset.from_dict(
            {
                "input_ids": [
                    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],  # length 10 == sequence_len
                    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],  # length 11 > sequence_len
                ]
            }
        )

        cfg = DictDefault(
            {
                "excess_length_strategy": "drop",
                "min_sample_len": 2,
                "dataset_num_proc": None,
                "is_preprocess": False,
            }
        )

        result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

        # Exactly equal should be kept, one over should be dropped
        self.assertEqual(len(result), 1)
        self.assertEqual(len(result[0]["input_ids"]), 10)

    def test_truncate_boundary_sequence_equal_to_sequence_len(self):
        """Test that truncate strategy leaves sequences with length exactly equal to sequence_len unchanged"""
        dataset = Dataset.from_dict(
            {
                "input_ids": [
                    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],  # length 10 == sequence_len
                ]
            }
        )

        cfg = DictDefault(
            {
                "excess_length_strategy": "truncate",
                "min_sample_len": 2,
                "dataset_num_proc": None,
                "is_preprocess": False,
            }
        )

        result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

        # Should be unchanged - not truncated
        self.assertEqual(len(result), 1)
        self.assertEqual(result[0]["input_ids"], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])

    def test_empty_dataset(self):
        """Test that an empty dataset is handled gracefully"""
        dataset = Dataset.from_dict({"input_ids": []})

        cfg = DictDefault(
            {
                "excess_length_strategy": "drop",
                "min_sample_len": 2,
                "dataset_num_proc": None,
                "is_preprocess": False,
            }
        )

        result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

        self.assertEqual(len(result), 0)

    def test_all_sequences_dropped_returns_empty_dataset(self):
        """Test that dropping all sequences results in an empty dataset"""
        dataset = Dataset.from_dict(
            {
                "input_ids": [
                    [1],  # too short
                    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],  # too long
                ]
            }
        )

        cfg = DictDefault(
            {
                "excess_length_strategy": "drop",
                "min_sample_len": 5,
                "dataset_num_proc": None,
                "is_preprocess": False,
            }
        )

        result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

        self.assertEqual(len(result), 0)

    def test_iterable_dataset_skips_processing(self):
        """Test that streaming datasets (column_names is None) are returned unchanged.

        The skip check in _should_skip_processing triggers when column_names is
        None, which happens with true streaming datasets loaded via
        load_dataset(..., streaming=True).
        """
        mock_dataset = MagicMock()
        mock_dataset.column_names = None

        cfg = DictDefault(
            {
                "excess_length_strategy": "drop",
                "min_sample_len": 2,
                "dataset_num_proc": None,
                "is_preprocess": False,
            }
        )

        result = handle_long_seq_in_dataset(mock_dataset, sequence_len=10, cfg=cfg)

        # Should be returned unchanged (same object)
        self.assertIs(result, mock_dataset)

    def test_truncate_with_partial_auxiliary_fields(self):
        """Test truncation when only some auxiliary fields are present"""
        dataset = Dataset.from_dict(
            {
                "input_ids": [
                    [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                ],
                "labels": [
                    [-100, -100, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
                ],
                # No attention_mask or position_ids
            }
        )

        cfg = DictDefault(
            {
                "excess_length_strategy": "truncate",
                "min_sample_len": 2,
                "dataset_num_proc": None,
                "is_preprocess": False,
            }
        )

        result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

        self.assertEqual(len(result[0]["input_ids"]), 10)
        self.assertEqual(len(result[0]["labels"]), 10)
        self.assertEqual(result[0]["input_ids"], [1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
        self.assertEqual(result[0]["labels"], [-100, -100, 3, 4, 5, 6, 7, 8, 9, 10])
        # Confirm no extra columns were introduced
        self.assertListEqual(sorted(result.column_names), ["input_ids", "labels"])

    def test_min_sample_len_defaults_to_two_when_not_set(self):
        """Test that min_sample_len defaults to 2 when not specified in config"""
        dataset = Dataset.from_dict(
            {
                "input_ids": [
                    [1],  # length 1 - should be dropped (< default 2)
                    [1, 2],  # length 2 - should be kept (>= default 2)
                    [1, 2, 3],  # length 3 - should be kept
                ]
            }
        )

        cfg = DictDefault(
            {
                "excess_length_strategy": "drop",
                # min_sample_len not set
                "dataset_num_proc": None,
                "is_preprocess": False,
            }
        )

        result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

        self.assertEqual(len(result), 2)
        self.assertEqual(len(result[0]["input_ids"]), 2)
        self.assertEqual(len(result[1]["input_ids"]), 3)

    def test_invalid_strategy_falls_through_to_drop(self):
        """Test that an unrecognized strategy value falls through to drop behavior"""
        dataset = Dataset.from_dict(
            {
                "input_ids": [
                    [1, 2, 3],  # keep
                    [
                        1,
                        2,
                        3,
                        4,
                        5,
                        6,
                        7,
                        8,
                        9,
                        10,
                        11,
                    ],  # length 11 - should be dropped
                ]
            }
        )

        cfg = DictDefault(
            {
                "excess_length_strategy": "not_a_real_strategy",
                "min_sample_len": 2,
                "dataset_num_proc": None,
                "is_preprocess": False,
            }
        )

        result = handle_long_seq_in_dataset(dataset, sequence_len=10, cfg=cfg)

        # Should behave like 'drop'
        self.assertEqual(len(result), 1)
        self.assertEqual(len(result[0]["input_ids"]), 3)


if __name__ == "__main__":
    unittest.main()


================================================
FILE: tests/utils/lora/test_config_validation_lora.py
================================================
import pytest

from axolotl.utils.config import validate_config
from axolotl.utils.dict import DictDefault


class TestLoRAConfigValidation:
    """Test suite for LoRA/QLoRA configuration validation"""

    def test_basic_configuration_validation(self):
        """Test basic LoRA configuration validation"""

        valid_config = DictDefault(
            {
                "adapter": "lora",
                "lora_r": 8,
                "lora_alpha": 16,
                "lora_dropout": 0.1,
                "lora_target_modules": ["q_proj", "v_proj"],
                "datasets": [{"path": "dummy_dataset", "type": "alpaca"}],
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "learning_rate": 1e-5,
                "base_model": "dummy_model",
            }
        )

        result = validate_config(valid_config)
        assert result["adapter"] == "lora"

        with pytest.raises(ValueError, match="not compatible with DoRA"):
            invalid_config = DictDefault(
                {
                    "adapter": "lora",
                    "lora_mlp_kernel": True,
                    "peft_use_dora": True,
                    "datasets": [{"path": "dummy_dataset", "type": "alpaca"}],
                    "micro_batch_size": 1,
                    "gradient_accumulation_steps": 1,
                    "learning_rate": 1e-5,
                    "base_model": "dummy_model",
                }
            )
            validate_config(invalid_config)

    def test_qlora_4bit_validation(self):
        """Test QLoRA 4-bit configuration validation"""
        valid_config = DictDefault(
            {
                "adapter": "qlora",
                "load_in_4bit": True,
                "bnb_4bit_compute_dtype": "float16",
                "datasets": [{"path": "dummy_dataset", "type": "alpaca"}],
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "learning_rate": 1e-5,
                "base_model": "dummy_model",
            }
        )
        result = validate_config(valid_config)
        assert result["adapter"] == "qlora"
        assert result["load_in_4bit"] is True

        # Test QLoRA without 4-bit (should fail via PEFT validation)
        with pytest.raises(ValueError, match=r"Require cfg\.load_in_4bit"):
            invalid_config = DictDefault(
                {
                    "adapter": "qlora",
                    "load_in_4bit": False,
                    "datasets": [{"path": "dummy_dataset", "type": "alpaca"}],
                    "micro_batch_size": 1,
                    "gradient_accumulation_steps": 1,
                    "learning_rate": 1e-5,
                    "base_model": "dummy_model",
                }
            )
            validate_config(invalid_config)

        # Test QLoRA with 8-bit (incompatible)
        with pytest.raises(ValueError, match="Can't load qlora in 8bit"):
            invalid_config = DictDefault(
                {
                    "adapter": "qlora",
                    "load_in_8bit": True,
                    "datasets": [{"path": "dummy_dataset", "type": "alpaca"}],
                    "micro_batch_size": 1,
                    "gradient_accumulation_steps": 1,
                    "learning_rate": 1e-5,
                    "base_model": "dummy_model",
                }
            )
            validate_config(invalid_config)

    @pytest.mark.parametrize(
        "kernel_field", ["lora_mlp_kernel", "lora_qkv_kernel", "lora_o_kernel"]
    )
    def test_lora_kernels_trust_remote_code_incompatible(self, kernel_field):
        """Test that lora kernels are incompatible with trust_remote_code"""
        with pytest.raises(ValueError, match="not compatible with trust_remote_code"):
            invalid_config = DictDefault(
                {
                    "adapter": "lora",
                    kernel_field: True,
                    "trust_remote_code": True,
                    "datasets": [{"path": "dummy_dataset", "type": "alpaca"}],
                    "micro_batch_size": 1,
                    "gradient_accumulation_steps": 1,
                    "learning_rate": 1e-5,
                    "base_model": "dummy_model",
                }
            )
            validate_config(invalid_config)

    def test_lora_kernels_trust_remote_code_false(self):
        """Test that lora kernels work when trust_remote_code is false"""
        # Test with trust_remote_code=False, lora kernels should be allowed
        valid_config = DictDefault(
            {
                "adapter": "lora",
                "lora_mlp_kernel": True,
                "lora_qkv_kernel": True,
                "lora_o_kernel": True,
                "trust_remote_code": False,
                "datasets": [{"path": "dummy_dataset", "type": "alpaca"}],
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "learning_rate": 1e-5,
                "base_model": "dummy_model",
            }
        )
        result = validate_config(valid_config)
        assert result["lora_mlp_kernel"] is True
        assert result["lora_qkv_kernel"] is True
        assert result["lora_o_kernel"] is True

        # Test with trust_remote_code=None (unset), kernels should be allowed
        valid_config = DictDefault(
            {
                "adapter": "lora",
                "lora_qkv_kernel": True,
                "trust_remote_code": None,
                "datasets": [{"path": "dummy_dataset", "type": "alpaca"}],
                "micro_batch_size": 1,
                "gradient_accumulation_steps": 1,
                "learning_rate": 1e-5,
                "base_model": "dummy_model",
            }
        )
        result = validate_config(valid_config)
        assert result["lora_qkv_kernel"] is True
        assert result["trust_remote_code"] is None


================================================
FILE: tests/utils/lora/test_freeze_lora.py
================================================
import importlib.util
from unittest.mock import Mock

import pytest
import torch
import torch.nn as nn

from axolotl.kernels.lora import get_lora_parameters

PEFT_AVAILABLE = importlib.util.find_spec("peft") is not None


class TestLoRAParameterFreezing:
    """Test suite for LoRA parameter freezing validation."""

    def setup_method(self):
        self.dtype = torch.float32

    def create_mock_lora_layer(
        self, has_adapters=True, adapters_disabled=False, merged=False
    ):
        """Create a mock LoRA layer for testing."""
        mock_layer = Mock()

        base_layer = Mock()
        base_layer.weight = torch.randn(512, 256, dtype=self.dtype)
        base_layer.bias = torch.randn(512, dtype=self.dtype)

        if has_adapters:
            mock_layer.base_layer = base_layer
            mock_layer.disable_adapters = adapters_disabled
            mock_layer.merged = merged

            mock_layer.active_adapters = ["default"]
            mock_layer.lora_A = {"default": Mock()}
            mock_layer.lora_B = {"default": Mock()}
            mock_layer.scaling = {"default": 0.1}

            mock_layer.lora_A["default"].weight = torch.randn(16, 256, dtype=self.dtype)
            mock_layer.lora_B["default"].weight = torch.randn(512, 16, dtype=self.dtype)
        else:
            mock_layer.weight = base_layer.weight
            mock_layer.bias = base_layer.bias

        return mock_layer

    def test_parameter_freezing_adapters_disabled(self):
        """Test that LoRA parameters are None when adapters are disabled."""
        layer = self.create_mock_lora_layer(has_adapters=True, adapters_disabled=True)

        W, b, quant_state, A, B, s = get_lora_parameters(layer)

        # Base parameters should be returned
        assert W is not None
        assert b is not None
        # LoRA parameters should be None (frozen)
        assert A is None
        assert B is None
        assert s is None

    def test_parameter_freezing_adapters_merged(self):
        """Test that LoRA parameters are None when adapters are merged."""
        layer = self.create_mock_lora_layer(has_adapters=True, merged=True)

        W, b, quant_state, A, B, s = get_lora_parameters(layer)

        # Base parameters should be returned
        assert W is not None
        assert b is not None

        # LoRA parameters should be None (frozen)
        assert A is None
        assert B is None
        assert s is None

    def test_parameter_freezing_no_adapters(self):
        """Test parameter behavior when no adapters are present."""
        layer = self.create_mock_lora_layer(has_adapters=False)

        W, b, quant_state, A, B, s = get_lora_parameters(layer)

        # Base parameters should be returned
        assert W is not None
        assert b is not None

        # LoRA parameters should be None (frozen)
        assert A is None
        assert B is None
        assert s is None

    def test_parameter_active_adapters_enabled(self):
        """Test that LoRA parameters are returned when adapters are active."""
        layer = self.create_mock_lora_layer(
            has_adapters=True, adapters_disabled=False, merged=False
        )

        W, b, quant_state, A, B, s = get_lora_parameters(layer)

        # All parameters should be returned
        assert W is not None
        assert b is not None
        assert A is not None
        assert B is not None
        assert s is not None
        assert s == 0.1

    def test_parameter_shapes_consistency(self):
        """Test that parameter shapes are consistent when active."""
        layer = self.create_mock_lora_layer(
            has_adapters=True, adapters_disabled=False, merged=False
        )

        W, b, quant_state, A, B, s = get_lora_parameters(layer)

        # Check shape consistency
        assert W.shape == (512, 256)
        assert b.shape == (512,)
        assert A.shape == (16, 256)
        assert B.shape == (512, 16)

    def test_parameter_dtypes_consistency(self):
        """Test that parameter dtypes are consistent."""
        layer = self.create_mock_lora_layer(
            has_adapters=True, adapters_disabled=False, merged=False
        )

        W, b, quant_state, A, B, s = get_lora_parameters(layer)

        assert W.dtype == self.dtype
        assert b.dtype == self.dtype
        assert A.dtype == self.dtype
        assert B.dtype == self.dtype

    def test_quantization_state_handling(self):
        """Test that quantization state is properly handled."""
        layer = self.create_mock_lora_layer(has_adapters=True)

        quant_state_mock = Mock()
        layer.base_layer.weight.quant_state = quant_state_mock

        W, b, quant_state, A, B, s = get_lora_parameters(layer)

        assert quant_state == quant_state_mock

    def test_multiple_adapters_active_adapter_selection(self):
        """Test that the correct adapter is selected when multiple adapters exist."""
        layer = self.create_mock_lora_layer(
            has_adapters=True, adapters_disabled=False, merged=False
        )

        layer.lora_A["adapter2"] = Mock()
        layer.lora_B["adapter2"] = Mock()
        layer.scaling["adapter2"] = 0.2

        layer.lora_A["adapter2"].weight = torch.randn(16, 256, dtype=self.dtype)
        layer.lora_B["adapter2"].weight = torch.randn(512, 16, dtype=self.dtype)

        layer.active_adapters = ["adapter2"]

        W, b, quant_state, A, B, s = get_lora_parameters(layer)

        assert s == 0.2
        assert torch.equal(A, layer.lora_A["adapter2"].weight)
        assert torch.equal(B, layer.lora_B["adapter2"].weight)


class TestLoRAParameterFreezingIntegration:
    """Integration tests for parameter freezing with actual LoRA layers."""

    @pytest.mark.skipif(
        not PEFT_AVAILABLE, reason="PEFT not available for integration tests"
    )
    def test_parameter_freezing_with_real_lora_layer(self):
        """Test parameter freezing with actual PEFT LoRA layer."""
        from peft import LoraConfig, get_peft_model

        class SimpleModel(nn.Module):
            def __init__(self):
                super().__init__()
                self.linear = nn.Linear(256, 512)

            def forward(self, x):
                return self.linear(x)

        base_model = SimpleModel()
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["linear"],
            lora_dropout=0.1,
        )
        model = get_peft_model(base_model, lora_config)
        lora_layer = model.base_model.model.linear
        # Test with adapters enabled
        W, b, quant_state, A, B, s = get_lora_parameters(lora_layer)
        assert A is not None
        assert B is not None
        assert s is not None
        # Test with adapters disabled
        model.disable_adapter_layers()
        W, b, quant_state, A, B, s = get_lora_parameters(lora_layer)
        assert A is None
        assert B is None
        assert s is None

    @pytest.mark.skipif(
        not PEFT_AVAILABLE, reason="PEFT not available for integration tests"
    )
    def test_parameter_freezing_gradient_behavior(self):
        """Test that frozen parameters don't receive gradients."""
        from peft import LoraConfig, get_peft_model

        class SimpleModel(nn.Module):
            def __init__(self):
                super().__init__()
                self.linear = nn.Linear(256, 512)

            def forward(self, x):
                return self.linear(x)

        base_model = SimpleModel()
        lora_config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["linear"],
            lora_dropout=0.1,
        )
        model = get_peft_model(base_model, lora_config)
        x = torch.randn(1, 256)
        target = torch.randn(1, 512)
        model.enable_adapter_layers()
        output = model(x)
        loss = nn.MSELoss()(output, target)
        loss.backward()
        lora_layer = model.base_model.model.linear
        has_lora_grads = any(
            param.grad is not None
            for name, param in lora_layer.named_parameters()
            if "lora_" in name
        )
        assert has_lora_grads, (
            "LoRA parameters should have gradients when adapters are enabled"
        )
        model.zero_grad()
        model.disable_adapter_layers()
        output = model(x)
        loss = nn.MSELoss()(output, target)
        any_requires_grad = any(param.requires_grad for param in model.parameters())
        if any_requires_grad:
            loss.backward()
        has_lora_grads_disabled = any(
            param.grad is not None
            for name, param in lora_layer.named_parameters()
            if "lora_" in name
        )
        assert not has_lora_grads_disabled, (
            "LoRA parameters should not have gradients when adapters are disabled"
        )
        model.zero_grad()
        del model, base_model, lora_layer, x, target, output, loss
        torch.cuda.empty_cache() if torch.cuda.is_available() else None


================================================
FILE: tests/utils/lora/test_merge_lora.py
================================================
from unittest.mock import Mock, patch

import torch

from axolotl.cli.merge_lora import do_merge_lora
from axolotl.utils.dict import DictDefault


class TestAdapterMergeUnmerge:
    """Test suite for LoRA adapter merging/unmerging functionality"""

    def setup_method(self):
        self.dtype = torch.float32
        self.device = torch.device("cpu")

    def create_mock_base_model(self, vocab_size=1000, hidden_size=256):
        """Create a mock base model with linear layers"""
        mock_model = Mock()

        mock_model.config = Mock()
        mock_model.config.vocab_size = vocab_size
        mock_model.config.hidden_size = hidden_size

        mock_model.q_proj = Mock()
        mock_model.q_proj.weight = torch.randn(
            hidden_size, hidden_size, dtype=self.dtype
        )
        mock_model.q_proj.bias = torch.randn(hidden_size, dtype=self.dtype)

        mock_model.v_proj = Mock()
        mock_model.v_proj.weight = torch.randn(
            hidden_size, hidden_size, dtype=self.dtype
        )
        mock_model.v_proj.bias = torch.randn(hidden_size, dtype=self.dtype)

        return mock_model

    def create_mock_lora_model(self, base_model, r=8, alpha=16):
        """Create a mock LoRA model wrapping the base model"""
        mock_lora_model = Mock()
        mock_lora_model.base_model = base_model

        mock_lora_model.merge_and_unload = None
        mock_lora_model.to = Mock(return_value=mock_lora_model)

        mock_lora_model.generation_config = Mock()
        mock_lora_model.config = Mock()

        self.original_q_weight = base_model.q_proj.weight.clone()
        self.original_v_weight = base_model.v_proj.weight.clone()

        mock_lora_model.peft_config = {"default": Mock()}
        mock_lora_model.peft_config["default"].r = r
        mock_lora_model.peft_config["default"].lora_alpha = alpha

        self.lora_A_q = torch.randn(
            r, base_model.q_proj.weight.shape[1], dtype=self.dtype
        )
        self.lora_B_q = torch.randn(
            base_model.q_proj.weight.shape[0], r, dtype=self.dtype
        )

        self.lora_A_v = torch.randn(
            r, base_model.v_proj.weight.shape[1], dtype=self.dtype
        )
        self.lora_B_v = torch.randn(
            base_model.v_proj.weight.shape[0], r, dtype=self.dtype
        )

        self.scaling = alpha / r

        def mock_merge_and_unload(progressbar=False):
            """Simulate the actual merge operation"""
            # Apply LoRA delta to base weights: W_new = W_base + (B @ A) * scaling
            delta_q = (self.lora_B_q @ self.lora_A_q) * self.scaling
            delta_v = (self.lora_B_v @ self.lora_A_v) * self.scaling

            base_model.q_proj.weight = self.original_q_weight + delta_q
            base_model.v_proj.weight = self.original_v_weight + delta_v

            return base_model

        mock_lora_model.merge_and_unload = mock_merge_and_unload
        return mock_lora_model

    def test_basic_lora_merge_unmerge_cycle(self):
        """Test: original_weights -> merge -> unmerge -> should equal original_weights"""

        base_model = self.create_mock_base_model()
        lora_model = self.create_mock_lora_model(base_model)

        original_q_weight = self.original_q_weight.clone()
        original_v_weight = self.original_v_weight.clone()

        merged_model = lora_model.merge_and_unload()

        assert not torch.equal(merged_model.q_proj.weight, original_q_weight)
        assert not torch.equal(merged_model.v_proj.weight, original_v_weight)

        delta_q = (self.lora_B_q @ self.lora_A_q) * self.scaling
        delta_v = (self.lora_B_v @ self.lora_A_v) * self.scaling

        unmerged_q_weight = merged_model.q_proj.weight - delta_q
        unmerged_v_weight = merged_model.v_proj.weight - delta_v

        assert torch.allclose(unmerged_q_weight, original_q_weight, atol=1e-6)
        assert torch.allclose(unmerged_v_weight, original_v_weight, atol=1e-6)

    def test_merge_weight_calculation_accuracy(self):
        """Test: merged_weight = base_weight + (lora_B @ lora_A * scaling)"""
        base_model = self.create_mock_base_model()
        lora_model = self.create_mock_lora_model(base_model, r=16, alpha=32)

        expected_delta_q = (self.lora_B_q @ self.lora_A_q) * self.scaling
        expected_merged_q = self.original_q_weight + expected_delta_q
        merged_model = lora_model.merge_and_unload()

        assert torch.allclose(merged_model.q_proj.weight, expected_merged_q, atol=1e-6)

    @patch("axolotl.cli.merge_lora.load_model_and_tokenizer")
    def test_cli_do_merge_functionality(self, mock_load_model, tmp_path):
        base_model = self.create_mock_base_model()
        lora_model = self.create_mock_lora_model(base_model)
        tokenizer = Mock()
        processor = None

        mock_load_model.return_value = (lora_model, tokenizer, processor)

        cfg = DictDefault(
            {
                "save_safetensors": True,
                "torch_dtype": torch.float32,
                "local_rank": 0,
                "output_dir": str(tmp_path),
            }
        )

        with (
            patch("pathlib.Path.mkdir"),
            patch.object(base_model, "save_pretrained") as mock_save_model,
            patch.object(tokenizer, "save_pretrained") as mock_save_tokenizer,
        ):
            do_merge_lora(cfg=cfg)

        mock_save_model.assert_called_once()
        mock_save_tokenizer.assert_called_once()

    def test_quantized_model_merge_compatibility(self):
        """Test 4-bit/8-bit model merging scenarios"""
        base_model = self.create_mock_base_model()

        # Mock quantized weights
        base_model.q_proj.weight.quant_state = Mock()
        base_model.q_proj.weight.quant_state.dtype = torch.uint8

        lora_model = self.create_mock_lora_model(base_model)

        merged_model = lora_model.merge_and_unload()
        assert merged_model is not None

    @patch.dict("os.environ", {"CUDA_VISIBLE_DEVICES": ""})
    def test_memory_efficient_merge_with_cpu_offload(self, tmp_path):
        """Test lora_on_cpu configuration during merge"""
        cfg = DictDefault(
            {
                "lora_on_cpu": True,
                "save_safetensors": True,
                "output_dir": str(tmp_path),
                "local_rank": 0,
            }
        )

        with patch("axolotl.cli.merge_lora.load_model_and_tokenizer") as mock_load:
            base_model = self.create_mock_base_model()
            lora_model = self.create_mock_lora_model(base_model)
            mock_load.return_value = (lora_model, Mock(), None)

            with patch("pathlib.Path.mkdir"), patch("torch.save"):
                do_merge_lora(cfg=cfg)

            assert mock_load.called


================================================
FILE: tests/utils/schemas/validation/test_activation_offloading.py
================================================
"""Test for config validation for activation offloading."""

from axolotl.utils.config import validate_config
from axolotl.utils.dict import DictDefault


class TestActivationOffloading:
    """
    Test cases for activation offloading schema validation
    """

    def test_gc_converts_offload_wo_lora(self, min_base_cfg):
        cfg = (
            DictDefault(
                gradient_checkpointing="offload",
            )
            | min_base_cfg
        )

        cfg = validate_config(cfg)
        assert cfg.gradient_checkpointing is True
        assert cfg.activation_offloading is True

    def test_ac_offload_impl_noop_wo_adapter(self, min_base_cfg):
        cfg = (
            DictDefault(
                gradient_checkpointing=True,
                activation_offloading=True,
            )
            | min_base_cfg
        )

        cfg = validate_config(cfg)
        assert cfg.gradient_checkpointing is True
        assert cfg.activation_offloading is True


================================================
FILE: tests/utils/schemas/validation/test_default_values.py
================================================
"""Tests for default values for configurations"""

from axolotl.utils.config import validate_config
from axolotl.utils.dict import DictDefault


class TestDefaultConfigValues:
    """Tests for default values for configurations"""

    def test_pad_to_sequence_len(self, min_base_cfg):
        """Tests that sample packing automatically sets pad_to_sequence_len to True"""
        cfg = (
            DictDefault(
                sample_packing=True,
            )
            | min_base_cfg
        )

        cfg = validate_config(cfg)

        assert cfg.pad_to_sequence_len is True


================================================
FILE: tests/utils/schemas/validation/test_fsdp.py
================================================
"""
tests for pydantic fsdp validation
"""

import pytest

from axolotl.utils.config import validate_config
from axolotl.utils.dict import DictDefault


class TestFSDPValidation:
    """
    test class for pydantic fsdp validation
    """

    def test_fsdp_version_from_fsdp_config(self, min_base_cfg):
        cfg = min_base_cfg | DictDefault(
            fsdp_config={
                "version": 2,
            },
        )
        cfg = validate_config(
            cfg,
        )
        assert cfg.fsdp_version == 2

    def test_fsdp_version_in_fsdp_config(self, min_base_cfg):
        cfg = min_base_cfg | DictDefault(
            fsdp_version=2,
            fsdp_config={
                "reshard_after_forward": True,
            },
        )
        cfg = validate_config(
            cfg,
        )
        assert cfg.fsdp_version == 2
        assert cfg.fsdp_config.fsdp_version == 2

    def test_fsdp_offload_w_8bit_optim(self, min_base_cfg):
        cfg = min_base_cfg | DictDefault(
            fsdp_config={
                "offload_params": True,
            },
            optimizer="adamw_8bit",
            fsdp_version=1,
        )
        with pytest.raises(
            ValueError, match="FSDP Offload not compatible with adamw_8bit"
        ):
            validate_config(cfg)

    def test_fsdp2_w_8bit_optim(self, min_base_cfg):
        cfg = min_base_cfg | DictDefault(
            fsdp_config={
                "offload_params": True,
            },
            optimizer="adamw_8bit",
            fsdp_version=2,
        )
        with pytest.raises(
            ValueError,
            match="FSDP2 not compatible with adamw_8bit, use `adamw_torch_8bit` instead",
        ):
            validate_config(cfg)

    def test_fsdp2_w_cpu_ram_efficient_loading(self, min_base_cfg):
        cfg = min_base_cfg | DictDefault(
            load_in_8bit=True,
            adapter="lora",
            fsdp_config={
                "cpu_ram_efficient_loading": True,
            },
            fsdp_version=2,
        )
        validated_cfg = validate_config(cfg)
        assert validated_cfg.fsdp_version == 2
        assert validated_cfg.fsdp_config.cpu_ram_efficient_loading is True

    def test_fsdp2_cpu_offload_pin_memory_requires_offload_params(self, min_base_cfg):
        cfg = min_base_cfg | DictDefault(
            fsdp_config={
                "cpu_offload_pin_memory": False,
                "offload_params": False,
            },
            fsdp_version=2,
        )
        with pytest.raises(
            ValueError,
            match="disabling cpu_offload_pin_memory requires enabling offload_params",
        ):
            validate_config(cfg)

    def test_fsdp1_cpu_offload_pin_memory_not_supported(self, min_base_cfg):
        cfg = min_base_cfg | DictDefault(
            fsdp_config={
                "cpu_offload_pin_memory": False,
                "offload_params": True,
            },
            fsdp_version=1,
        )
        with pytest.raises(
            ValueError,
            match="FSDP1 does not support disabling cpu_offload_pin_memory, please set `fsdp_version` to 2",
        ):
            validate_config(cfg)

    def test_fsdp2_cpu_offload_pin_memory_w_offload_params(self, min_base_cfg):
        cfg = min_base_cfg | DictDefault(
            fsdp_config={
                "cpu_offload_pin_memory": False,
                "offload_params": True,
            },
            fsdp_version=2,
        )
        validated_cfg = validate_config(cfg)
        assert validated_cfg.fsdp_config.cpu_offload_pin_memory is False
        assert validated_cfg.fsdp_config.offload_params is True

    def test_fsdp_prefixes_removed(self, min_base_cfg):
        cfg = min_base_cfg | DictDefault(
            fsdp_config={
                "fsdp_version": 2,
                "fsdp_auto_wrap_policy": "TRANSFORMER_BASED_WRAP",
                "fsdp_transformer_layer_cls_to_wrap": "LlamaDecoderLayer",
                "fsdp_reshard_after_forward": True,
            }
        )
        cfg = validate_config(cfg)
        assert cfg.fsdp_version == 2
        assert cfg.fsdp_config.fsdp_version == 2
        for key in cfg.fsdp_config.keys():
            if key != "fsdp_version":
                assert not key.startswith("fsdp_")
        assert cfg.fsdp_config.auto_wrap_policy == "TRANSFORMER_BASED_WRAP"
        assert cfg.fsdp_config.transformer_layer_cls_to_wrap == "LlamaDecoderLayer"
        assert cfg.fsdp_config.reshard_after_forward is True

    def test_muon_fsdp1_rejected(self, min_base_cfg):
        cfg = min_base_cfg | DictDefault(
            optimizer="muon",
            fsdp_version=1,
            fsdp_config={"reshard_after_forward": True},
        )
        with pytest.raises(
            ValueError, match="Muon optimizer is only compatible with FSDP2"
        ):
            validate_config(cfg)

    @pytest.mark.parametrize(
        "rl",
        [
            "dpo",
            "kto",
            "orpo",
            "ipo",
        ],
    )
    def test_fsdp2_dpo(self, min_base_cfg, rl):
        cfg = min_base_cfg | DictDefault(
            fsdp_version=2,
            fsdp_config={
                "reshard_after_forward": True,
            },
            rl=rl,
            load_in_8bit=True,
            adapter="lora",
            remove_unused_columns=False,
        )
        with pytest.raises(
            ValueError,
            match="FSDP2 does not support load_in_8bit or load_in_4bit with ",
        ):
            validate_config(cfg)


================================================
FILE: tests/utils/schemas/validation/test_moe_quant.py
================================================
"""Tests for MoE expert quantization config validation and PEFT patch idempotency."""

import pytest

from axolotl.utils.config import validate_config
from axolotl.utils.dict import DictDefault


@pytest.fixture()
def gpu_caps():
    return {
        "compute_capability": "sm_89",
        "bf16": True,
        "tf32": False,
        "n_gpu": 1,
        "n_node": 1,
    }


@pytest.fixture()
def env_caps():
    return {"torch_version": "2.7.0"}


class TestQuantizeMoeExpertsValidation:
    """Test suite for quantize_moe_experts config validator."""

    def test_requires_adapter(self, min_base_cfg, gpu_caps, env_caps):
        """quantize_moe_experts without adapter should fail."""
        cfg = (
            DictDefault(
                quantize_moe_experts=True,
            )
            | min_base_cfg
        )
        with pytest.raises(ValueError, match="requires adapter"):
            validate_config(cfg, capabilities=gpu_caps, env_capabilities=env_caps)

    def test_requires_quantization(self, min_base_cfg, gpu_caps, env_caps):
        """quantize_moe_experts without load_in_4bit/8bit should fail."""
        cfg = (
            DictDefault(
                quantize_moe_experts=True,
                adapter="lora",
            )
            | min_base_cfg
        )
        with pytest.raises(ValueError, match="requires load_in_4bit or load_in_8bit"):
            validate_config(cfg, capabilities=gpu_caps, env_capabilities=env_caps)

    def test_valid_qlora_4bit(self, min_base_cfg, gpu_caps, env_caps):
        """quantize_moe_experts with qlora + 4bit should pass."""
        cfg = (
            DictDefault(
                quantize_moe_experts=True,
                adapter="qlora",
                load_in_4bit=True,
            )
            | min_base_cfg
        )
        result = validate_config(cfg, capabilities=gpu_caps, env_capabilities=env_caps)
        assert result["quantize_moe_experts"] is True

    def test_valid_lora_8bit(self, min_base_cfg, gpu_caps, env_caps):
        """quantize_moe_experts with lora + 8bit should pass."""
        cfg = (
            DictDefault(
                quantize_moe_experts=True,
                adapter="lora",
                load_in_8bit=True,
            )
            | min_base_cfg
        )
        result = validate_config(cfg, capabilities=gpu_caps, env_capabilities=env_caps)
        assert result["quantize_moe_experts"] is True

    def test_false_skips_validation(self, min_base_cfg, gpu_caps, env_caps):
        """quantize_moe_experts=false should not check adapter/quantization."""
        cfg = (
            DictDefault(
                quantize_moe_experts=False,
            )
            | min_base_cfg
        )
        result = validate_config(cfg, capabilities=gpu_caps, env_capabilities=env_caps)
        assert result["quantize_moe_experts"] is False

    def test_rejects_lora_target_linear(self, min_base_cfg, gpu_caps, env_caps):
        """quantize_moe_experts with lora_target_linear should fail."""
        cfg = (
            DictDefault(
                quantize_moe_experts=True,
                adapter="qlora",
                load_in_4bit=True,
                lora_target_linear=True,
            )
            | min_base_cfg
        )
        with pytest.raises(ValueError, match="lora_target_linear is not compatible"):
            validate_config(cfg, capabilities=gpu_caps, env_capabilities=env_caps)

    def test_default_is_false(self, min_base_cfg, gpu_caps, env_caps):
        """quantize_moe_experts should default to false."""
        cfg = DictDefault({}) | min_base_cfg
        result = validate_config(cfg, capabilities=gpu_caps, env_capabilities=env_caps)
        assert result["quantize_moe_experts"] is False


class TestLoraTargetParametersDropout:
    """Test that lora_dropout must be 0 when lora_target_parameters is set."""

    def test_rejects_nonzero_dropout(self, min_base_cfg):
        """lora_dropout > 0 with lora_target_parameters should fail."""
        cfg = (
            DictDefault(
                adapter="lora",
                lora_target_parameters=["mlp.experts.gate_up_proj"],
                lora_dropout=0.1,
                load_in_8bit=True,
            )
            | min_base_cfg
        )
        with pytest.raises(ValueError, match="lora_dropout must be 0"):
            validate_config(cfg)

    def test_zero_dropout_passes(self, min_base_cfg):
        """lora_dropout=0 with lora_target_parameters should pass."""
        cfg = (
            DictDefault(
                adapter="lora",
                lora_target_parameters=["mlp.experts.gate_up_proj"],
                lora_dropout=0.0,
                load_in_8bit=True,
            )
            | min_base_cfg
        )
        result = validate_config(cfg)
        assert result["lora_dropout"] == 0.0


class TestPeftPatchIdempotency:
    """Test that patch_peft_target_parameters_matching is idempotent."""

    def test_double_call_does_not_stack_wrappers(self):
        """Calling patch twice should not double-wrap _inject_parameters."""
        from peft.tuners.tuners_utils import BaseTuner

        from axolotl.monkeypatch.moe_quant import (
            patch_peft_target_parameters_matching,
        )

        original = BaseTuner._inject_parameters
        try:
            patch_peft_target_parameters_matching()
            first_patched = BaseTuner._inject_parameters
            patch_peft_target_parameters_matching()
            second_patched = BaseTuner._inject_parameters
            # Should be same function, not double-wrapped
            assert first_patched is second_patched
        finally:
            BaseTuner._inject_parameters = original
            patch_peft_target_parameters_matching._axolotl_patched = False


class TestMoeAdapterTrainMergeRoundtrip:
    """E2E: train adapter on quantized MoE experts, then merge onto plain model.

    Verifies that param wrapping order during training matches merge, preventing
    size mismatch errors when loading adapters in standard PEFT/vLLM.
    """

    @staticmethod
    def _make_classes():
        """Return FakeExperts and FakeModel classes shared by both model builders."""
        import torch
        import torch.nn as nn

        class FakeExperts(nn.Module):
            def __init__(self):
                super().__init__()
                # Model definition order: gate_up_proj first, then down_proj.
                self.gate_up_proj = nn.Parameter(torch.randn(4, 16, 8))
                self.down_proj = nn.Parameter(torch.randn(4, 8, 16))

            def forward(self, x):
                x = torch.matmul(x, self.gate_up_proj[0].T)  # (batch, 16)
                x = torch.matmul(x, self.down_proj[0].T)  # (batch, 8)
                return x

        class FakeModel(nn.Module):
            def __init__(self):
                super().__init__()
                self.linear = nn.Linear(8, 8)
                self.experts = FakeExperts()

            def forward(self, x):
                return self.linear(x) + self.experts(x)

        return FakeExperts, FakeModel

    @staticmethod
    def _make_quantized_model():
        """Training model: parametrizations registered in alphabetical order."""
        import torch.nn as nn
        import torch.nn.utils.parametrize as P

        from axolotl.monkeypatch.moe_quant import _moe_load_state

        _, FakeModel = TestMoeAdapterTrainMergeRoundtrip._make_classes()

        class PassthroughParametrization(nn.Module):
            def forward(self, x):
                return x

        model = FakeModel()

        # Record definition order before parametrization (mirrors real loading).
        _moe_load_state["expert_param_order"]["experts"] = list(
            model.experts._parameters.keys()
        )

        # Register in alphabetical order to expose the ordering mismatch.
        P.register_parametrization(
            model.experts, "down_proj", PassthroughParametrization(), unsafe=True
        )
        P.register_parametrization(
            model.experts, "gate_up_proj", PassthroughParametrization(), unsafe=True
        )
        return model

    @staticmethod
    def _make_plain_model():
        """Merge model: no parametrizations — standard branch uses definition order."""
        _, FakeModel = TestMoeAdapterTrainMergeRoundtrip._make_classes()
        return FakeModel()

    def test_train_save_merge_no_size_mismatch(self, tmp_path):
        """Train on quantized experts, merge onto plain model — must not raise."""
        import torch
        from peft import LoraConfig, PeftModel, get_peft_model
        from peft.tuners.tuners_utils import BaseTuner

        from axolotl.monkeypatch.moe_quant import (
            _moe_load_state,
            patch_peft_target_parameters_matching,
        )

        adapter_dir = tmp_path / "adapter"
        lora_cfg = LoraConfig(
            r=4,
            lora_alpha=8,
            target_modules=[],
            target_parameters=["experts.gate_up_proj", "experts.down_proj"],
            lora_dropout=0.0,
            bias="none",
        )
        original_inject = BaseTuner._inject_parameters

        # Training phase: quantized model (parametrized branch) with axolotl patch.
        _moe_load_state["expert_param_order"] = {}
        patch_peft_target_parameters_matching()
        try:
            peft_model = get_peft_model(self._make_quantized_model(), lora_cfg)
        finally:
            BaseTuner._inject_parameters = original_inject
            patch_peft_target_parameters_matching._axolotl_patched = False

        optimizer = torch.optim.SGD(peft_model.parameters(), lr=1e-3)
        for _ in range(3):
            peft_model(torch.randn(2, 8)).sum().backward()
            optimizer.step()
            optimizer.zero_grad()
        peft_model.save_pretrained(str(adapter_dir))

        # Merge with standard PEFT (no axolotl patch) to verify external compatibility.
        loaded = PeftModel.from_pretrained(self._make_plain_model(), str(adapter_dir))
        merged = loaded.merge_and_unload()
        assert merged is not None


================================================
FILE: tests/utils/test_grpo_rw_fnc.py
================================================
import os

import pytest

from axolotl.core.trainers.grpo import GRPOStrategy


def test_get_rollout_func_loads_successfully():
    """Test that a valid rollout function can be loaded"""
    rollout_func = GRPOStrategy.get_rollout_func("os.path.join")
    assert callable(rollout_func)
    assert rollout_func == os.path.join


def test_get_rollout_func_invalid_module_raises_error():
    """Test that invalid module path raises clear ValueError"""
    with pytest.raises(ValueError, match="Rollout function .* not found"):
        GRPOStrategy.get_rollout_func("nonexistent_module.my_func")


================================================
FILE: tests/utils/test_import_helper.py
================================================
"""
test cases for axolotl.utils.import_helper
"""

import pytest

from axolotl.utils.import_helper import get_cls_from_module_str


def test_get_cls_from_module_str():
    cls = get_cls_from_module_str("axolotl.core.trainers.base.AxolotlTrainer")
    assert cls.__name__ == "AxolotlTrainer"


def test_get_cls_from_module_str_empty_string():
    with pytest.raises(ValueError, match="module_str must be a non-empty string"):
        get_cls_from_module_str("")


def test_get_cls_from_module_str_whitespace_only():
    with pytest.raises(ValueError, match="module_str must be a non-empty string"):
        get_cls_from_module_str("   ")


def test_get_cls_from_module_str_invalid_format():
    with pytest.raises(ValueError, match="Invalid module string format"):
        get_cls_from_module_str("single_part")


def test_get_cls_from_module_str_nonexistent_module():
    with pytest.raises(ImportError, match="Failed to import module"):
        get_cls_from_module_str("nonexistent.module.Class")


def test_get_cls_from_module_str_nonexistent_class():
    with pytest.raises(AttributeError, match="Class 'NonExistentClass' not found"):
        get_cls_from_module_str("axolotl.core.trainers.base.NonExistentClass")


================================================
FILE: tests/utils/test_mistral3_processor.py
================================================
"""Tests for Mistral3Processor with transformers v5 ProcessorMixin integration"""

from unittest.mock import MagicMock

import pytest
import torch
from transformers.feature_extraction_utils import BatchFeature

from axolotl.utils.mistral.mistral3_processor import Mistral3Processor
from axolotl.utils.mistral.mistral_tokenizer import HFMistralTokenizer


@pytest.fixture()
def mock_tokenizer():
    """Create a mock HFMistralTokenizer that passes v5 ProcessorMixin isinstance checks."""
    return MagicMock(spec=HFMistralTokenizer)


@pytest.fixture()
def processor(mock_tokenizer):
    return Mistral3Processor(tokenizer=mock_tokenizer)


class TestMistral3ProcessorInit:
    def test_tokenizer_is_set(self, processor, mock_tokenizer):
        assert processor.tokenizer is mock_tokenizer

    def test_chat_template_is_none(self, processor):
        assert processor.chat_template is None

    def test_audio_tokenizer_is_none(self, processor):
        assert processor.audio_tokenizer is None


class TestApplyChatTemplateTokenized:
    """Test apply_chat_template with tokenize=True, return_dict=True"""

    @pytest.fixture()
    def batched_conversations(self):
        return [
            [
                {"role": "user", "content": "Describe this image."},
                {"role": "assistant", "content": "It is red."},
            ],
            [
                {"role": "user", "content": "What is this?"},
                {"role": "assistant", "content": "A cat."},
            ],
        ]

    def test_returns_batch_feature_with_pixel_values(
        self, processor, mock_tokenizer, batched_conversations
    ):
        pixel_values = torch.randn(2, 3, 224, 224, dtype=torch.float64)
        mock_tokenizer.apply_chat_template.return_value = {
            "input_ids": torch.tensor([[1, 2, 3], [4, 5, 6]]),
            "attention_mask": torch.tensor([[1, 1, 1], [1, 1, 1]]),
            "pixel_values": pixel_values,
        }

        result = processor.apply_chat_template(
            batched_conversations, tokenize=True, return_dict=True
        )

        assert isinstance(result, BatchFeature)
        assert "pixel_values" in result
        assert "image_sizes" in result
        assert result["pixel_values"].dtype == torch.float32
        assert result["image_sizes"].shape == (2, 2)
        assert result["image_sizes"][0].tolist() == [224, 224]

    def test_returns_batch_feature_without_pixel_values(
        self, processor, mock_tokenizer, batched_conversations
    ):
        mock_tokenizer.apply_chat_template.return_value = {
            "input_ids": torch.tensor([[1, 2, 3], [4, 5, 6]]),
            "attention_mask": torch.tensor([[1, 1, 1], [1, 1, 1]]),
        }

        result = processor.apply_chat_template(
            batched_conversations, tokenize=True, return_dict=True
        )

        assert isinstance(result, BatchFeature)
        assert "input_ids" in result
        assert "image_sizes" not in result


class TestApplyChatTemplateNotTokenized:
    def test_single_conversation_returns_unwrapped(self, processor, mock_tokenizer):
        """Single conversation (not batched) should return unwrapped result."""
        single_conversation = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi"},
        ]
        mock_tokenizer.apply_chat_template.return_value = [
            "<s>[INST]Hello[/INST]Hi</s>"
        ]

        result = processor.apply_chat_template(
            single_conversation, tokenize=False, return_dict=False
        )

        assert result == "<s>[INST]Hello[/INST]Hi</s>"

    def test_batched_conversations_returns_list(self, processor, mock_tokenizer):
        batched = [
            [
                {"role": "user", "content": "Hello"},
                {"role": "assistant", "content": "Hi"},
            ],
            [
                {"role": "user", "content": "Bye"},
                {"role": "assistant", "content": "Bye"},
            ],
        ]
        mock_tokenizer.apply_chat_template.return_value = ["text1", "text2"]

        result = processor.apply_chat_template(
            batched, tokenize=False, return_dict=False
        )

        assert result == ["text1", "text2"]


class TestCall:
    def test_delegates_to_tokenizer(self, processor, mock_tokenizer):
        mock_tokenizer.return_value = {
            "input_ids": [1, 2, 3],
            "attention_mask": [1, 1, 1],
        }

        result = processor("Hello world")

        mock_tokenizer.assert_called_once()
        assert isinstance(result, BatchFeature)


class TestReturnTensorsValidation:
    def test_rejects_non_pt_return_tensors(self, processor):
        conversation = [
            {"role": "user", "content": "Hello"},
            {"role": "assistant", "content": "Hi"},
        ]

        with pytest.raises(ValueError, match=r"only supports.*return_tensors='pt'"):
            processor.apply_chat_template(
                conversation, tokenize=True, return_dict=True, return_tensors="np"
            )


================================================
FILE: tests/utils/test_train.py
================================================
"""test for train checkpoint utils"""

import os

from axolotl.utils.dict import DictDefault
from axolotl.utils.train import determine_last_checkpoint


def test_determine_last_checkpoint(temp_dir):
    cfg = DictDefault(
        output_dir=temp_dir,
    )
    for cpt_idx in [1, 9, 10, 20]:
        os.makedirs(
            os.path.join(cfg.output_dir, f"checkpoint-{cpt_idx}"), exist_ok=True
        )

    last_checkpoint = determine_last_checkpoint(cfg, update=False)
    assert last_checkpoint == os.path.join(cfg.output_dir, "checkpoint-20")

    cfg.resume_from_checkpoint = None
    cfg.auto_resume_from_checkpoints = True
    determine_last_checkpoint(cfg, update=True)
    assert cfg.resume_from_checkpoint == os.path.join(cfg.output_dir, "checkpoint-20")